Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Hardware acceleration settings for ONNX Runtime models.
///
/// Determines which execution provider (CPU, CoreML, CUDA, TensorRT) performs
/// inference for layout detection and embedding generation.
/// </summary>
public sealed record AccelerationConfig
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Execution provider used for ONNX inference; <c>null</c> when not set.
    /// </summary>
    [JsonPropertyName("provider")]
    public ExecutionProviderType? Provider { get; init; }

    /// <summary>
    /// GPU device ID (relevant to CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
    /// </summary>
    [JsonPropertyName("device_id")]
    public uint DeviceId { get; init; } = 0;

    /// <summary>
    /// Deserialises a JSON string into a <see cref="AccelerationConfig"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static AccelerationConfig FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<AccelerationConfig>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse AccelerationConfig from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse AccelerationConfig from JSON: {e.Message}", e);
        }
    }
}

View File

@@ -0,0 +1,205 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Types of inline text annotations.
/// </summary>
/// <remarks>
/// Modelled as an abstract record with one nested sealed record per variant.
/// JSON (de)serialisation is delegated to <see cref="AnnotationKindJsonConverter"/>
/// through the attribute below.
/// </remarks>
[JsonConverter(typeof(AnnotationKindJsonConverter))]
public abstract record AnnotationKind
{
/// <summary>Bold variant; carries no fields.</summary>
public sealed record Bold() : AnnotationKind;
/// <summary>Italic variant; carries no fields.</summary>
public sealed record Italic() : AnnotationKind;
/// <summary>Underline variant; carries no fields.</summary>
public sealed record Underline() : AnnotationKind;
/// <summary>Strikethrough variant; carries no fields.</summary>
public sealed record Strikethrough() : AnnotationKind;
/// <summary>Code variant; carries no fields.</summary>
public sealed record Code() : AnnotationKind;
/// <summary>Subscript variant; carries no fields.</summary>
public sealed record Subscript() : AnnotationKind;
/// <summary>Superscript variant; carries no fields.</summary>
public sealed record Superscript() : AnnotationKind;
/// <summary>
/// Link variant with a <c>url</c> and an optional (nullable) <c>title</c>.
/// </summary>
public sealed record Link(
[property: JsonPropertyName("url")] string Url,
[property: JsonPropertyName("title")] string? Title
) : AnnotationKind;
/// <summary>
/// Highlighted text (PDF highlights, HTML `&lt;mark&gt;`). Carries no fields.
/// </summary>
public sealed record Highlight() : AnnotationKind;
/// <summary>
/// Text color (CSS-compatible value, e.g. "#ff0000", "red"), stored in <c>value</c>.
/// </summary>
public sealed record Color(
[property: JsonPropertyName("value")] string Value
) : AnnotationKind;
/// <summary>
/// Font size with units (e.g. "12pt", "1.2em", "16px"), stored in <c>value</c>.
/// </summary>
public sealed record FontSize(
[property: JsonPropertyName("value")] string Value
) : AnnotationKind;
/// <summary>
/// Extensible annotation for format-specific styling: a <c>name</c> plus an
/// optional (nullable) <c>value</c>.
/// </summary>
public sealed record Custom(
[property: JsonPropertyName("name")] string Name,
[property: JsonPropertyName("value")] string? Value
) : AnnotationKind;
}
/// <summary>
/// Custom converter for the <see cref="AnnotationKind"/> union with flattened variant fields.
/// </summary>
/// <remarks>
/// The wire format is a single JSON object holding a discriminator property
/// (<c>annotation_type</c>) with the variant's own fields beside it at the same level,
/// e.g. <c>{"annotation_type":"link","url":"...","title":"..."}</c>.
/// System.Text.Json's [JsonPolymorphic] cannot handle this layout, so the object is
/// taken apart and reassembled by hand here.
/// </remarks>
public sealed class AnnotationKindJsonConverter : JsonConverter<AnnotationKind>
{
    /// <summary>Name of the JSON property that selects the variant.</summary>
    private const string DiscriminatorName = "annotation_type";

    /// <summary>
    /// Reads one JSON object and materialises the variant named by its
    /// <c>annotation_type</c> property.
    /// </summary>
    /// <param name="reader">Reader positioned on the object's start token.</param>
    /// <param name="typeToConvert">Unused; the variant is chosen by the discriminator.</param>
    /// <param name="options">Options forwarded when deserialising variants that carry fields.</param>
    /// <returns>The variant instance matching the discriminator.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an object, the discriminator is missing, null, not a string,
    /// or unknown, or when a variant payload deserialises to null.
    /// </exception>
    public override AnnotationKind Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(DiscriminatorName, out var tagElement))
        {
            throw new JsonException($"Missing discriminator field: annotation_type");
        }

        if (tagElement.ValueKind == JsonValueKind.Null)
        {
            throw new JsonException("Discriminator field is null");
        }

        // Reject numbers/objects/etc. up front with a clear message instead of letting
        // GetString() fail with a type error.
        if (tagElement.ValueKind != JsonValueKind.String)
        {
            throw new JsonException($"Discriminator field annotation_type must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString() ?? throw new JsonException("Discriminator field is null");

        // Field-less variants are constructed directly; only variants with fields need
        // the remaining properties re-serialised and handed to the default deserialiser.
        return tagValue switch
        {
            "bold" => new AnnotationKind.Bold(),
            "italic" => new AnnotationKind.Italic(),
            "underline" => new AnnotationKind.Underline(),
            "strikethrough" => new AnnotationKind.Strikethrough(),
            "code" => new AnnotationKind.Code(),
            "subscript" => new AnnotationKind.Subscript(),
            "superscript" => new AnnotationKind.Superscript(),
            "link" => ReadVariant<AnnotationKind.Link>(root, options),
            "highlight" => new AnnotationKind.Highlight(),
            "color" => ReadVariant<AnnotationKind.Color>(root, options),
            "font_size" => ReadVariant<AnnotationKind.FontSize>(root, options),
            "custom" => ReadVariant<AnnotationKind.Custom>(root, options),
            _ => throw new JsonException($"Unknown AnnotationKind discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Deserialises a variant from <paramref name="root"/> with the discriminator property removed.
    /// </summary>
    private static T ReadVariant<T>(JsonElement root, JsonSerializerOptions options)
        where T : AnnotationKind
    {
        using var stream = new MemoryStream();
        using (var flatWriter = new Utf8JsonWriter(stream))
        {
            flatWriter.WriteStartObject();
            foreach (var prop in root.EnumerateObject())
            {
                if (prop.Name != DiscriminatorName)
                {
                    flatWriter.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(flatWriter);
                }
            }

            flatWriter.WriteEndObject();
        } // disposing the writer flushes it into the stream

        return JsonSerializer.Deserialize<T>(stream.ToArray(), options)
            ?? throw new JsonException("Failed to deserialize variant");
    }

    /// <summary>
    /// Writes the variant as one JSON object: the <c>annotation_type</c> discriminator
    /// followed by the variant's own fields at the same level, e.g.
    /// <c>{"annotation_type":"color","value":"red"}</c> rather than a nested wrapper object.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Options forwarded when serialising the variant's fields.</param>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, AnnotationKind value, JsonSerializerOptions options)
    {
        string tag = value switch
        {
            AnnotationKind.Bold => "bold",
            AnnotationKind.Italic => "italic",
            AnnotationKind.Underline => "underline",
            AnnotationKind.Strikethrough => "strikethrough",
            AnnotationKind.Code => "code",
            AnnotationKind.Subscript => "subscript",
            AnnotationKind.Superscript => "superscript",
            AnnotationKind.Link => "link",
            AnnotationKind.Highlight => "highlight",
            AnnotationKind.Color => "color",
            AnnotationKind.FontSize => "font_size",
            AnnotationKind.Custom => "custom",
            _ => throw new JsonException($"Unknown AnnotationKind variant: {value.GetType().Name}"),
        };

        // Only these variants declare fields that must be flattened next to the tag.
        var hasFields = value is AnnotationKind.Link
            or AnnotationKind.Color
            or AnnotationKind.FontSize
            or AnnotationKind.Custom;

        writer.WriteStartObject();
        writer.WriteString(DiscriminatorName, tag);
        if (hasFields)
        {
            // Serialise under the concrete variant type, then copy its properties
            // into the object already opened above.
            using var doc = JsonSerializer.SerializeToDocument(value, value.GetType(), options);
            if (doc.RootElement.ValueKind == JsonValueKind.Object)
            {
                foreach (var prop in doc.RootElement.EnumerateObject())
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
        }

        writer.WriteEndObject();
    }
}

View File

@@ -0,0 +1,77 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One file extracted from an archive.
///
/// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
/// enabled, every processable file yields its own full `ExtractionResult`.
/// </summary>
public sealed record ArchiveEntry
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// File path relative to the archive root (e.g. "folder/document.pdf").
    /// </summary>
    [JsonPropertyName("path")]
    public required string Path { get; init; }

    /// <summary>
    /// MIME type detected for the file.
    /// </summary>
    [JsonPropertyName("mime_type")]
    public required string MimeType { get; init; }

    /// <summary>
    /// Complete extraction result for this file.
    /// </summary>
    [JsonPropertyName("result")]
    public required ExtractionResult Result { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="ArchiveEntry"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static ArchiveEntry FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ArchiveEntry>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ArchiveEntry from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse ArchiveEntry from JSON: {e.Message}", e);
        }
    }
}

View File

@@ -0,0 +1,88 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata for an archive (ZIP/TAR/7Z).
///
/// Extracted from compressed archive files; holds the file list and size information.
/// </summary>
public sealed record ArchiveMetadata
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Archive format name ("ZIP", "TAR", "7Z", etc.); empty string by default.
    /// </summary>
    [JsonPropertyName("format")]
    public string Format { get; init; } = "";

    /// <summary>
    /// Total number of files contained in the archive.
    /// </summary>
    [JsonPropertyName("file_count")]
    public uint FileCount { get; init; } = 0;

    /// <summary>
    /// Paths of the files inside the archive; empty list by default.
    /// </summary>
    [JsonPropertyName("file_list")]
    public List<string> FileList { get; init; } = [];

    /// <summary>
    /// Total uncompressed size, in bytes.
    /// </summary>
    [JsonPropertyName("total_size")]
    public ulong TotalSize { get; init; } = 0;

    /// <summary>
    /// Compressed size in bytes, or <c>null</c> when not available.
    /// </summary>
    [JsonPropertyName("compressed_size")]
    public ulong? CompressedSize { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="ArchiveMetadata"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static ArchiveMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ArchiveMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ArchiveMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse ArchiveMetadata from JSON: {e.Message}", e);
        }
    }
}

68
packages/csharp/src/Kreuzberg/BBox.cs generated Normal file
View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Bounding box in original image coordinates: (x1, y1) is the top-left corner and
/// (x2, y2) the bottom-right corner.
/// </summary>
public sealed record BBox
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>X coordinate of the top-left corner.</summary>
    [JsonPropertyName("x1")]
    public float X1 { get; init; } = 0.0f;

    /// <summary>Y coordinate of the top-left corner.</summary>
    [JsonPropertyName("y1")]
    public float Y1 { get; init; } = 0.0f;

    /// <summary>X coordinate of the bottom-right corner.</summary>
    [JsonPropertyName("x2")]
    public float X2 { get; init; } = 0.0f;

    /// <summary>Y coordinate of the bottom-right corner.</summary>
    [JsonPropertyName("y2")]
    public float Y2 { get; init; } = 0.0f;

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BBox"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static BBox FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<BBox>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse BBox from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse BBox from JSON: {e.Message}", e);
        }
    }
}

View File

@@ -0,0 +1,78 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single item of a byte-array batch extraction job.
///
/// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`.
/// </summary>
public sealed record BatchBytesItem
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Content bytes to extract from; empty by default. (De)serialised through
    /// <see cref="ByteArrayToIntArrayConverter"/>.
    /// </summary>
    [JsonConverter(typeof(ByteArrayToIntArrayConverter))]
    [JsonPropertyName("content")]
    public byte[] Content { get; init; } = [];

    /// <summary>
    /// MIME type of the content (e.g., "application/pdf", "text/html").
    /// </summary>
    [JsonPropertyName("mime_type")]
    public required string MimeType { get; init; }

    /// <summary>
    /// Per-item configuration overrides; <c>null</c> uses the batch-level defaults.
    /// </summary>
    [JsonPropertyName("config")]
    public FileExtractionConfig? Config { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BatchBytesItem"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static BatchBytesItem FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<BatchBytesItem>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse BatchBytesItem from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse BatchBytesItem from JSON: {e.Message}", e);
        }
    }
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single file of a file batch extraction job.
///
/// Used with `batch_extract_files` and `batch_extract_files_sync`.
/// </summary>
public sealed record BatchFileItem
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Path of the file to extract from.
    /// </summary>
    [JsonPropertyName("path")]
    public required string Path { get; init; }

    /// <summary>
    /// Per-file configuration overrides; <c>null</c> uses the batch-level defaults.
    /// </summary>
    [JsonPropertyName("config")]
    public FileExtractionConfig? Config { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BatchFileItem"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static BatchFileItem FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<BatchFileItem>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse BatchFileItem from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse BatchFileItem from JSON: {e.Message}", e);
        }
    }
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata for a BibTeX bibliography.
/// </summary>
public sealed record BibtexMetadata
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Number of entries in the bibliography.
    /// </summary>
    [JsonPropertyName("entry_count")]
    public ulong EntryCount { get; init; } = 0;

    /// <summary>String list mapped to the JSON field <c>citation_keys</c>; empty by default.</summary>
    [JsonPropertyName("citation_keys")]
    public List<string> CitationKeys { get; init; } = [];

    /// <summary>String list mapped to the JSON field <c>authors</c>; empty by default.</summary>
    [JsonPropertyName("authors")]
    public List<string> Authors { get; init; } = [];

    /// <summary>Optional value mapped to the JSON field <c>year_range</c>; <c>null</c> by default.</summary>
    [JsonPropertyName("year_range")]
    public YearRange? YearRange { get; init; }

    /// <summary>
    /// Optional string-to-count map mapped to the JSON field <c>entry_types</c>;
    /// <c>null</c> by default.
    /// </summary>
    [JsonPropertyName("entry_types")]
    public Dictionary<string, ulong>? EntryTypes { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BibtexMetadata"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static BibtexMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<BibtexMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse BibtexMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse BibtexMetadata from JSON: {e.Message}", e);
        }
    }
}

105
packages/csharp/src/Kreuzberg/BlockType.cs generated Normal file
View File

@@ -0,0 +1,105 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Types of block-level elements in Djot.
/// </summary>
/// <remarks>
/// JSON (de)serialisation goes through <see cref="BlockTypeJsonConverter"/>, whose
/// switch tables map each member to the snake_case string that also appears in the
/// member's <c>[JsonPropertyName]</c> attribute.
/// NOTE(review): System.Text.Json does not appear to consult <c>[JsonPropertyName]</c>
/// on enum members, so the converter is presumably what determines the wire names
/// and the attributes only mirror them — confirm before editing either side.
/// </remarks>
[JsonConverter(typeof(BlockTypeJsonConverter))]
public enum BlockType
{
// Each member is annotated with its snake_case wire name.
[JsonPropertyName("paragraph")]
Paragraph,
[JsonPropertyName("heading")]
Heading,
[JsonPropertyName("blockquote")]
Blockquote,
[JsonPropertyName("code_block")]
CodeBlock,
[JsonPropertyName("list_item")]
ListItem,
[JsonPropertyName("ordered_list")]
OrderedList,
[JsonPropertyName("bullet_list")]
BulletList,
[JsonPropertyName("task_list")]
TaskList,
[JsonPropertyName("definition_list")]
DefinitionList,
[JsonPropertyName("definition_term")]
DefinitionTerm,
[JsonPropertyName("definition_description")]
DefinitionDescription,
[JsonPropertyName("div")]
Div,
[JsonPropertyName("section")]
Section,
[JsonPropertyName("thematic_break")]
ThematicBreak,
[JsonPropertyName("raw_block")]
RawBlock,
[JsonPropertyName("math_display")]
MathDisplay,
}
/// <summary>
/// JSON converter for <see cref="BlockType"/> that maps each member to and from an
/// explicit snake_case string.
/// </summary>
internal sealed class BlockTypeJsonConverter : JsonConverter<BlockType>
{
    /// <summary>
    /// Reads a string token and returns the matching <see cref="BlockType"/> member.
    /// </summary>
    /// <exception cref="JsonException">When the string matches no known member.</exception>
    public override BlockType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var wireName = reader.GetString();
        switch (wireName)
        {
            case "paragraph":
                return BlockType.Paragraph;
            case "heading":
                return BlockType.Heading;
            case "blockquote":
                return BlockType.Blockquote;
            case "code_block":
                return BlockType.CodeBlock;
            case "list_item":
                return BlockType.ListItem;
            case "ordered_list":
                return BlockType.OrderedList;
            case "bullet_list":
                return BlockType.BulletList;
            case "task_list":
                return BlockType.TaskList;
            case "definition_list":
                return BlockType.DefinitionList;
            case "definition_term":
                return BlockType.DefinitionTerm;
            case "definition_description":
                return BlockType.DefinitionDescription;
            case "div":
                return BlockType.Div;
            case "section":
                return BlockType.Section;
            case "thematic_break":
                return BlockType.ThematicBreak;
            case "raw_block":
                return BlockType.RawBlock;
            case "math_display":
                return BlockType.MathDisplay;
            default:
                throw new JsonException($"Unknown BlockType value: {wireName}");
        }
    }

    /// <summary>
    /// Writes the member as its snake_case string.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, BlockType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case BlockType.Paragraph:
                wireName = "paragraph";
                break;
            case BlockType.Heading:
                wireName = "heading";
                break;
            case BlockType.Blockquote:
                wireName = "blockquote";
                break;
            case BlockType.CodeBlock:
                wireName = "code_block";
                break;
            case BlockType.ListItem:
                wireName = "list_item";
                break;
            case BlockType.OrderedList:
                wireName = "ordered_list";
                break;
            case BlockType.BulletList:
                wireName = "bullet_list";
                break;
            case BlockType.TaskList:
                wireName = "task_list";
                break;
            case BlockType.DefinitionList:
                wireName = "definition_list";
                break;
            case BlockType.DefinitionTerm:
                wireName = "definition_term";
                break;
            case BlockType.DefinitionDescription:
                wireName = "definition_description";
                break;
            case BlockType.Div:
                wireName = "div";
                break;
            case BlockType.Section:
                wireName = "section";
                break;
            case BlockType.ThematicBreak:
                wireName = "thematic_break";
                break;
            case BlockType.RawBlock:
                wireName = "raw_block";
                break;
            case BlockType.MathDisplay:
                wireName = "math_display";
                break;
            default:
                throw new JsonException($"Unknown BlockType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Bounding box coordinates used to position an element.
/// </summary>
public sealed record BoundingBox
{
    /// <summary>
    /// Serializer settings applied by <see cref="FromJson"/>: enums are handled as
    /// snake_case strings and the ignore condition is <c>WhenWritingDefault</c>.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects to FFI. Nulls are stripped
    /// (nullable C# fields default to null and would otherwise override required Rust
    /// fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so caller intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Left x-coordinate.
    /// </summary>
    [JsonPropertyName("x0")]
    public double X0 { get; init; } = 0.0;

    /// <summary>
    /// Bottom y-coordinate.
    /// </summary>
    [JsonPropertyName("y0")]
    public double Y0 { get; init; } = 0.0;

    /// <summary>
    /// Right x-coordinate.
    /// </summary>
    [JsonPropertyName("x1")]
    public double X1 { get; init; } = 0.0;

    /// <summary>
    /// Top y-coordinate.
    /// </summary>
    [JsonPropertyName("y1")]
    public double Y1 { get; init; } = 0.0;

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BoundingBox"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, including when the deserializer yields null.
    /// </exception>
    public static BoundingBox FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<BoundingBox>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse BoundingBox from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions pass through untouched.
            throw new KreuzbergException($"Failed to parse BoundingBox from JSON: {e.Message}", e);
        }
    }
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Converts byte arrays to and from JSON integer arrays.
/// </summary>
/// <remarks>
/// System.Text.Json serializes byte[] as base64 strings by default, but Rust's serde
/// for Vec&lt;u8&gt; expects JSON arrays of integers [72, 101, 108, ...].
/// Apply this converter to byte[] fields that are serialized to FFI with
/// [JsonConverter(typeof(ByteArrayToIntArrayConverter))].
/// </remarks>
public sealed class ByteArrayToIntArrayConverter : JsonConverter<byte[]>
{
    /// <summary>
    /// Reads a JSON array of integers and converts it to a byte array.
    /// </summary>
    /// <param name="reader">Reader positioned on the array's start token.</param>
    /// <param name="typeToConvert">Unused.</param>
    /// <param name="options">Unused.</param>
    /// <returns>The bytes in array order; an empty array for <c>[]</c>.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an array, an element is not an integer in the range 0-255,
    /// or the input ends before the closing bracket.
    /// </exception>
    public override byte[]? Read(
        ref Utf8JsonReader reader,
        Type typeToConvert,
        JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartArray)
        {
            throw new JsonException("Expected JSON array for byte[]");
        }

        var bytes = new List<byte>();
        while (reader.Read())
        {
            switch (reader.TokenType)
            {
                case JsonTokenType.EndArray:
                    return bytes.ToArray();

                case JsonTokenType.Number:
                    // TryGetByte rejects anything outside 0-255 (and non-integers) instead of
                    // silently wrapping it the way an unchecked (byte) cast would.
                    if (!reader.TryGetByte(out var element))
                    {
                        throw new JsonException("Expected an integer between 0 and 255 in byte[] array");
                    }

                    bytes.Add(element);
                    break;

                default:
                    throw new JsonException($"Unexpected token type: {reader.TokenType}");
            }
        }

        // The reader ran out of data before the closing bracket; a partial array would be
        // silently wrong, so fail instead.
        throw new JsonException("Unexpected end of JSON while reading byte[] array");
    }

    /// <summary>
    /// Writes a byte array as a JSON array of integers.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Bytes to write, one JSON number per byte.</param>
    /// <param name="options">Unused.</param>
    public override void Write(
        Utf8JsonWriter writer,
        byte[] value,
        JsonSerializerOptions options)
    {
        writer.WriteStartArray();
        foreach (var b in value)
        {
            writer.WriteNumberValue(b);
        }

        writer.WriteEndArray();
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception type derived from <see cref="KreuzbergErrorException"/>. Presumably
/// signals cache-related failures (inferred from the name — confirm against call sites).
/// </summary>
public class CacheException : KreuzbergErrorException
{
    /// <summary>Creates the exception with the given message.</summary>
    /// <param name="message">Message forwarded to the base exception.</param>
    public CacheException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with the given message and underlying cause.</summary>
    /// <param name="message">Message forwarded to the base exception.</param>
    /// <param name="innerException">Cause forwarded to the base exception.</param>
    public CacheException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Immutable set of cache statistics exchanged as snake_case JSON.
/// </summary>
/// <remarks>
/// NOTE(review): the meaning of each value is inferred from its name only
/// (file count, sizes in MB, ages in days) — confirm against the native definition.
/// </remarks>
public sealed record CacheStats
{
    /// <summary>Value of the <c>total_files</c> JSON field; 0 unless set.</summary>
    [JsonPropertyName("total_files")]
    public ulong TotalFiles { get; init; }

    /// <summary>Value of the <c>total_size_mb</c> JSON field; 0.0 unless set.</summary>
    [JsonPropertyName("total_size_mb")]
    public double TotalSizeMb { get; init; }

    /// <summary>Value of the <c>available_space_mb</c> JSON field; 0.0 unless set.</summary>
    [JsonPropertyName("available_space_mb")]
    public double AvailableSpaceMb { get; init; }

    /// <summary>Value of the <c>oldest_file_age_days</c> JSON field; 0.0 unless set.</summary>
    [JsonPropertyName("oldest_file_age_days")]
    public double OldestFileAgeDays { get; init; }

    /// <summary>Value of the <c>newest_file_age_days</c> JSON field; 0.0 unless set.</summary>
    [JsonPropertyName("newest_file_age_days")]
    public double NewestFileAgeDays { get; init; }

    /// <summary>
    /// Builds a <see cref="CacheStats"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to deserialise.</param>
    /// <returns>The deserialised instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON cannot be deserialised, or it deserialises to <c>null</c>.
    /// </exception>
    public static CacheStats FromJson(string json)
    {
        CacheStats? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CacheStats>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any foreign failure is wrapped; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse CacheStats from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException($"Failed to parse CacheStats from JSON: deserializer returned null");
    }

    /// <summary>
    /// Options used by <see cref="FromJson"/>: enums travel as snake_case strings and
    /// default-valued members are skipped when writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI. Null members are dropped
    /// (a nullable C# field left at null would otherwise overwrite a required Rust field
    /// with a null it cannot deserialise), while explicit false/0 values are kept so the
    /// caller's intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Kreuzberg error subtype for failures categorised as cancellations.
/// </summary>
/// <remarks>
/// NOTE(review): the category is inferred from the type name; what raises this
/// exception is decided outside this file.
/// </remarks>
public class CancelledException : KreuzbergErrorException
{
    /// <summary>Creates the exception with a message only.</summary>
    /// <param name="message">Text forwarded to the base exception.</param>
    public CancelledException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a message and the underlying cause.</summary>
    /// <param name="message">Text forwarded to the base exception.</param>
    /// <param name="innerException">Cause forwarded to the base exception.</param>
    public CancelledException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One changed cell inside a table.
///
/// The type lives here (and not only in `crate.diff`) so that `RevisionDelta`
/// can always refer to it, even when the `diff` Cargo feature is off.
/// `crate.diff` re-exports it unchanged.
/// </summary>
public sealed record CellChange
{
    /// <summary>
    /// Row index, counted from zero. 0 unless set.
    /// </summary>
    [JsonPropertyName("row")]
    public ulong Row { get; init; }

    /// <summary>
    /// Column index, counted from zero. 0 unless set.
    /// </summary>
    [JsonPropertyName("col")]
    public ulong Col { get; init; }

    /// <summary>
    /// Cell value prior to the change.
    /// </summary>
    [JsonPropertyName("from")]
    public required string From { get; init; }

    /// <summary>
    /// Cell value once the change is applied.
    /// </summary>
    [JsonPropertyName("to")]
    public required string To { get; init; }

    /// <summary>
    /// Builds a <see cref="CellChange"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to deserialise.</param>
    /// <returns>The deserialised instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON cannot be deserialised, or it deserialises to <c>null</c>.
    /// </exception>
    public static CellChange FromJson(string json)
    {
        CellChange? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CellChange>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any foreign failure is wrapped; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse CellChange from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException($"Failed to parse CellChange from JSON: deserializer returned null");
    }

    /// <summary>
    /// Options used by <see cref="FromJson"/>: enums travel as snake_case strings and
    /// default-valued members are skipped when writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI. Null members are dropped
    /// (a nullable C# field left at null would otherwise overwrite a required Rust field
    /// with a null it cannot deserialise), while explicit false/0 values are kept so the
    /// caller's intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

90
packages/csharp/src/Kreuzberg/Chunk.cs generated Normal file
View File

@@ -0,0 +1,90 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A piece of document text, with an optional embedding and positional metadata.
///
/// Chunks are produced when chunking is switched on in `ExtractionConfig`. Each one
/// carries its text, an embedding vector when embedding generation is configured,
/// and metadata locating it within the document.
/// </summary>
public sealed record Chunk
{
    /// <summary>
    /// Text held by this chunk.
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// Semantic structural classification of the chunk.
    ///
    /// Set by the heuristic classifier from content patterns and heading context;
    /// the classifier falls back to `ChunkType.Unknown` when no rule matches.
    /// On this record the property is <c>null</c> unless a value is supplied.
    /// </summary>
    [JsonPropertyName("chunk_type")]
    public ChunkType? ChunkType { get; init; }

    /// <summary>
    /// Embedding vector for the chunk, if any.
    ///
    /// Filled in only when an `EmbeddingConfig` is part of the chunking configuration.
    /// Its length depends on the embedding model in use. <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("embedding")]
    public List<float>? Embedding { get; init; }

    /// <summary>
    /// Position and properties of the chunk.
    /// </summary>
    [JsonPropertyName("metadata")]
    public required ChunkMetadata Metadata { get; init; }

    /// <summary>
    /// Builds a <see cref="Chunk"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to deserialise.</param>
    /// <returns>The deserialised instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON cannot be deserialised, or it deserialises to <c>null</c>.
    /// </exception>
    public static Chunk FromJson(string json)
    {
        Chunk? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<Chunk>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any foreign failure is wrapped; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse Chunk from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException($"Failed to parse Chunk from JSON: deserializer returned null");
    }

    /// <summary>
    /// Options used by <see cref="FromJson"/>: enums travel as snake_case strings and
    /// default-valued members are skipped when writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI. Null members are dropped
    /// (a nullable C# field left at null would otherwise overwrite a required Rust field
    /// with a null it cannot deserialise), while explicit false/0 values are kept so the
    /// caller's intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,123 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Describes where a chunk sits in the original document.
/// </summary>
public sealed record ChunkMetadata
{
    /// <summary>
    /// Byte offset in the original text at which the chunk begins (on a valid UTF-8 boundary).
    /// 0 unless set.
    /// </summary>
    [JsonPropertyName("byte_start")]
    public ulong ByteStart { get; init; }

    /// <summary>
    /// Byte offset in the original text at which the chunk finishes (on a valid UTF-8 boundary).
    /// 0 unless set.
    /// </summary>
    [JsonPropertyName("byte_end")]
    public ulong ByteEnd { get; init; }

    /// <summary>
    /// Token count of the chunk, when available.
    ///
    /// Computed by the embedding model's tokenizer if embeddings are enabled.
    /// <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("token_count")]
    public ulong? TokenCount { get; init; }

    /// <summary>
    /// Position of the chunk within the document, counted from zero. 0 unless set.
    /// </summary>
    [JsonPropertyName("chunk_index")]
    public ulong ChunkIndex { get; init; }

    /// <summary>
    /// How many chunks the document has in total. 0 unless set.
    /// </summary>
    [JsonPropertyName("total_chunks")]
    public ulong TotalChunks { get; init; }

    /// <summary>
    /// First page covered by the chunk (1-indexed).
    ///
    /// Present only when page tracking is enabled in the extraction configuration.
    /// </summary>
    [JsonPropertyName("first_page")]
    public uint? FirstPage { get; init; }

    /// <summary>
    /// Last page covered by the chunk (1-indexed; same as first_page for a single-page chunk).
    ///
    /// Present only when page tracking is enabled in the extraction configuration.
    /// </summary>
    [JsonPropertyName("last_page")]
    public uint? LastPage { get; init; }

    /// <summary>
    /// Heading context produced by the Markdown chunker.
    ///
    /// Holds the heading hierarchy the chunk belongs to.
    /// Present only when `ChunkerType.Markdown` is used.
    /// </summary>
    [JsonPropertyName("heading_context")]
    public HeadingContext? HeadingContext { get; init; }

    /// <summary>
    /// Indices into `ExtractionResult.images` for the images on pages this chunk covers.
    ///
    /// Each entry is a zero-based index into the top-level `images` collection, one for
    /// every image whose `page_number` lies within `[first_page, last_page]`.
    /// Empty when image extraction is disabled or none of the chunk's pages has images.
    /// </summary>
    [JsonPropertyName("image_indices")]
    public List<uint> ImageIndices { get; init; } = [];

    /// <summary>
    /// Builds a <see cref="ChunkMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to deserialise.</param>
    /// <returns>The deserialised instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON cannot be deserialised, or it deserialises to <c>null</c>.
    /// </exception>
    public static ChunkMetadata FromJson(string json)
    {
        ChunkMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ChunkMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any foreign failure is wrapped; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse ChunkMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException($"Failed to parse ChunkMetadata from JSON: deserializer returned null");
    }

    /// <summary>
    /// Options used by <see cref="FromJson"/>: enums travel as snake_case strings and
    /// default-valued members are skipped when writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI. Null members are dropped
    /// (a nullable C# field left at null would otherwise overwrite a required Rust field
    /// with a null it cannot deserialise), while explicit false/0 values are kept so the
    /// caller's intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,155 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// How chunk size is measured.
///
/// Defaults to `Characters` (Unicode character count). When using token-based sizing,
/// chunks are sized by token count according to the specified tokenizer.
///
/// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
/// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
/// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
/// </summary>
[JsonConverter(typeof(ChunkSizingJsonConverter))]
public abstract record ChunkSizing
{
    /// <summary>
    /// Size measured in Unicode characters (default).
    /// </summary>
    public sealed record Characters() : ChunkSizing;

    /// <summary>
    /// Size measured in tokens from a HuggingFace tokenizer.
    /// </summary>
    /// <param name="Model">Value of the <c>model</c> JSON field.</param>
    /// <param name="CacheDir">Value of the <c>cache_dir</c> JSON field; may be <c>null</c>.</param>
    public sealed record Tokenizer(
        [property: JsonPropertyName("model")] string Model,
        [property: JsonPropertyName("cache_dir")] string? CacheDir
    ) : ChunkSizing;
}

/// <summary>
/// Custom converter for ChunkSizing sealed union with flattened variant fields.
/// </summary>
/// <remarks>
/// Handles JSON objects with a discriminator field (type) and variant-specific
/// fields at the same level. System.Text.Json's [JsonPolymorphic] cannot handle
/// this layout, so we manually deserialize here.
/// </remarks>
public sealed class ChunkSizingJsonConverter : JsonConverter<ChunkSizing>
{
    /// <summary>
    /// Reads a <c>{"type": "...", ...fields}</c> object and materialises the matching variant.
    /// </summary>
    /// <param name="reader">Reader positioned on the object's start token.</param>
    /// <param name="typeToConvert">Unused; required by the converter contract.</param>
    /// <param name="options">Options forwarded when deserialising the variant's fields.</param>
    /// <returns>The variant selected by the <c>type</c> discriminator.</returns>
    /// <exception cref="JsonException">
    /// The token is not an object, the discriminator is missing, null, not a string,
    /// or unknown, or the variant's fields deserialise to null.
    /// </exception>
    public override ChunkSizing Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty("type", out var tagElement))
        {
            throw new JsonException($"Missing discriminator field: type");
        }

        // GetString() throws InvalidOperationException for numbers, objects, etc.;
        // report a malformed discriminator as a JSON error instead.
        if (tagElement.ValueKind is not (JsonValueKind.String or JsonValueKind.Null))
        {
            throw new JsonException($"Discriminator field must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString();
        if (tagValue == null)
        {
            throw new JsonException("Discriminator field is null");
        }

        switch (tagValue)
        {
            case "characters":
                // Field-less variant: nothing to copy or deserialise.
                return new ChunkSizing.Characters();

            case "tokenizer":
                // Struct-style variant: its positional components carry
                // [JsonPropertyName], so the remaining fields are passed through flat.
                return JsonSerializer.Deserialize<ChunkSizing.Tokenizer>(WithoutDiscriminator(root), options)
                    ?? throw new JsonException("Failed to deserialize variant");

            default:
                throw new JsonException($"Unknown ChunkSizing discriminator: {tagValue}");
        }
    }

    /// <summary>
    /// Writes the variant as its discriminator tag followed by the variant's own fields,
    /// all at the same level, e.g. <c>{"type":"tokenizer","model":...,"cache_dir":...}</c>.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Options forwarded when serialising the variant's fields.</param>
    /// <exception cref="JsonException">The runtime type is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, ChunkSizing value, JsonSerializerOptions options)
    {
        // Resolve the tag before writing anything so an unsupported variant
        // fails without leaving a half-written object behind.
        string tag;
        object? inner;
        switch (value)
        {
            case ChunkSizing.Characters:
                tag = "characters";
                inner = null;
                break;
            case ChunkSizing.Tokenizer tokenizer:
                tag = "tokenizer";
                inner = tokenizer;
                break;
            default:
                throw new JsonException($"Unknown ChunkSizing variant: {value.GetType().Name}");
        }

        writer.WriteStartObject();
        writer.WriteString("type", tag);
        if (inner != null)
        {
            // Serialise under the concrete variant type (which does not resolve to
            // this converter) and splice its properties in next to the tag.
            using var doc = JsonSerializer.SerializeToDocument(inner, inner.GetType(), options);
            if (doc.RootElement.ValueKind == JsonValueKind.Object)
            {
                foreach (var prop in doc.RootElement.EnumerateObject())
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
        }
        writer.WriteEndObject();
    }

    /// <summary>Re-encodes <paramref name="root"/> as UTF-8 JSON with the <c>type</c> property removed.</summary>
    private static byte[] WithoutDiscriminator(JsonElement root)
    {
        using var buffer = new MemoryStream();
        using (var writer = new Utf8JsonWriter(buffer))
        {
            writer.WriteStartObject();
            foreach (var prop in root.EnumerateObject())
            {
                if (prop.Name != "type")
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
            writer.WriteEndObject();
        }

        // Disposing the writer above flushed it, so the buffer is complete here.
        return buffer.ToArray();
    }
}

136
packages/csharp/src/Kreuzberg/ChunkType.cs generated Normal file
View File

@@ -0,0 +1,136 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Semantic structural classification of a text chunk.
///
/// Produced by the heuristic classifier in `chunking.classifier`; `Unknown` is used
/// when no rule matches. Intended to gain members in later versions without
/// breaking changes.
/// </summary>
[JsonConverter(typeof(ChunkTypeJsonConverter))]
public enum ChunkType
{
    /// <summary>
    /// A section heading or the document title.
    /// </summary>
    [JsonPropertyName("heading")]
    Heading,

    /// <summary>
    /// A party list: names, addresses, and signatories.
    /// </summary>
    [JsonPropertyName("party_list")]
    PartyList,

    /// <summary>
    /// A definition clause ("X means…", "X shall mean…").
    /// </summary>
    [JsonPropertyName("definitions")]
    Definitions,

    /// <summary>
    /// An operative clause containing legal/contractual action verbs.
    /// </summary>
    [JsonPropertyName("operative_clause")]
    OperativeClause,

    /// <summary>
    /// A signature block with signatures, names, and dates.
    /// </summary>
    [JsonPropertyName("signature_block")]
    SignatureBlock,

    /// <summary>
    /// A schedule, annex, appendix, or exhibit section.
    /// </summary>
    [JsonPropertyName("schedule")]
    Schedule,

    /// <summary>
    /// Table-like content: aligned columns or repeated patterns.
    /// </summary>
    [JsonPropertyName("table_like")]
    TableLike,

    /// <summary>
    /// A mathematical formula or equation.
    /// </summary>
    [JsonPropertyName("formula")]
    Formula,

    /// <summary>
    /// A code block or other preformatted content.
    /// </summary>
    [JsonPropertyName("code_block")]
    CodeBlock,

    /// <summary>
    /// Embedded or referenced image content.
    /// </summary>
    [JsonPropertyName("image")]
    Image,

    /// <summary>
    /// An organizational chart or hierarchy diagram.
    /// </summary>
    [JsonPropertyName("org_chart")]
    OrgChart,

    /// <summary>
    /// A diagram, figure, or visual illustration.
    /// </summary>
    [JsonPropertyName("diagram")]
    Diagram,

    /// <summary>
    /// Unclassified or mixed content.
    /// </summary>
    [JsonPropertyName("unknown")]
    Unknown,
}

/// <summary>
/// JSON converter for <see cref="ChunkType"/> that maps each member to its explicit
/// snake_case wire name.
/// </summary>
internal sealed class ChunkTypeJsonConverter : JsonConverter<ChunkType>
{
    /// <summary>Maps a wire name to its enum member.</summary>
    /// <exception cref="JsonException">The string is not a known wire name.</exception>
    public override ChunkType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "heading": return ChunkType.Heading;
            case "party_list": return ChunkType.PartyList;
            case "definitions": return ChunkType.Definitions;
            case "operative_clause": return ChunkType.OperativeClause;
            case "signature_block": return ChunkType.SignatureBlock;
            case "schedule": return ChunkType.Schedule;
            case "table_like": return ChunkType.TableLike;
            case "formula": return ChunkType.Formula;
            case "code_block": return ChunkType.CodeBlock;
            case "image": return ChunkType.Image;
            case "org_chart": return ChunkType.OrgChart;
            case "diagram": return ChunkType.Diagram;
            case "unknown": return ChunkType.Unknown;
            default: throw new JsonException($"Unknown ChunkType value: {wireName}");
        }
    }

    /// <summary>Writes the enum member as its wire name.</summary>
    /// <exception cref="JsonException">The value is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ChunkType value, JsonSerializerOptions options)
    {
        // Resolve the name first so an undeclared value throws before anything is written.
        string wireName;
        switch (value)
        {
            case ChunkType.Heading: wireName = "heading"; break;
            case ChunkType.PartyList: wireName = "party_list"; break;
            case ChunkType.Definitions: wireName = "definitions"; break;
            case ChunkType.OperativeClause: wireName = "operative_clause"; break;
            case ChunkType.SignatureBlock: wireName = "signature_block"; break;
            case ChunkType.Schedule: wireName = "schedule"; break;
            case ChunkType.TableLike: wireName = "table_like"; break;
            case ChunkType.Formula: wireName = "formula"; break;
            case ChunkType.CodeBlock: wireName = "code_block"; break;
            case ChunkType.Image: wireName = "image"; break;
            case ChunkType.OrgChart: wireName = "org_chart"; break;
            case ChunkType.Diagram: wireName = "diagram"; break;
            case ChunkType.Unknown: wireName = "unknown"; break;
            default: throw new JsonException($"Unknown ChunkType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,70 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Selects which text chunker is used.
///
/// Members:
///
/// * `Text` - generic text splitter; splits on whitespace and punctuation.
/// * `Markdown` - Markdown-aware splitter; preserves formatting and structure.
/// * `Yaml` - YAML-aware splitter; creates one chunk per top-level key.
/// * `Semantic` - topic-aware chunker. With an `EmbeddingConfig`, it splits at
///   embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
///   lower = more splits). Without an embedding, it falls back to a
///   structural-boundary heuristic (ALL-CAPS headers, numbered sections,
///   blank-line paragraphs) and merges groups into chunks capped at
///   `max_characters` (default 1000). `topic_threshold` has no effect in the
///   fallback path. For best results, pair with an embedding model.
/// </summary>
[JsonConverter(typeof(ChunkerTypeJsonConverter))]
public enum ChunkerType
{
    /// <summary>Wire name <c>text</c>.</summary>
    [JsonPropertyName("text")]
    Text,

    /// <summary>Wire name <c>markdown</c>.</summary>
    [JsonPropertyName("markdown")]
    Markdown,

    /// <summary>Wire name <c>yaml</c>.</summary>
    [JsonPropertyName("yaml")]
    Yaml,

    /// <summary>Wire name <c>semantic</c>.</summary>
    [JsonPropertyName("semantic")]
    Semantic,
}

/// <summary>
/// JSON converter for <see cref="ChunkerType"/> that maps each member to its explicit
/// lowercase wire name.
/// </summary>
internal sealed class ChunkerTypeJsonConverter : JsonConverter<ChunkerType>
{
    /// <summary>Maps a wire name to its enum member.</summary>
    /// <exception cref="JsonException">The string is not a known wire name.</exception>
    public override ChunkerType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "text": return ChunkerType.Text;
            case "markdown": return ChunkerType.Markdown;
            case "yaml": return ChunkerType.Yaml;
            case "semantic": return ChunkerType.Semantic;
            default: throw new JsonException($"Unknown ChunkerType value: {wireName}");
        }
    }

    /// <summary>Writes the enum member as its wire name.</summary>
    /// <exception cref="JsonException">The value is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ChunkerType value, JsonSerializerOptions options)
    {
        // Resolve the name first so an undeclared value throws before anything is written.
        string wireName;
        switch (value)
        {
            case ChunkerType.Text: wireName = "text"; break;
            case ChunkerType.Markdown: wireName = "markdown"; break;
            case ChunkerType.Yaml: wireName = "yaml"; break;
            case ChunkerType.Semantic: wireName = "semantic"; break;
            default: throw new JsonException($"Unknown ChunkerType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,151 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Chunking configuration.
///
/// Configures text chunking for document content, including chunk size,
/// overlap, trimming behavior, and optional embeddings.
///
/// Call <see cref="Default"/> to obtain the native library's default values.
/// </summary>
public sealed record ChunkingConfig
{
    /// <summary>
    /// Maximum size per chunk (in units determined by `sizing`).
    ///
    /// When `sizing` is `Characters` (default), this is the max character count.
    /// When using token-based sizing, this is the max token count.
    /// Serialised as <c>max_chars</c>.
    ///
    /// Default: 1000
    /// </summary>
    [JsonPropertyName("max_chars")]
    public ulong MaxCharacters { get; init; } = 1000;

    /// <summary>
    /// Overlap between chunks (in units determined by `sizing`).
    /// Serialised as <c>max_overlap</c>.
    ///
    /// Default: 200
    /// </summary>
    [JsonPropertyName("max_overlap")]
    public ulong Overlap { get; init; } = 200;

    /// <summary>
    /// Whether to trim whitespace from chunk boundaries.
    ///
    /// Default: true
    /// </summary>
    [JsonPropertyName("trim")]
    public bool Trim { get; init; } = true;

    /// <summary>
    /// Type of chunker to use (Text, Markdown, Yaml or Semantic).
    ///
    /// Default: Text
    /// </summary>
    [JsonPropertyName("chunker_type")]
    public ChunkerType ChunkerType { get; init; } = ChunkerType.Text;

    /// <summary>
    /// Optional embedding configuration for chunk embeddings.
    /// </summary>
    [JsonPropertyName("embedding")]
    public EmbeddingConfig? Embedding { get; init; } = null;

    /// <summary>
    /// Use a preset configuration (overrides individual settings if provided).
    /// </summary>
    [JsonPropertyName("preset")]
    public string? Preset { get; init; } = null;

    /// <summary>
    /// How to measure chunk size.
    ///
    /// Default: `Characters` (Unicode character count).
    /// Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
    /// </summary>
    [JsonPropertyName("sizing")]
    public ChunkSizing? Sizing { get; init; } = null;

    /// <summary>
    /// When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
    /// path (e.g. `"# Title &gt; ## Section\n\n"`) to each chunk's content string.
    ///
    /// This is useful for RAG pipelines where each chunk needs self-contained
    /// context about its position in the document structure.
    ///
    /// Default: `false`
    /// </summary>
    [JsonPropertyName("prepend_heading_context")]
    public bool PrependHeadingContext { get; init; } = false;

    /// <summary>
    /// Optional cosine similarity threshold for semantic topic boundary detection.
    ///
    /// Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
    /// provided. You almost never need to set this. When omitted, defaults to
    /// `0.75` which works well for most documents. Lower values detect more
    /// topic boundaries (more, smaller chunks); higher values detect fewer.
    /// Range: `0.0..=1.0`.
    /// </summary>
    [JsonPropertyName("topic_threshold")]
    public float? TopicThreshold { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="ChunkingConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON document to deserialise.</param>
    /// <returns>The deserialised configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ChunkingConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ChunkingConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ChunkingConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ChunkingConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>
    /// Options used when reading JSON: enums travel as snake_case strings and
    /// default-valued members are skipped when writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Fetches the default configuration from the native library.
    /// </summary>
    /// <remarks>
    /// Calls <c>NativeMethods.ChunkingConfigDefault</c>, asks the native side for its JSON
    /// form via <c>NativeMethods.ChunkingConfigToJson</c>, and deserialises that JSON.
    /// Both the native string and the native config handle are released even when
    /// an intermediate step throws.
    /// </remarks>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The native library produced no JSON, or the JSON deserialised to <c>null</c>.
    /// </exception>
    /// <exception cref="JsonException">The native JSON is malformed.</exception>
    public static ChunkingConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.ChunkingConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ChunkingConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            NativeMethods.ChunkingConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path deserialised the literal "null" and returned a
            // null reference from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default ChunkingConfig: native library returned no JSON");
        }

        return JsonSerializer.Deserialize<ChunkingConfig>(json, JsonOptions)
            ?? throw new KreuzbergException("Failed to obtain default ChunkingConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing a citation file (RIS, PubMed, EndNote).
/// </summary>
/// <remarks>
/// NOTE(review): the individual members carry no upstream description; their meaning
/// is inferred from their names — confirm against the native definition.
/// </remarks>
public sealed record CitationMetadata
{
    /// <summary>Value of the <c>citation_count</c> JSON field; 0 unless set.</summary>
    [JsonPropertyName("citation_count")]
    public ulong CitationCount { get; init; }

    /// <summary>Value of the <c>format</c> JSON field; <c>null</c> unless set.</summary>
    [JsonPropertyName("format")]
    public string? Format { get; init; }

    /// <summary>Value of the <c>authors</c> JSON field; an empty list unless set.</summary>
    [JsonPropertyName("authors")]
    public List<string> Authors { get; init; } = [];

    /// <summary>Value of the <c>year_range</c> JSON field; <c>null</c> unless set.</summary>
    [JsonPropertyName("year_range")]
    public YearRange? YearRange { get; init; }

    /// <summary>Value of the <c>dois</c> JSON field; an empty list unless set.</summary>
    [JsonPropertyName("dois")]
    public List<string> Dois { get; init; } = [];

    /// <summary>Value of the <c>keywords</c> JSON field; an empty list unless set.</summary>
    [JsonPropertyName("keywords")]
    public List<string> Keywords { get; init; } = [];

    /// <summary>
    /// Builds a <see cref="CitationMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to deserialise.</param>
    /// <returns>The deserialised instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON cannot be deserialised, or it deserialises to <c>null</c>.
    /// </exception>
    public static CitationMetadata FromJson(string json)
    {
        CitationMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CitationMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any foreign failure is wrapped; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse CitationMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException($"Failed to parse CitationMetadata from JSON: deserializer returned null");
    }

    /// <summary>
    /// Options used by <see cref="FromJson"/>: enums travel as snake_case strings and
    /// default-valued members are skipped when writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI. Null members are dropped
    /// (a nullable C# field left at null would otherwise overwrite a required Rust field
    /// with a null it cannot deserialise), while explicit false/0 values are kept so the
    /// caller's intent survives.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Rendering mode for extracted code content.
///
/// Determines how extracted code is represented in the `content` field
/// of `ExtractionResult`.
/// </summary>
[JsonConverter(typeof(CodeContentModeJsonConverter))]
public enum CodeContentMode
{
    /// <summary>
    /// TSLP semantic chunks become the content (default).
    /// </summary>
    [JsonPropertyName("chunks")]
    Chunks,

    /// <summary>
    /// The raw source code becomes the content.
    /// </summary>
    [JsonPropertyName("raw")]
    Raw,

    /// <summary>
    /// Function/class headings plus docstrings are emitted (no code bodies).
    /// </summary>
    [JsonPropertyName("structure")]
    Structure,
}

/// <summary>
/// JSON converter for <see cref="CodeContentMode"/> that maps each member to its explicit
/// lowercase wire name.
/// </summary>
internal sealed class CodeContentModeJsonConverter : JsonConverter<CodeContentMode>
{
    /// <summary>Maps a wire name to its enum member.</summary>
    /// <exception cref="JsonException">The string is not a known wire name.</exception>
    public override CodeContentMode Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "chunks": return CodeContentMode.Chunks;
            case "raw": return CodeContentMode.Raw;
            case "structure": return CodeContentMode.Structure;
            default: throw new JsonException($"Unknown CodeContentMode value: {wireName}");
        }
    }

    /// <summary>Writes the enum member as its wire name.</summary>
    /// <exception cref="JsonException">The value is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, CodeContentMode value, JsonSerializerOptions options)
    {
        // Resolve the name first so an undeclared value throws before anything is written.
        string wireName;
        switch (value)
        {
            case CodeContentMode.Chunks: wireName = "chunks"; break;
            case CodeContentMode.Raw: wireName = "raw"; break;
            case CodeContentMode.Structure: wireName = "structure"; break;
            default: throw new JsonException($"Unknown CodeContentMode value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,132 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Cross-extractor content filtering configuration.
///
/// Controls whether "furniture" content (headers, footers, page numbers,
/// watermarks, repeating text) is included in or stripped from extraction
/// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
/// with format-specific implementation.
///
/// When `None` on `ExtractionConfig`, each extractor uses its current
/// default behavior unchanged.
/// </summary>
public sealed record ContentFilterConfig
{
    /// <summary>
    /// Include running headers in extraction output.
    ///
    /// - PDF: Disables top-margin furniture stripping and prevents the layout
    /// model from treating `PageHeader`-classified regions as furniture.
    /// - DOCX: Includes document headers in text output.
    /// - RTF/ODT: Headers already included; this is a no-op when true.
    /// - HTML/EPUB: Keeps `&lt;header&gt;` element content.
    ///
    /// Default: `false` (headers are stripped or excluded).
    /// </summary>
    [JsonPropertyName("include_headers")]
    public bool IncludeHeaders { get; init; } = false;

    /// <summary>
    /// Include running footers in extraction output.
    ///
    /// - PDF: Disables bottom-margin furniture stripping and prevents the layout
    /// model from treating `PageFooter`-classified regions as furniture.
    /// - DOCX: Includes document footers in text output.
    /// - RTF/ODT: Footers already included; this is a no-op when true.
    /// - HTML/EPUB: Keeps `&lt;footer&gt;` element content.
    ///
    /// Default: `false` (footers are stripped or excluded).
    /// </summary>
    [JsonPropertyName("include_footers")]
    public bool IncludeFooters { get; init; } = false;

    /// <summary>
    /// Enable the heuristic cross-page repeating text detector.
    ///
    /// When `true` (default), text that repeats verbatim across a supermajority
    /// of pages is classified as furniture and stripped. Disable this if brand
    /// names or repeated headings are being incorrectly removed by the heuristic.
    ///
    /// Note: when a layout-detection model is active, the model may independently
    /// classify page-header / page-footer regions as furniture on a per-page basis.
    /// To preserve those regions, set `include_headers = true`, `include_footers = true`,
    /// or both, in addition to disabling this flag.
    ///
    /// Primarily affects PDF extraction.
    ///
    /// Default: `true`.
    /// </summary>
    [JsonPropertyName("strip_repeating_text")]
    public bool StripRepeatingText { get; init; } = true;

    /// <summary>
    /// Include watermark text in extraction output.
    ///
    /// - PDF: Keeps watermark artifacts and arXiv identifiers.
    /// - Other formats: No effect currently.
    ///
    /// Default: `false` (watermarks are stripped).
    /// </summary>
    [JsonPropertyName("include_watermarks")]
    public bool IncludeWatermarks { get; init; } = false;

    /// <summary>
    /// Parse a <see cref="ContentFilterConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the configuration.</param>
    /// <returns>The deserialized configuration; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ContentFilterConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ContentFilterConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ContentFilterConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's exception type (the null case above): pass through unwrapped.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ContentFilterConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when deserializing this type from JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Builds the default configuration by asking the native library for its default
    /// value (<c>NativeMethods.ContentFilterConfigDefault</c>), rendering it to JSON
    /// (<c>NativeMethods.ContentFilterConfigToJson</c>) and deserializing that JSON.
    /// </summary>
    /// <returns>The default configuration; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The native library produced no JSON, or the JSON deserialized to null.
    /// </exception>
    public static ContentFilterConfig Default()
    {
        var nativeResult = NativeMethods.ContentFilterConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ContentFilterConfigToJson(nativeResult);
            string? json;
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even when marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }

            if (json is null)
            {
                // Previously this fell through as the literal "null" and the method
                // returned null despite its non-nullable return type.
                throw new KreuzbergException("Failed to load default ContentFilterConfig: native library returned no JSON");
            }

            return JsonSerializer.Deserialize<ContentFilterConfig>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to load default ContentFilterConfig: deserializer returned null");
        }
        finally
        {
            // Always release the native config, including on failure paths.
            NativeMethods.ContentFilterConfigFree(nativeResult);
        }
    }
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Content layer classification for document nodes.
///
/// Replaces separate body/furniture arrays with per-node granularity.
/// JSON (de)serialization goes through <see cref="ContentLayerJsonConverter"/>,
/// which maps each member to the lowercase wire name shown on it.
/// </summary>
[JsonConverter(typeof(ContentLayerJsonConverter))]
public enum ContentLayer
{
    /// <summary>
    /// Main document body content. Wire name: <c>body</c>.
    /// </summary>
    [JsonPropertyName("body")]
    Body,

    /// <summary>
    /// Page/section header (running header). Wire name: <c>header</c>.
    /// </summary>
    [JsonPropertyName("header")]
    Header,

    /// <summary>
    /// Page/section footer (running footer). Wire name: <c>footer</c>.
    /// </summary>
    [JsonPropertyName("footer")]
    Footer,

    /// <summary>
    /// Footnote content. Wire name: <c>footnote</c>.
    /// </summary>
    [JsonPropertyName("footnote")]
    Footnote,
}

/// <summary>
/// JSON converter for <see cref="ContentLayer"/> that reads and writes the
/// explicit variant names (<c>body</c>, <c>header</c>, <c>footer</c>, <c>footnote</c>).
/// </summary>
internal sealed class ContentLayerJsonConverter : JsonConverter<ContentLayer>
{
    /// <summary>Reads the current JSON string token and returns the matching member.</summary>
    /// <exception cref="JsonException">The string is not a known variant name.</exception>
    public override ContentLayer Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "body":
                return ContentLayer.Body;
            case "header":
                return ContentLayer.Header;
            case "footer":
                return ContentLayer.Footer;
            case "footnote":
                return ContentLayer.Footnote;
            default:
                throw new JsonException($"Unknown ContentLayer value: {wireName}");
        }
    }

    /// <summary>Writes <paramref name="value"/> as its variant-name JSON string.</summary>
    /// <exception cref="JsonException"><paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ContentLayer value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ContentLayer.Body:
                wireName = "body";
                break;
            case ContentLayer.Header:
                wireName = "header";
                break;
            case ContentLayer.Footer:
                wireName = "footer";
                break;
            case ContentLayer.Footnote:
                wireName = "footnote";
                break;
            default:
                throw new JsonException($"Unknown ContentLayer value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// JATS contributor with role.
/// </summary>
public sealed record ContributorRole
{
    /// <summary>Contributor name; required. JSON key: <c>name</c>.</summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>Contributor role, or null when absent. JSON key: <c>role</c>.</summary>
    [JsonPropertyName("role")]
    public string? Role { get; init; }

    /// <summary>
    /// Parse a <see cref="ContributorRole"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ContributorRole FromJson(string json)
    {
        ContributorRole? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ContributorRole>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse ContributorRole from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse ContributorRole from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,149 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Dublin Core metadata from docProps/core.xml
///
/// Contains standard metadata fields defined by the Dublin Core standard
/// and Office-specific extensions. Every field is optional and is null when absent.
/// </summary>
public sealed record CoreProperties
{
    /// <summary>Document title. JSON key: <c>title</c>.</summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>Document subject/topic. JSON key: <c>subject</c>.</summary>
    [JsonPropertyName("subject")]
    public string? Subject { get; init; }

    /// <summary>Document creator/author. JSON key: <c>creator</c>.</summary>
    [JsonPropertyName("creator")]
    public string? Creator { get; init; }

    /// <summary>Keywords or tags. JSON key: <c>keywords</c>.</summary>
    [JsonPropertyName("keywords")]
    public string? Keywords { get; init; }

    /// <summary>Document description/abstract. JSON key: <c>description</c>.</summary>
    [JsonPropertyName("description")]
    public string? Description { get; init; }

    /// <summary>User who last modified the document. JSON key: <c>last_modified_by</c>.</summary>
    [JsonPropertyName("last_modified_by")]
    public string? LastModifiedBy { get; init; }

    /// <summary>Revision number. JSON key: <c>revision</c>.</summary>
    [JsonPropertyName("revision")]
    public string? Revision { get; init; }

    /// <summary>Creation timestamp (ISO 8601). JSON key: <c>created</c>.</summary>
    [JsonPropertyName("created")]
    public string? Created { get; init; }

    /// <summary>Last modification timestamp (ISO 8601). JSON key: <c>modified</c>.</summary>
    [JsonPropertyName("modified")]
    public string? Modified { get; init; }

    /// <summary>Document category. JSON key: <c>category</c>.</summary>
    [JsonPropertyName("category")]
    public string? Category { get; init; }

    /// <summary>Content status (Draft, Final, etc.). JSON key: <c>content_status</c>.</summary>
    [JsonPropertyName("content_status")]
    public string? ContentStatus { get; init; }

    /// <summary>Document language. JSON key: <c>language</c>.</summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; }

    /// <summary>Unique identifier. JSON key: <c>identifier</c>.</summary>
    [JsonPropertyName("identifier")]
    public string? Identifier { get; init; }

    /// <summary>Document version. JSON key: <c>version</c>.</summary>
    [JsonPropertyName("version")]
    public string? Version { get; init; }

    /// <summary>Last print timestamp (ISO 8601). JSON key: <c>last_printed</c>.</summary>
    [JsonPropertyName("last_printed")]
    public string? LastPrinted { get; init; }

    /// <summary>
    /// Parse a <see cref="CoreProperties"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static CoreProperties FromJson(string json)
    {
        CoreProperties? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CoreProperties>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse CoreProperties from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse CoreProperties from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// CSV/TSV file metadata.
/// </summary>
public sealed record CsvMetadata
{
    /// <summary>Value of the <c>row_count</c> JSON field; 0 when not supplied.</summary>
    [JsonPropertyName("row_count")]
    public uint RowCount { get; init; }

    /// <summary>Value of the <c>column_count</c> JSON field; 0 when not supplied.</summary>
    [JsonPropertyName("column_count")]
    public uint ColumnCount { get; init; }

    /// <summary>Value of the <c>delimiter</c> JSON field; null when not supplied.</summary>
    [JsonPropertyName("delimiter")]
    public string? Delimiter { get; init; }

    /// <summary>Value of the <c>has_header</c> JSON field; false when not supplied.</summary>
    [JsonPropertyName("has_header")]
    public bool HasHeader { get; init; }

    /// <summary>Value of the <c>column_types</c> JSON field; null when not supplied.</summary>
    [JsonPropertyName("column_types")]
    public List<string>? ColumnTypes { get; init; }

    /// <summary>
    /// Parse a <see cref="CsvMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static CsvMetadata FromJson(string json)
    {
        CsvMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CsvMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse CsvMetadata from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse CsvMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// dBASE field information.
/// </summary>
public sealed record DbfFieldInfo
{
    /// <summary>Value of the <c>name</c> JSON field; required.</summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>Value of the <c>field_type</c> JSON field; required.</summary>
    [JsonPropertyName("field_type")]
    public required string FieldType { get; init; }

    /// <summary>
    /// Parse a <see cref="DbfFieldInfo"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DbfFieldInfo FromJson(string json)
    {
        DbfFieldInfo? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DbfFieldInfo>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse DbfFieldInfo from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse DbfFieldInfo from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// dBASE (DBF) file metadata.
/// </summary>
public sealed record DbfMetadata
{
    /// <summary>Value of the <c>record_count</c> JSON field; 0 when not supplied.</summary>
    [JsonPropertyName("record_count")]
    public ulong RecordCount { get; init; }

    /// <summary>Value of the <c>field_count</c> JSON field; 0 when not supplied.</summary>
    [JsonPropertyName("field_count")]
    public ulong FieldCount { get; init; }

    /// <summary>Value of the <c>fields</c> JSON field; an empty list when not supplied.</summary>
    [JsonPropertyName("fields")]
    public List<DbfFieldInfo> Fields { get; init; } = new();

    /// <summary>
    /// Parse a <see cref="DbfMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DbfMetadata FromJson(string json)
    {
        DbfMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DbfMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse DbfMetadata from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse DbfMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// MIME type detection response.
/// </summary>
public sealed record DetectResponse
{
    /// <summary>Detected MIME type; required. JSON key: <c>mime_type</c>.</summary>
    [JsonPropertyName("mime_type")]
    public required string MimeType { get; init; }

    /// <summary>Original filename (if provided), otherwise null. JSON key: <c>filename</c>.</summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; }

    /// <summary>
    /// Parse a <see cref="DetectResponse"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DetectResponse FromJson(string json)
    {
        DetectResponse? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DetectResponse>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse DetectResponse from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse DetectResponse from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Page-level detection result containing all detections and page metadata.
/// </summary>
public sealed record DetectionResult
{
    /// <summary>Value of the <c>page_width</c> JSON field; 0 when not supplied.</summary>
    [JsonPropertyName("page_width")]
    public uint PageWidth { get; init; }

    /// <summary>Value of the <c>page_height</c> JSON field; 0 when not supplied.</summary>
    [JsonPropertyName("page_height")]
    public uint PageHeight { get; init; }

    /// <summary>Value of the <c>detections</c> JSON field; an empty list when not supplied.</summary>
    [JsonPropertyName("detections")]
    public List<LayoutDetection> Detections { get; init; } = new();

    /// <summary>
    /// Parse a <see cref="DetectionResult"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DetectionResult FromJson(string json)
    {
        DetectionResult? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DetectionResult>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse DetectionResult from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse DetectionResult from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single contiguous hunk in a unified diff.
/// </summary>
public sealed record DiffHunk
{
    /// <summary>Starting line number in the old content (0-indexed). JSON key: <c>from_line</c>.</summary>
    [JsonPropertyName("from_line")]
    public ulong FromLine { get; init; }

    /// <summary>Number of lines from the old content in this hunk. JSON key: <c>from_count</c>.</summary>
    [JsonPropertyName("from_count")]
    public ulong FromCount { get; init; }

    /// <summary>Starting line number in the new content (0-indexed). JSON key: <c>to_line</c>.</summary>
    [JsonPropertyName("to_line")]
    public ulong ToLine { get; init; }

    /// <summary>Number of lines from the new content in this hunk. JSON key: <c>to_count</c>.</summary>
    [JsonPropertyName("to_count")]
    public ulong ToCount { get; init; }

    /// <summary>Lines that make up this hunk; an empty list when not supplied. JSON key: <c>lines</c>.</summary>
    [JsonPropertyName("lines")]
    public List<DiffLine> Lines { get; init; } = new();

    /// <summary>
    /// Parse a <see cref="DiffHunk"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialize.</param>
    /// <returns>The deserialized instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DiffHunk FromJson(string json)
    {
        DiffHunk? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DiffHunk>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers only deal with KreuzbergException.
            throw new KreuzbergException($"Failed to parse DiffHunk from JSON: {e.Message}", e);
        }

        // A JSON literal `null` deserializes to a null reference; report it as a parse failure.
        return parsed ?? throw new KreuzbergException("Failed to parse DiffHunk from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates options with the given ignore condition and snake_case string enums.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var built = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        built.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return built;
    }
}

168
packages/csharp/src/Kreuzberg/DiffLine.cs generated Normal file
View File

@@ -0,0 +1,168 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single line in a unified-diff hunk.
///
/// Defined here (rather than only in `crate.diff`) so `RevisionDelta` can
/// reference it unconditionally, without requiring the `diff` Cargo feature.
/// `crate.diff` re-exports this type verbatim.
/// </summary>
[JsonConverter(typeof(DiffLineJsonConverter))]
public abstract record DiffLine
{
    /// <summary>
    /// Unchanged context line.
    /// </summary>
    public sealed record Context(
        string Value
    ) : DiffLine;

    /// <summary>
    /// Line added in the "after" version.
    /// </summary>
    public sealed record Added(
        string Value
    ) : DiffLine;

    /// <summary>
    /// Line removed from the "before" version.
    /// </summary>
    public sealed record Removed(
        string Value
    ) : DiffLine;

    /// <summary>Returns the Context data if this is a Context variant, otherwise null.</summary>
    public string? AsContext => this is Context e ? e.Value : null;

    /// <summary>Returns the Added data if this is an Added variant, otherwise null.</summary>
    public string? AsAdded => this is Added e ? e.Value : null;

    /// <summary>Returns the Removed data if this is a Removed variant, otherwise null.</summary>
    public string? AsRemoved => this is Removed e ? e.Value : null;
}

/// <summary>
/// Custom converter for the <see cref="DiffLine"/> union: a JSON object with a
/// <c>kind</c> discriminator plus one string field holding the line text.
/// </summary>
/// <remarks>
/// Every variant wraps a plain string, so the payload cannot be "flattened" into
/// the object the way struct variants are. The previous implementation tried to:
/// <c>Write</c> silently dropped the text (only <c>{"kind":...}</c> was emitted) and
/// <c>Read</c> re-wrapped the remaining fields as an object under <c>Value</c>, which
/// can never bind to a <c>string</c>. This version reads and writes the text directly.
/// </remarks>
public sealed class DiffLineJsonConverter : JsonConverter<DiffLine>
{
    /// <summary>Name of the discriminator field.</summary>
    private const string DiscriminatorProperty = "kind";

    /// <summary>Name of the field that carries the line text when writing.</summary>
    // NOTE(review): assumed key. Confirm it matches the serde `content = "..."`
    // attribute on the Rust `DiffLine` enum. Read() does not depend on it.
    private const string PayloadProperty = "value";

    /// <summary>
    /// Reads a <c>{"kind": "...", "&lt;payload&gt;": "..."}</c> object into the matching variant.
    /// </summary>
    /// <exception cref="JsonException">
    /// The token is not an object, the discriminator is missing, null, non-string or
    /// unknown, or no unambiguous string payload field is present.
    /// </exception>
    public override DiffLine Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(DiscriminatorProperty, out var tagElement))
        {
            throw new JsonException($"Missing discriminator field: kind");
        }

        if (tagElement.ValueKind == JsonValueKind.Null)
        {
            throw new JsonException("Discriminator field is null");
        }

        if (tagElement.ValueKind != JsonValueKind.String)
        {
            throw new JsonException($"Discriminator field must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString();

        // The tag is validated before the payload so an unknown kind is reported as such.
        return tagValue switch
        {
            "context" => new DiffLine.Context(ReadPayload(root)),
            "added" => new DiffLine.Added(ReadPayload(root)),
            "removed" => new DiffLine.Removed(ReadPayload(root)),
            _ => throw new JsonException($"Unknown DiffLine discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Writes <paramref name="value"/> as an object with the <c>kind</c> tag and the
    /// line text under the payload field. A null text is omitted, leaving only the tag.
    /// </summary>
    /// <exception cref="JsonException"><paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, DiffLine value, JsonSerializerOptions options)
    {
        var (tag, text) = value switch
        {
            DiffLine.Context context => ("context", context.Value),
            DiffLine.Added added => ("added", added.Value),
            DiffLine.Removed removed => ("removed", removed.Value),
            _ => throw new JsonException($"Unknown DiffLine variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString(DiscriminatorProperty, tag);
        if (text is not null)
        {
            writer.WriteString(PayloadProperty, text);
        }

        writer.WriteEndObject();
    }

    /// <summary>
    /// Extracts the line text: the payload field if present, otherwise the only
    /// other string-valued field. Non-string fields are ignored.
    /// </summary>
    private static string ReadPayload(JsonElement root)
    {
        string? fallback = null;
        var candidates = 0;
        foreach (var prop in root.EnumerateObject())
        {
            if (prop.Name == DiscriminatorProperty || prop.Value.ValueKind != JsonValueKind.String)
            {
                continue;
            }

            if (prop.Name == PayloadProperty)
            {
                return prop.Value.GetString() ?? string.Empty;
            }

            fallback = prop.Value.GetString();
            candidates++;
        }

        if (candidates == 1 && fallback is not null)
        {
            return fallback;
        }

        throw new JsonException("DiffLine object must carry exactly one string payload field next to the discriminator");
    }
}

View File

@@ -0,0 +1,87 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Options controlling how two <c>ExtractionResult</c> values are compared.
/// </summary>
public sealed record DiffOptions
{
    /// <summary>
    /// Include metadata changes in the diff. Default: <c>true</c>.
    /// </summary>
    [JsonPropertyName("include_metadata")]
    public bool IncludeMetadata { get; init; } = true;

    /// <summary>
    /// Include embedded-children changes in the diff. Default: <c>true</c>.
    /// </summary>
    [JsonPropertyName("include_embedded")]
    public bool IncludeEmbedded { get; init; } = true;

    /// <summary>
    /// Truncate content to this many characters before diffing.
    ///
    /// Useful for very large documents where only the first N characters matter.
    /// <c>null</c> means no truncation.
    /// </summary>
    [JsonPropertyName("max_content_chars")]
    public ulong? MaxContentChars { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="DiffOptions"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the options.</param>
    /// <returns>The deserialized options; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DiffOptions FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<DiffOptions>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse DiffOptions from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's exception type (the null case above): pass through unchanged.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse DiffOptions from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Creates a <see cref="DiffOptions"/> populated with the defaults reported by the
    /// native library: the native default object is rendered to JSON and deserialized.
    /// </summary>
    /// <returns>The default options; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the native layer yields no JSON payload.</exception>
    public static DiffOptions Default()
    {
        string? json;
        var nativeResult = NativeMethods.DiffOptionsDefault();
        try
        {
            var jsonPtr = NativeMethods.DiffOptionsToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native object on every path, including failures above.
            NativeMethods.DiffOptionsFree(nativeResult);
        }

        // A missing string or a JSON "null" makes Deserialize return null; surface that
        // as an error instead of handing callers a null typed as non-nullable.
        return JsonSerializer.Deserialize<DiffOptions>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to load default DiffOptions: native library returned no JSON");
    }
}

View File

@@ -0,0 +1,114 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Full Djot document structure with semantic preservation.
///
/// Captures the richness of Djot markup:
/// - block-level structures (headings, lists, blockquotes, code blocks, ...)
/// - inline formatting (emphasis, strong, highlight, subscript, superscript, ...)
/// - attributes (classes, IDs, key-value pairs)
/// - links, images and footnotes
/// - math expressions (inline and display)
/// - tables with full structure
///
/// Available when the <c>djot</c> feature is enabled.
/// </summary>
public sealed record DjotContent
{
    /// <summary>Plain-text rendering, kept for backwards compatibility.</summary>
    [JsonPropertyName("plain_text")]
    public required string PlainText { get; init; }

    /// <summary>Structured block-level content.</summary>
    [JsonPropertyName("blocks")]
    public List<FormattedBlock> Blocks { get; init; } = new();

    /// <summary>Metadata taken from YAML frontmatter.</summary>
    [JsonPropertyName("metadata")]
    public required Metadata Metadata { get; init; }

    /// <summary>Tables extracted as structured data.</summary>
    [JsonPropertyName("tables")]
    public List<Table> Tables { get; init; } = new();

    /// <summary>Images extracted together with their metadata.</summary>
    [JsonPropertyName("images")]
    public List<DjotImage> Images { get; init; } = new();

    /// <summary>Links extracted together with their URLs.</summary>
    [JsonPropertyName("links")]
    public List<DjotLink> Links { get; init; } = new();

    /// <summary>Footnote definitions.</summary>
    [JsonPropertyName("footnotes")]
    public List<Footnote> Footnotes { get; init; } = new();

    /// <summary>Attributes mapped by element identifier (if present).</summary>
    [JsonPropertyName("attributes")]
    public List<string>? Attributes { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DjotContent"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DjotContent FromJson(string json)
    {
        DjotContent? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DjotContent>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DjotContent from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DjotContent from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// An image element in a Djot document.
/// </summary>
public sealed record DjotImage
{
    /// <summary>Image source URL or path.</summary>
    [JsonPropertyName("src")]
    public required string Src { get; init; }

    /// <summary>Alternative text.</summary>
    [JsonPropertyName("alt")]
    public required string Alt { get; init; }

    /// <summary>Optional title.</summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>Element attributes.</summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DjotImage"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DjotImage FromJson(string json)
    {
        DjotImage? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DjotImage>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DjotImage from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DjotImage from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A link element in a Djot document.
/// </summary>
public sealed record DjotLink
{
    /// <summary>Link URL.</summary>
    [JsonPropertyName("url")]
    public required string Url { get; init; }

    /// <summary>Text content of the link.</summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>Optional title.</summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>Element attributes.</summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DjotLink"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DjotLink FromJson(string json)
    {
        DjotLink? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DjotLink>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DjotLink from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DjotLink from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,124 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One node of the document tree.
///
/// Every node carries a deterministic <c>id</c>, typed <c>content</c>, optional
/// <c>parent</c>/<c>children</c> links that form the tree, and metadata such as
/// page number, bounding box and content layer.
/// </summary>
public sealed record DocumentNode
{
    /// <summary>Deterministic identifier (hash of content + position).</summary>
    [JsonPropertyName("id")]
    public required string Id { get; init; }

    /// <summary>Node content: a tagged enum holding type-specific data only.</summary>
    [JsonPropertyName("content")]
    public required NodeContent Content { get; init; }

    /// <summary>Index of the parent node (<c>null</c> = root-level node).</summary>
    [JsonPropertyName("parent")]
    public uint? Parent { get; init; }

    /// <summary>Indices of the child nodes, in reading order.</summary>
    [JsonPropertyName("children")]
    public List<uint> Children { get; init; } = new();

    /// <summary>Content layer classification.</summary>
    [JsonPropertyName("content_layer")]
    public ContentLayer? ContentLayer { get; init; }

    /// <summary>Page number where this node starts (1-indexed).</summary>
    [JsonPropertyName("page")]
    public uint? Page { get; init; }

    /// <summary>Page number where this node ends (for multi-page tables/sections).</summary>
    [JsonPropertyName("page_end")]
    public uint? PageEnd { get; init; }

    /// <summary>Bounding box in document coordinates.</summary>
    [JsonPropertyName("bbox")]
    public BoundingBox? Bbox { get; init; }

    /// <summary>
    /// Inline annotations (formatting, links) on this node's text content.
    ///
    /// Only meaningful for text-carrying nodes; empty for containers.
    /// </summary>
    [JsonPropertyName("annotations")]
    public List<TextAnnotation> Annotations { get; init; } = new();

    /// <summary>
    /// Format-specific key-value attributes.
    ///
    /// An extensible bag for data that has no dedicated typed field: CSS classes,
    /// LaTeX environment names, Excel cell formulas, slide layout names, etc.
    /// </summary>
    [JsonPropertyName("attributes")]
    public Dictionary<string, string>? Attributes { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DocumentNode"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentNode FromJson(string json)
    {
        DocumentNode? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DocumentNode>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DocumentNode from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DocumentNode from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A resolved relationship between two nodes of the document tree.
/// </summary>
public sealed record DocumentRelationship
{
    /// <summary>Index of the source node (the referencing node).</summary>
    [JsonPropertyName("source")]
    public uint Source { get; init; }

    /// <summary>Index of the target node (the referenced node).</summary>
    [JsonPropertyName("target")]
    public uint Target { get; init; }

    /// <summary>Semantic kind of the relationship.</summary>
    [JsonPropertyName("kind")]
    public required RelationshipKind Kind { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DocumentRelationship"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentRelationship FromJson(string json)
    {
        DocumentRelationship? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DocumentRelationship>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DocumentRelationship from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DocumentRelationship from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,110 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single tracked change embedded in a document.
///
/// Filled in by per-format extractors that understand change-tracking metadata
/// (DOCX <c>w:ins</c>/<c>w:del</c>/<c>w:rPrChange</c>, ODT <c>text:change-*</c>, ...).
/// Every extractor leaves <c>ExtractionResult.revisions</c> unset until a
/// format-specific implementation is added.
/// </summary>
public sealed record DocumentRevision
{
    /// <summary>
    /// Format-specific revision identifier.
    ///
    /// For DOCX this is the value of the <c>w:id</c> attribute on the change element
    /// (e.g. <c>"42"</c>). When the attribute is absent a synthetic fallback is
    /// generated (<c>"docx-ins-0"</c>, <c>"docx-del-3"</c>, ...).
    /// </summary>
    [JsonPropertyName("revision_id")]
    public required string RevisionId { get; init; }

    /// <summary>Display name of the author who made this change, when available.</summary>
    [JsonPropertyName("author")]
    public string? Author { get; init; }

    /// <summary>
    /// ISO-8601 timestamp of the change, when available.
    ///
    /// Kept as a plain string so the type stays FFI-friendly and is available
    /// without the optional <c>chrono</c> dependency. DOCX fills this from the
    /// <c>w:date</c> attribute (e.g. <c>"2024-03-15T10:30:00Z"</c>).
    /// </summary>
    [JsonPropertyName("timestamp")]
    public string? Timestamp { get; init; }

    /// <summary>Semantic kind of this revision.</summary>
    [JsonPropertyName("kind")]
    public required RevisionKind Kind { get; init; }

    /// <summary>
    /// Best-effort document location for this revision.
    ///
    /// Resolution is format-dependent and may be <c>null</c> when the location
    /// cannot be determined (e.g. changes inside table cells before table-cell
    /// anchor support is added).
    /// </summary>
    [JsonPropertyName("anchor")]
    public RevisionAnchor? Anchor { get; init; }

    /// <summary>The content changes that make up this revision.</summary>
    [JsonPropertyName("delta")]
    public required RevisionDelta Delta { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DocumentRevision"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentRevision FromJson(string json)
    {
        DocumentRevision? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DocumentRevision>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DocumentRevision from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DocumentRevision from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,112 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Top-level structured document representation.
///
/// A flat array of nodes with index-based parent/child references forming a tree.
/// Root-level nodes have no parent. Use <c>body_roots()</c> and <c>furniture_roots()</c>
/// to iterate over top-level content by layer.
///
/// # Validation
///
/// Call <c>validate()</c> after construction to verify all node indices are in bounds
/// and parent-child relationships are bidirectionally consistent.
/// </summary>
public sealed record DocumentStructure
{
    /// <summary>
    /// All nodes in document/reading order.
    /// </summary>
    [JsonPropertyName("nodes")]
    public List<DocumentNode> Nodes { get; init; } = [];

    /// <summary>
    /// Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
    ///
    /// Allows renderers to apply format-aware heuristics when converting
    /// the document tree to output formats.
    /// </summary>
    [JsonPropertyName("source_format")]
    public string? SourceFormat { get; init; } = null;

    /// <summary>
    /// Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
    ///
    /// Populated during derivation from the internal document representation.
    /// Empty when no relationships are detected.
    /// </summary>
    [JsonPropertyName("relationships")]
    public List<DocumentRelationship> Relationships { get; init; } = [];

    /// <summary>
    /// Sorted, deduplicated list of node type names present in this document.
    ///
    /// Each value is the snake_case <c>node_type</c> tag of the corresponding
    /// <c>NodeContent</c> variant (e.g. <c>"paragraph"</c>, <c>"heading"</c>, <c>"table"</c>, ...).
    ///
    /// Computed from <c>nodes</c> via <c>DocumentStructure.finalize_node_types</c>.
    /// Empty until that method is called (internal construction paths call it
    /// at the end of derivation).
    /// </summary>
    [JsonPropertyName("node_types")]
    public List<string> NodeTypes { get; init; } = [];

    /// <summary>
    /// Parse a <see cref="DocumentStructure"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the document structure.</param>
    /// <returns>The deserialized structure; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentStructure FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<DocumentStructure>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse DocumentStructure from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's exception type (the null case above): pass through unchanged.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse DocumentStructure from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Creates a <see cref="DocumentStructure"/> populated with the defaults reported by
    /// the native library: the native default object is rendered to JSON and deserialized.
    /// </summary>
    /// <returns>The default structure; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the native layer yields no JSON payload.</exception>
    public static DocumentStructure Default()
    {
        string? json;
        var nativeResult = NativeMethods.DocumentStructureDefault();
        try
        {
            var jsonPtr = NativeMethods.DocumentStructureToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native object on every path, including failures above.
            NativeMethods.DocumentStructureFree(nativeResult);
        }

        // A missing string or a JSON "null" makes Deserialize return null; surface that
        // as an error instead of handing callers a null typed as non-nullable.
        return JsonSerializer.Deserialize<DocumentStructure>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to load default DocumentStructure: native library returned no JSON");
    }
}

View File

@@ -0,0 +1,154 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Application properties read from docProps/app.xml of a DOCX file.
///
/// Holds Word-specific document statistics and metadata.
/// </summary>
public sealed record DocxAppProperties
{
    /// <summary>Application name (e.g., "Microsoft Office Word").</summary>
    [JsonPropertyName("application")]
    public string? Application { get; init; }

    /// <summary>Application version.</summary>
    [JsonPropertyName("app_version")]
    public string? AppVersion { get; init; }

    /// <summary>Template filename.</summary>
    [JsonPropertyName("template")]
    public string? Template { get; init; }

    /// <summary>Total editing time in minutes.</summary>
    [JsonPropertyName("total_time")]
    public int? TotalTime { get; init; }

    /// <summary>Number of pages.</summary>
    [JsonPropertyName("pages")]
    public int? Pages { get; init; }

    /// <summary>Number of words.</summary>
    [JsonPropertyName("words")]
    public int? Words { get; init; }

    /// <summary>Number of characters (excluding spaces).</summary>
    [JsonPropertyName("characters")]
    public int? Characters { get; init; }

    /// <summary>Number of characters (including spaces).</summary>
    [JsonPropertyName("characters_with_spaces")]
    public int? CharactersWithSpaces { get; init; }

    /// <summary>Number of lines.</summary>
    [JsonPropertyName("lines")]
    public int? Lines { get; init; }

    /// <summary>Number of paragraphs.</summary>
    [JsonPropertyName("paragraphs")]
    public int? Paragraphs { get; init; }

    /// <summary>Company name.</summary>
    [JsonPropertyName("company")]
    public string? Company { get; init; }

    /// <summary>Document security level.</summary>
    [JsonPropertyName("doc_security")]
    public int? DocSecurity { get; init; }

    /// <summary>Scale crop flag.</summary>
    [JsonPropertyName("scale_crop")]
    public bool? ScaleCrop { get; init; }

    /// <summary>Links up to date flag.</summary>
    [JsonPropertyName("links_up_to_date")]
    public bool? LinksUpToDate { get; init; }

    /// <summary>Shared document flag.</summary>
    [JsonPropertyName("shared_doc")]
    public bool? SharedDoc { get; init; }

    /// <summary>Hyperlinks changed flag.</summary>
    [JsonPropertyName("hyperlinks_changed")]
    public bool? HyperlinksChanged { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DocxAppProperties"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocxAppProperties FromJson(string json)
    {
        DocxAppProperties? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DocxAppProperties>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DocxAppProperties from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DocxAppProperties from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata of a Word document.
///
/// Extracted from DOCX files through the shared Office Open XML metadata extraction.
/// Integrates with the <c>office_metadata</c> module for core/app/custom properties.
/// </summary>
public sealed record DocxMetadata
{
    /// <summary>
    /// Core properties from docProps/core.xml (Dublin Core metadata).
    ///
    /// Holds title, creator, subject, keywords, dates, etc.
    /// The format is shared across DOCX/PPTX/XLSX documents.
    /// </summary>
    [JsonPropertyName("core_properties")]
    public CoreProperties? CoreProperties { get; init; }

    /// <summary>
    /// Application properties from docProps/app.xml (Word-specific statistics).
    ///
    /// Holds word count, page count, paragraph count, editing time, etc.
    /// This is the DOCX-specific variant of the Office application properties.
    /// </summary>
    [JsonPropertyName("app_properties")]
    public DocxAppProperties? AppProperties { get; init; }

    /// <summary>
    /// Custom properties from docProps/custom.xml (user-defined properties).
    ///
    /// Holds key-value pairs defined by users or applications.
    /// Values can be strings, numbers, booleans, or dates.
    /// </summary>
    [JsonPropertyName("custom_properties")]
    public Dictionary<string, string>? CustomProperties { get; init; }

    /// <summary>
    /// Deserializes a <see cref="DocxMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocxMetadata FromJson(string json)
    {
        DocxMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DocxMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure in the library's own exception type.
            throw new KreuzbergException($"Failed to parse DocxMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse DocxMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used when reading JSON; enums are mapped to snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Nulls are dropped
    /// (a null written for a required Rust field would not deserialize), while explicit
    /// false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

83
packages/csharp/src/Kreuzberg/Element.cs generated Normal file
View File

@@ -0,0 +1,83 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A semantic element extracted from a document.
///
/// Each element is one logical unit of content carrying a semantic classification,
/// a unique identifier, and metadata for tracking its origin and position.
/// </summary>
public sealed record Element
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Unique identifier of the element.
    /// </summary>
    [JsonPropertyName("element_id")]
    public required string ElementId { get; init; }

    /// <summary>
    /// Semantic type assigned to the element.
    /// </summary>
    [JsonPropertyName("element_type")]
    public required ElementType ElementType { get; init; }

    /// <summary>
    /// The element's text content.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// Metadata describing the element.
    /// </summary>
    [JsonPropertyName("metadata")]
    public required ElementMetadata Metadata { get; init; }

    /// <summary>
    /// Deserialise an <see cref="Element"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static Element FromJson(string json)
    {
        Element? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<Element>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse Element from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse Element from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata for a semantic element.
/// </summary>
public sealed record ElementMetadata
{
    /// <summary>
    /// Page number (1-indexed)
    /// </summary>
    [JsonPropertyName("page_number")]
    public uint? PageNumber { get; init; } = null;

    /// <summary>
    /// Source filename or document name
    /// </summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; } = null;

    /// <summary>
    /// Bounding box coordinates if available
    /// </summary>
    [JsonPropertyName("coordinates")]
    public BoundingBox? Coordinates { get; init; } = null;

    /// <summary>
    /// Position index in the element sequence
    /// </summary>
    [JsonPropertyName("element_index")]
    public ulong? ElementIndex { get; init; } = null;

    /// <summary>
    /// Additional custom metadata. Defaults to an empty dictionary, so the property
    /// is not null when the key is absent from the JSON or the record is built
    /// without setting it.
    /// </summary>
    [JsonPropertyName("additional")]
    public Dictionary<string, string> Additional { get; init; } = new();

    /// <summary>
    /// Parse a <see cref="ElementMetadata"/> from a JSON string.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static ElementMetadata FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ElementMetadata>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse ElementMetadata from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's exception type (e.g. the null case above): pass through.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ElementMetadata from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,121 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Classification of a semantic element.
///
/// Sorts text content into semantic units for downstream processing and
/// covers the element types commonly found in Unstructured documents.
/// </summary>
[JsonConverter(typeof(ElementTypeJsonConverter))]
public enum ElementType
{
    /// <summary>
    /// Title of the document
    /// </summary>
    [JsonPropertyName("title")]
    Title,

    /// <summary>
    /// Body of main narrative text
    /// </summary>
    [JsonPropertyName("narrative_text")]
    NarrativeText,

    /// <summary>
    /// Heading of a section
    /// </summary>
    [JsonPropertyName("heading")]
    Heading,

    /// <summary>
    /// Item of a list (bullet, numbered, etc.)
    /// </summary>
    [JsonPropertyName("list_item")]
    ListItem,

    /// <summary>
    /// Table
    /// </summary>
    [JsonPropertyName("table")]
    Table,

    /// <summary>
    /// Image
    /// </summary>
    [JsonPropertyName("image")]
    Image,

    /// <summary>
    /// Marker for a page break
    /// </summary>
    [JsonPropertyName("page_break")]
    PageBreak,

    /// <summary>
    /// Block of code
    /// </summary>
    [JsonPropertyName("code_block")]
    CodeBlock,

    /// <summary>
    /// Quoted block
    /// </summary>
    [JsonPropertyName("block_quote")]
    BlockQuote,

    /// <summary>
    /// Text of a footer
    /// </summary>
    [JsonPropertyName("footer")]
    Footer,

    /// <summary>
    /// Text of a header
    /// </summary>
    [JsonPropertyName("header")]
    Header,
}

/// <summary>
/// JSON converter for <see cref="ElementType"/> that maps each variant to its explicit
/// snake_case wire name and back.
/// </summary>
internal sealed class ElementTypeJsonConverter : JsonConverter<ElementType>
{
    /// <summary>Reads the current token as a string and maps it to the matching variant.</summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override ElementType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "title":
                return ElementType.Title;
            case "narrative_text":
                return ElementType.NarrativeText;
            case "heading":
                return ElementType.Heading;
            case "list_item":
                return ElementType.ListItem;
            case "table":
                return ElementType.Table;
            case "image":
                return ElementType.Image;
            case "page_break":
                return ElementType.PageBreak;
            case "code_block":
                return ElementType.CodeBlock;
            case "block_quote":
                return ElementType.BlockQuote;
            case "footer":
                return ElementType.Footer;
            case "header":
                return ElementType.Header;
            default:
                throw new JsonException($"Unknown ElementType value: {wireName}");
        }
    }

    /// <summary>Writes the variant's wire name as a JSON string.</summary>
    /// <exception cref="JsonException">When the value is not a declared variant.</exception>
    public override void Write(Utf8JsonWriter writer, ElementType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ElementType.Title:
                wireName = "title";
                break;
            case ElementType.NarrativeText:
                wireName = "narrative_text";
                break;
            case ElementType.Heading:
                wireName = "heading";
                break;
            case ElementType.ListItem:
                wireName = "list_item";
                break;
            case ElementType.Table:
                wireName = "table";
                break;
            case ElementType.Image:
                wireName = "image";
                break;
            case ElementType.PageBreak:
                wireName = "page_break";
                break;
            case ElementType.CodeBlock:
                wireName = "code_block";
                break;
            case ElementType.BlockQuote:
                wireName = "block_quote";
                break;
            case ElementType.Footer:
                wireName = "footer";
                break;
            case ElementType.Header:
                wireName = "header";
                break;
            default:
                throw new JsonException($"Unknown ElementType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,95 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Representation of an email attachment.
///
/// Holds the attachment's metadata and, optionally, its content.
/// </summary>
public sealed record EmailAttachment
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Name of the attachment (from the Content-Disposition header)
    /// </summary>
    [JsonPropertyName("name")]
    public string? Name { get; init; }

    /// <summary>
    /// The attachment's filename
    /// </summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; }

    /// <summary>
    /// The attachment's MIME type
    /// </summary>
    [JsonPropertyName("mime_type")]
    public string? MimeType { get; init; }

    /// <summary>
    /// Size, in bytes
    /// </summary>
    [JsonPropertyName("size")]
    public ulong? Size { get; init; }

    /// <summary>
    /// True when the attachment is an image
    /// </summary>
    [JsonPropertyName("is_image")]
    public bool IsImage { get; init; }

    // NOTE(review): EmbeddedFile.Data in this codebase is annotated with
    // ByteArrayToIntArrayConverter, whereas this property relies on System.Text.Json's
    // default byte[] handling (a base64 string). Confirm which wire format the native
    // side emits for this field.
    /// <summary>
    /// The attachment's data (if extracted).
    /// Uses `bytes.Bytes` for cheap cloning of large buffers.
    /// </summary>
    [JsonPropertyName("data")]
    public byte[]? Data { get; init; }

    /// <summary>
    /// Deserialise an <see cref="EmailAttachment"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmailAttachment FromJson(string json)
    {
        EmailAttachment? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmailAttachment>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse EmailAttachment from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse EmailAttachment from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Settings controlling email extraction.
/// </summary>
public sealed record EmailConfig
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Windows codepage number applied when an MSG file carries no codepage property.
    /// The default is `None`, which falls back to windows-1252.
    ///
    /// Supplying an unrecognized or invalid codepage number (0 included) silently
    /// falls back to windows-1252, exactly as happens when the MSG file itself
    /// contains an unrecognized codepage. No error or warning is emitted, so
    /// output should be verified when unusual values are supplied.
    ///
    /// Common values:
    /// - 1250: Central European (Polish, Czech, Hungarian, etc.)
    /// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
    /// - 1252: Western European (default)
    /// - 1253: Greek
    /// - 1254: Turkish
    /// - 1255: Hebrew
    /// - 1256: Arabic
    /// - 932: Japanese (Shift-JIS)
    /// - 936: Simplified Chinese (GBK)
    /// </summary>
    [JsonPropertyName("msg_fallback_codepage")]
    public uint? MsgFallbackCodepage { get; init; }

    /// <summary>
    /// Deserialise an <see cref="EmailConfig"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmailConfig FromJson(string json)
    {
        EmailConfig? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmailConfig>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse EmailConfig from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse EmailConfig from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,131 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Email extraction result.
///
/// Complete representation of an extracted email message (.eml or .msg)
/// including headers, body content, and attachments.
/// </summary>
public sealed record EmailExtractionResult
{
    /// <summary>
    /// Email subject line
    /// </summary>
    [JsonPropertyName("subject")]
    public string? Subject { get; init; } = null;

    /// <summary>
    /// Sender email address
    /// </summary>
    [JsonPropertyName("from_email")]
    public string? FromEmail { get; init; } = null;

    /// <summary>
    /// Primary recipient email addresses
    /// </summary>
    [JsonPropertyName("to_emails")]
    public List<string> ToEmails { get; init; } = [];

    /// <summary>
    /// CC recipient email addresses
    /// </summary>
    [JsonPropertyName("cc_emails")]
    public List<string> CcEmails { get; init; } = [];

    /// <summary>
    /// BCC recipient email addresses
    /// </summary>
    [JsonPropertyName("bcc_emails")]
    public List<string> BccEmails { get; init; } = [];

    /// <summary>
    /// Email date/timestamp
    /// </summary>
    [JsonPropertyName("date")]
    public string? Date { get; init; } = null;

    /// <summary>
    /// Message-ID header value
    /// </summary>
    [JsonPropertyName("message_id")]
    public string? MessageId { get; init; } = null;

    /// <summary>
    /// Plain text version of the email body
    /// </summary>
    [JsonPropertyName("plain_text")]
    public string? PlainText { get; init; } = null;

    /// <summary>
    /// HTML version of the email body
    /// </summary>
    [JsonPropertyName("html_content")]
    public string? HtmlContent { get; init; } = null;

    /// <summary>
    /// Cleaned/processed text content. Aliased as `cleaned_text` for back-compat.
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// List of email attachments
    /// </summary>
    [JsonPropertyName("attachments")]
    public List<EmailAttachment> Attachments { get; init; } = [];

    /// <summary>
    /// Additional email headers and metadata. Defaults to an empty dictionary, like
    /// the list properties of this record, so the property is not null when the key
    /// is absent from the JSON or the record is built without setting it.
    /// </summary>
    [JsonPropertyName("metadata")]
    public Dictionary<string, string> Metadata { get; init; } = new();

    /// <summary>
    /// Parse a <see cref="EmailExtractionResult"/> from a JSON string.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmailExtractionResult FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<EmailExtractionResult>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse EmailExtractionResult from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's exception type (e.g. the null case above): pass through.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse EmailExtractionResult from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,100 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata extracted from .eml and .msg email files.
///
/// Covers sender/recipient information, the message ID, and the attachment list.
/// </summary>
public sealed record EmailMetadata
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Email address of the sender
    /// </summary>
    [JsonPropertyName("from_email")]
    public string? FromEmail { get; init; }

    /// <summary>
    /// Display name of the sender
    /// </summary>
    [JsonPropertyName("from_name")]
    public string? FromName { get; init; }

    /// <summary>
    /// Primary recipients
    /// </summary>
    [JsonPropertyName("to_emails")]
    public List<string> ToEmails { get; init; } = new();

    /// <summary>
    /// CC recipients
    /// </summary>
    [JsonPropertyName("cc_emails")]
    public List<string> CcEmails { get; init; } = new();

    /// <summary>
    /// BCC recipients
    /// </summary>
    [JsonPropertyName("bcc_emails")]
    public List<string> BccEmails { get; init; } = new();

    /// <summary>
    /// Value of the Message-ID header
    /// </summary>
    [JsonPropertyName("message_id")]
    public string? MessageId { get; init; }

    /// <summary>
    /// Filenames of the attachments
    /// </summary>
    [JsonPropertyName("attachments")]
    public List<string> Attachments { get; init; } = new();

    /// <summary>
    /// Deserialise an <see cref="EmailMetadata"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmailMetadata FromJson(string json)
    {
        EmailMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmailMetadata>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse EmailMetadata from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse EmailMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,76 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Differences in embedded archive children between two results.
/// </summary>
public sealed record EmbeddedChanges
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Children found in `b` but absent from `a` (matched by `path`).
    /// </summary>
    [JsonPropertyName("added")]
    public List<ArchiveEntry> Added { get; init; } = new();

    /// <summary>
    /// Children found in `a` but absent from `b` (matched by `path`).
    /// </summary>
    [JsonPropertyName("removed")]
    public List<ArchiveEntry> Removed { get; init; } = new();

    /// <summary>
    /// Children found in both results whose content differs (matched by `path`).
    ///
    /// Every entry carries the diff of the nested `ExtractionResult`.
    /// </summary>
    [JsonPropertyName("changed")]
    public List<EmbeddedDiff> Changed { get; init; } = new();

    /// <summary>
    /// Deserialise an <see cref="EmbeddedChanges"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmbeddedChanges FromJson(string json)
    {
        EmbeddedChanges? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddedChanges>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse EmbeddedChanges from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse EmbeddedChanges from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Diff of one embedded archive entry that is present in both results.
/// </summary>
public sealed record EmbeddedDiff
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Archive-relative path that identifies the entry.
    /// </summary>
    [JsonPropertyName("path")]
    public required string Path { get; init; }

    /// <summary>
    /// Recursive diff of the entry's extraction result.
    /// </summary>
    [JsonPropertyName("diff")]
    public required ExtractionDiff Diff { get; init; }

    /// <summary>
    /// Deserialise an <see cref="EmbeddedDiff"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmbeddedDiff FromJson(string json)
    {
        EmbeddedDiff? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddedDiff>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse EmbeddedDiff from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse EmbeddedDiff from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Descriptor of an embedded file extracted from the PDF name tree.
/// </summary>
public sealed record EmbeddedFile
{
    /// <summary>Options used by <see cref="FromJson"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI: nulls are dropped
    /// (a null from a nullable C# field would override a required Rust field with a
    /// non-deserialisable null) while explicit false/0 values are kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        BuildOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Filename exactly as stored in the PDF name tree.
    /// </summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>
    /// Raw bytes of the file from the embedded stream (already decompressed by lopdf).
    /// </summary>
    [JsonConverter(typeof(ByteArrayToIntArrayConverter))]
    [JsonPropertyName("data")]
    public byte[] Data { get; init; } = Array.Empty<byte>();

    /// <summary>
    /// Byte count of the original stream while still compressed (before decompression).
    ///
    /// Callers use it to compute the decompression ratio and to detect zip-bomb-style
    /// attacks that embed a tiny compressed stream expanding to gigabytes of data.
    /// </summary>
    [JsonPropertyName("compressed_size")]
    public ulong CompressedSize { get; init; }

    /// <summary>
    /// MIME type when the filespec specifies one, otherwise `None`.
    /// </summary>
    [JsonPropertyName("mime_type")]
    public string? MimeType { get; init; }

    /// <summary>
    /// Deserialise an <see cref="EmbeddedFile"/> from JSON text.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmbeddedFile FromJson(string json)
    {
        EmbeddedFile? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddedFile>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; a KreuzbergException propagates untouched.
            throw new KreuzbergException($"Failed to parse EmbeddedFile from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse EmbeddedFile from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions BuildOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,127 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Embedding configuration for text chunks.
///
/// Configures embedding generation using ONNX models via the vendored embedding engine.
/// Requires the `embeddings` feature to be enabled.
/// </summary>
public sealed record EmbeddingConfig
{
    /// <summary>
    /// The embedding model to use (defaults to "balanced" preset if not specified)
    /// </summary>
    [JsonPropertyName("model")]
    public EmbeddingModelType? Model { get; init; } = null;

    /// <summary>
    /// Whether to normalize embedding vectors (recommended for cosine similarity)
    /// </summary>
    [JsonPropertyName("normalize")]
    public bool Normalize { get; init; } = true;

    /// <summary>
    /// Batch size for embedding generation
    /// </summary>
    [JsonPropertyName("batch_size")]
    public ulong BatchSize { get; init; } = 32;

    /// <summary>
    /// Show model download progress
    /// </summary>
    [JsonPropertyName("show_download_progress")]
    public bool ShowDownloadProgress { get; init; } = false;

    /// <summary>
    /// Custom cache directory for model files
    ///
    /// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
    /// Allows full customization of model download location.
    /// </summary>
    [JsonPropertyName("cache_dir")]
    public string? CacheDir { get; init; } = null;

    /// <summary>
    /// Hardware acceleration for the embedding ONNX model.
    ///
    /// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
    /// is used for inference. Defaults to `None` (auto-select per platform).
    /// </summary>
    [JsonPropertyName("acceleration")]
    public AccelerationConfig? Acceleration { get; init; } = null;

    /// <summary>
    /// Maximum wall-clock duration (in seconds) for a single `embed()` call when
    /// using `EmbeddingModelType.Plugin`.
    ///
    /// Applies only to the in-process plugin path — protects against hung
    /// host-language backends (e.g. a Python callback deadlocked on the GIL,
    /// a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
    /// returns `Plugin` instead of blocking forever.
    ///
    /// `None` disables the timeout. The default (60 seconds) is conservative
    /// for common in-process inference; increase for large batches on slow
    /// hardware.
    /// </summary>
    [JsonPropertyName("max_embed_duration_secs")]
    public ulong? MaxEmbedDurationSecs { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="EmbeddingConfig"/> from a JSON string.
    /// </summary>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised or yields null.</exception>
    public static EmbeddingConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<EmbeddingConfig>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse EmbeddingConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's exception type (e.g. the null case above): pass through.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse EmbeddingConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/> and <see cref="Default"/>; enums are handled as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build the default configuration: obtain a default config handle through
    /// <c>NativeMethods</c>, render it to JSON, and deserialise that JSON.
    /// </summary>
    /// <exception cref="KreuzbergException">When no JSON is produced or it deserialises to null.</exception>
    public static EmbeddingConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.EmbeddingConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.EmbeddingConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config handle on every path.
            NativeMethods.EmbeddingConfigFree(nativeResult);
        }

        // A null pointer or a JSON `null` deserialises to null; surface that as an
        // error instead of returning null from a non-nullable method.
        return JsonSerializer.Deserialize<EmbeddingConfig>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to build default EmbeddingConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception subtype of <see cref="KreuzbergErrorException"/>; judging by its name it
/// represents embedding-related failures (the throwing sites are not visible in this file).
/// </summary>
public class EmbeddingException : KreuzbergErrorException
{
    /// <summary>Creates the exception with an error message.</summary>
    public EmbeddingException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with an error message and the exception that caused it.</summary>
    public EmbeddingException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,185 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Embedding model types supported by Kreuzberg.
/// </summary>
/// <remarks>
/// A closed set of nested variant records; JSON conversion is delegated to
/// <see cref="EmbeddingModelTypeJsonConverter"/>.
/// </remarks>
[JsonConverter(typeof(EmbeddingModelTypeJsonConverter))]
public abstract record EmbeddingModelType
{
    /// <summary>
    /// Use a preset model configuration (recommended).
    /// </summary>
    /// <param name="Name">Preset name; JSON property <c>name</c>.</param>
    public sealed record Preset([property: JsonPropertyName("name")] string Name) : EmbeddingModelType;

    /// <summary>
    /// Use a custom ONNX model from HuggingFace.
    /// </summary>
    /// <param name="ModelId">Model identifier; JSON property <c>model_id</c>.</param>
    /// <param name="Dimensions">Embedding dimensions; JSON property <c>dimensions</c>.</param>
    public sealed record Custom(
        [property: JsonPropertyName("model_id")] string ModelId,
        [property: JsonPropertyName("dimensions")] ulong Dimensions) : EmbeddingModelType;

    /// <summary>
    /// Provider-hosted embedding model via liter-llm.
    ///
    /// Uses the model specified in the nested <c>LlmConfig</c> (e.g.,
    /// <c>"openai/text-embedding-3-small"</c>).
    /// </summary>
    /// <param name="Value">Nested LLM configuration; JSON property <c>llm</c>.</param>
    public sealed record Llm([property: JsonPropertyName("llm")] LlmConfig Value) : EmbeddingModelType;

    /// <summary>
    /// In-process embedding backend registered via the plugin system.
    ///
    /// The caller registers an <c>EmbeddingBackend</c> (<c>crate.plugins.EmbeddingBackend</c>)
    /// once (e.g. a wrapper around an already-loaded <c>llama-cpp-python</c>,
    /// <c>sentence-transformers</c>, or tuned ONNX model), then references it by name
    /// in config. Kreuzberg calls back into the registered backend during chunking
    /// and standalone embed requests: no HuggingFace download, no ONNX Runtime
    /// requirement, no HTTP sidecar.
    ///
    /// With this variant only two <c>EmbeddingConfig</c> fields apply:
    /// <c>normalize</c> (post-call L2 normalization) and <c>max_embed_duration_secs</c>
    /// (dispatcher timeout). The model-loading fields (<c>batch_size</c>,
    /// <c>cache_dir</c>, <c>show_download_progress</c>, <c>acceleration</c>) are
    /// ignored because the host owns the model lifecycle.
    ///
    /// Semantic chunking falls back to <c>ChunkingConfig.max_characters</c> for this
    /// variant, since there is no preset from which to look up a chunk-size ceiling;
    /// size the context window via <c>max_characters</c> directly.
    ///
    /// See <c>register_embedding_backend</c>.
    /// </summary>
    /// <param name="Name">Name of the registered backend; JSON property <c>name</c>.</param>
    public sealed record Plugin([property: JsonPropertyName("name")] string Name) : EmbeddingModelType;
}
/// <summary>
/// Custom converter for the <see cref="EmbeddingModelType"/> union with flattened variant fields.
/// </summary>
/// <remarks>
/// The JSON form is a single object holding a string discriminator named <c>type</c>
/// (<c>preset</c>, <c>custom</c>, <c>llm</c> or <c>plugin</c>) next to the variant's
/// own properties, e.g. <c>{"type":"preset","name":"..."}</c>. System.Text.Json's
/// <c>[JsonPolymorphic]</c> cannot handle this layout, so the object is taken apart
/// and reassembled manually here.
/// </remarks>
public sealed class EmbeddingModelTypeJsonConverter : JsonConverter<EmbeddingModelType>
{
    /// <summary>
    /// Read a tagged JSON object and materialise the matching variant record.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the object.</param>
    /// <param name="typeToConvert">Unused; required by the base signature.</param>
    /// <param name="options">Options forwarded when deserialising the variant.</param>
    /// <returns>The variant selected by the <c>type</c> discriminator.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an object, the discriminator is missing, null, not a
    /// string or unknown, or the variant deserialises to null.
    /// </exception>
    public override EmbeddingModelType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty("type", out var tagElement))
        {
            throw new JsonException("Missing discriminator field: type");
        }

        if (tagElement.ValueKind == JsonValueKind.Null)
        {
            throw new JsonException("Discriminator field is null");
        }

        if (tagElement.ValueKind != JsonValueKind.String)
        {
            // Report a number/bool/object tag as a JSON error with an explicit message.
            throw new JsonException($"Discriminator field must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString();

        // Every variant record carries [JsonPropertyName] on its components
        // (including Llm, whose payload sits under "llm"), so the remaining
        // properties are passed through as-is once "type" has been removed.
        var variantJson = StripDiscriminator(root);

        return tagValue switch
        {
            "preset" => DeserializeVariant<EmbeddingModelType.Preset>(variantJson, options),
            "custom" => DeserializeVariant<EmbeddingModelType.Custom>(variantJson, options),
            "llm" => DeserializeVariant<EmbeddingModelType.Llm>(variantJson, options),
            "plugin" => DeserializeVariant<EmbeddingModelType.Plugin>(variantJson, options),
            _ => throw new JsonException($"Unknown EmbeddingModelType discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Write a variant as one object: the <c>type</c> discriminator followed by the
    /// variant's own properties at the same level.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Options forwarded when serialising the variant.</param>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, EmbeddingModelType value, JsonSerializerOptions options)
    {
        var tag = value switch
        {
            EmbeddingModelType.Preset => "preset",
            EmbeddingModelType.Custom => "custom",
            EmbeddingModelType.Llm => "llm",
            EmbeddingModelType.Plugin => "plugin",
            _ => throw new JsonException($"Unknown EmbeddingModelType variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString("type", tag);

        // Serialise through the concrete variant type, then copy its properties
        // next to the tag so the output is flat rather than nested.
        using var doc = JsonSerializer.SerializeToDocument(value, value.GetType(), options);
        if (doc.RootElement.ValueKind == JsonValueKind.Object)
        {
            foreach (var prop in doc.RootElement.EnumerateObject())
            {
                writer.WritePropertyName(prop.Name);
                prop.Value.WriteTo(writer);
            }
        }

        writer.WriteEndObject();
    }

    /// <summary>Copy every property of <paramref name="root"/> except <c>type</c> into a new UTF-8 JSON object.</summary>
    private static byte[] StripDiscriminator(JsonElement root)
    {
        using var buffer = new MemoryStream();
        using (var writer = new Utf8JsonWriter(buffer))
        {
            writer.WriteStartObject();
            foreach (var prop in root.EnumerateObject())
            {
                if (prop.Name != "type")
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }

            writer.WriteEndObject();
            writer.Flush();
        }

        return buffer.ToArray();
    }

    /// <summary>Deserialise the stripped payload as <typeparamref name="T"/>, rejecting a null result.</summary>
    private static T DeserializeVariant<T>(byte[] utf8Json, JsonSerializerOptions options)
        where T : EmbeddingModelType
    {
        return JsonSerializer.Deserialize<T>(utf8Json, options)
            ?? throw new JsonException("Failed to deserialize variant");
    }
}

View File

@@ -0,0 +1,95 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Preset configurations for common RAG use cases.
///
/// A preset bundles a chunk size, an overlap and an embedding model into one
/// ready-made configuration for a specific scenario.
///
/// All string fields are owned <c>String</c> values on the native side for FFI
/// compatibility, so instances are safe to clone and pass across language boundaries.
/// </summary>
public sealed record EmbeddingPreset
{
    /// <summary>Preset name (JSON <c>name</c>).</summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>Chunk size of the preset (JSON <c>chunk_size</c>).</summary>
    [JsonPropertyName("chunk_size")]
    public ulong ChunkSize { get; init; }

    /// <summary>Chunk overlap of the preset (JSON <c>overlap</c>).</summary>
    [JsonPropertyName("overlap")]
    public ulong Overlap { get; init; }

    /// <summary>
    /// HuggingFace repository name for the model.
    /// </summary>
    [JsonPropertyName("model_repo")]
    public required string ModelRepo { get; init; }

    /// <summary>
    /// Pooling strategy: "cls" or "mean".
    /// </summary>
    [JsonPropertyName("pooling")]
    public required string Pooling { get; init; }

    /// <summary>
    /// Path to the ONNX model file within the repo.
    /// </summary>
    [JsonPropertyName("model_file")]
    public required string ModelFile { get; init; }

    /// <summary>Embedding dimensions (JSON <c>dimensions</c>).</summary>
    [JsonPropertyName("dimensions")]
    public ulong Dimensions { get; init; }

    /// <summary>Description of the preset (JSON <c>description</c>).</summary>
    [JsonPropertyName("description")]
    public required string Description { get; init; }

    /// <summary>
    /// Parse a <see cref="EmbeddingPreset"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed preset; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static EmbeddingPreset FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<EmbeddingPreset>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse EmbeddingPreset from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse EmbeddingPreset from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// EPUB metadata (Dublin Core extensions).
/// </summary>
/// <remarks>Every member is optional and is <c>null</c> when not set.</remarks>
public sealed record EpubMetadata
{
    /// <summary>JSON property <c>coverage</c>.</summary>
    [JsonPropertyName("coverage")]
    public string? Coverage { get; init; }

    /// <summary>JSON property <c>dc_format</c>.</summary>
    [JsonPropertyName("dc_format")]
    public string? DcFormat { get; init; }

    /// <summary>JSON property <c>relation</c>.</summary>
    [JsonPropertyName("relation")]
    public string? Relation { get; init; }

    /// <summary>JSON property <c>source</c>.</summary>
    [JsonPropertyName("source")]
    public string? Source { get; init; }

    /// <summary>JSON property <c>dc_type</c>.</summary>
    [JsonPropertyName("dc_type")]
    public string? DcType { get; init; }

    /// <summary>JSON property <c>cover_image</c>.</summary>
    [JsonPropertyName("cover_image")]
    public string? CoverImage { get; init; }

    /// <summary>
    /// Parse a <see cref="EpubMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed metadata; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static EpubMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<EpubMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse EpubMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse EpubMetadata from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Error metadata (for batch operations).
/// </summary>
public sealed record ErrorMetadata
{
    /// <summary>Error type (JSON <c>error_type</c>).</summary>
    [JsonPropertyName("error_type")]
    public required string ErrorType { get; init; }

    /// <summary>Error message (JSON <c>message</c>).</summary>
    [JsonPropertyName("message")]
    public required string Message { get; init; }

    /// <summary>
    /// Parse a <see cref="ErrorMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed metadata; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ErrorMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ErrorMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ErrorMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse ErrorMetadata from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Excel/spreadsheet format metadata.
///
/// Marks the document as a spreadsheet source through the <c>FormatMetadata.Excel</c>
/// discriminant. The sheet count and the sheet names live inside this record.
/// </summary>
public sealed record ExcelMetadata
{
    /// <summary>
    /// Number of sheets in the workbook.
    /// </summary>
    [JsonPropertyName("sheet_count")]
    public uint? SheetCount { get; init; }

    /// <summary>
    /// Names of all sheets in the workbook.
    /// </summary>
    [JsonPropertyName("sheet_names")]
    public List<string>? SheetNames { get; init; }

    /// <summary>
    /// Parse a <see cref="ExcelMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed metadata; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExcelMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ExcelMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ExcelMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse ExcelMetadata from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,97 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Single Excel worksheet.
///
/// One sheet of an Excel workbook: its content converted to Markdown plus
/// dimensional statistics.
/// </summary>
public sealed record ExcelSheet
{
    /// <summary>
    /// Sheet name as it appears in Excel.
    /// </summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>
    /// Sheet content converted to Markdown tables.
    /// </summary>
    [JsonPropertyName("markdown")]
    public required string Markdown { get; init; }

    /// <summary>
    /// Number of rows.
    /// </summary>
    [JsonPropertyName("row_count")]
    public ulong RowCount { get; init; }

    /// <summary>
    /// Number of columns.
    /// </summary>
    [JsonPropertyName("col_count")]
    public ulong ColCount { get; init; }

    /// <summary>
    /// Total number of non-empty cells.
    /// </summary>
    [JsonPropertyName("cell_count")]
    public ulong CellCount { get; init; }

    /// <summary>
    /// Pre-extracted table cells (a 2D list of cell values), populated during
    /// Markdown generation so the Markdown does not have to be re-parsed.
    /// <c>null</c> for empty sheets.
    /// </summary>
    [JsonPropertyName("table_cells")]
    public List<List<string>>? TableCells { get; init; }

    /// <summary>
    /// Parse a <see cref="ExcelSheet"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed sheet; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExcelSheet FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ExcelSheet>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ExcelSheet from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse ExcelSheet from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Excel workbook representation.
///
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
/// extracted content and metadata.
/// </summary>
public sealed record ExcelWorkbook
{
    /// <summary>
    /// All sheets in the workbook. Empty when not set.
    /// </summary>
    [JsonPropertyName("sheets")]
    public List<ExcelSheet> Sheets { get; init; } = [];

    /// <summary>
    /// Workbook-level metadata (author, creation date, etc.). Empty when not set.
    /// </summary>
    /// <remarks>
    /// Initialised to an empty dictionary rather than <c>default!</c>: the property
    /// is declared non-nullable, so it must not be null when the JSON omits
    /// <c>metadata</c> or a caller does not assign it.
    /// </remarks>
    [JsonPropertyName("metadata")]
    public Dictionary<string, string> Metadata { get; init; } = new();

    /// <summary>
    /// Collaborative-edit revision headers from <c>xl/revisions/revisionHeaders.xml</c>.
    ///
    /// Populated for legacy shared-workbook <c>.xlsx</c> files that contain the
    /// <c>xl/revisions/</c> directory. Each <c>&lt;header&gt;</c> element maps to one
    /// <c>DocumentRevision { kind: FormatChange }</c> carrying the header's <c>guid</c>
    /// (→ <c>revision_id</c>), <c>userName</c> (→ <c>author</c>), and <c>dateTime</c>
    /// (→ <c>timestamp</c>). <c>anchor</c> and <c>delta</c> are <c>None</c>/empty for v1
    /// (per-cell log parsing is a follow-up). <c>null</c> when
    /// <c>xl/revisions/revisionHeaders.xml</c> is absent.
    /// </summary>
    [JsonPropertyName("revisions")]
    public List<DocumentRevision>? Revisions { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="ExcelWorkbook"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed workbook; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExcelWorkbook FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ExcelWorkbook>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse ExcelWorkbook from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the documented exception type; let it through untouched.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ExcelWorkbook from JSON: {e.Message}", e);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// ONNX Runtime execution provider type.
///
/// Selects the hardware backend used for model inference.
/// <c>Auto</c> (default) picks the best available provider per platform.
/// </summary>
[JsonConverter(typeof(ExecutionProviderTypeJsonConverter))]
public enum ExecutionProviderType
{
    /// <summary>
    /// Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere.
    /// </summary>
    [JsonPropertyName("auto")]
    Auto,

    /// <summary>
    /// CPU execution provider (always available).
    /// </summary>
    [JsonPropertyName("cpu")]
    Cpu,

    /// <summary>
    /// Apple CoreML (macOS/iOS Neural Engine + GPU).
    /// </summary>
    [JsonPropertyName("coreml")]
    CoreMl,

    /// <summary>
    /// NVIDIA CUDA GPU acceleration.
    /// </summary>
    [JsonPropertyName("cuda")]
    Cuda,

    /// <summary>
    /// NVIDIA TensorRT (optimized CUDA inference).
    /// </summary>
    [JsonPropertyName("tensorrt")]
    TensorRt,
}

/// <summary>
/// JSON converter for <see cref="ExecutionProviderType"/> that maps each member to
/// and from its explicit lowercase wire name (<c>auto</c>, <c>cpu</c>, <c>coreml</c>,
/// <c>cuda</c>, <c>tensorrt</c>).
/// </summary>
internal sealed class ExecutionProviderTypeJsonConverter : JsonConverter<ExecutionProviderType>
{
    /// <summary>Translate a wire name into the matching enum member.</summary>
    /// <exception cref="JsonException">When the string is not one of the known wire names.</exception>
    public override ExecutionProviderType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var wireName = reader.GetString();
        switch (wireName)
        {
            case "auto":
                return ExecutionProviderType.Auto;
            case "cpu":
                return ExecutionProviderType.Cpu;
            case "coreml":
                return ExecutionProviderType.CoreMl;
            case "cuda":
                return ExecutionProviderType.Cuda;
            case "tensorrt":
                return ExecutionProviderType.TensorRt;
            default:
                throw new JsonException($"Unknown ExecutionProviderType value: {wireName}");
        }
    }

    /// <summary>Write the enum member as its wire name.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ExecutionProviderType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ExecutionProviderType.Auto:
                wireName = "auto";
                break;
            case ExecutionProviderType.Cpu:
                wireName = "cpu";
                break;
            case ExecutionProviderType.CoreMl:
                wireName = "coreml";
                break;
            case ExecutionProviderType.Cuda:
                wireName = "cuda";
                break;
            case ExecutionProviderType.TensorRt:
                wireName = "tensorrt";
                break;
            default:
                throw new JsonException($"Unknown ExecutionProviderType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,166 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Extracted image from a document.
///
/// Carries the raw image bytes, descriptive metadata and, optionally, a nested
/// OCR result. Keeping the bytes raw makes the type usable across languages:
/// callers can convert to PIL.Image (Python), Sharp (Node.js), or other formats.
/// </summary>
public sealed record ExtractedImage
{
    /// <summary>
    /// Raw image data (PNG, JPEG, WebP, etc. bytes). Empty when not set.
    /// </summary>
    /// <remarks>
    /// (De)serialised through <c>ByteArrayToIntArrayConverter</c>.
    /// NOTE(review): presumably a JSON array of integers; confirm against that converter.
    /// </remarks>
    [JsonPropertyName("data")]
    [JsonConverter(typeof(ByteArrayToIntArrayConverter))]
    public byte[] Data { get; init; } = [];

    /// <summary>
    /// Image format (e.g., "jpeg", "png", "webp").
    /// </summary>
    [JsonPropertyName("format")]
    public required string Format { get; init; }

    /// <summary>
    /// Zero-indexed position of this image in the document/page.
    /// </summary>
    [JsonPropertyName("image_index")]
    public uint ImageIndex { get; init; }

    /// <summary>
    /// Page/slide number where the image was found (1-indexed).
    /// </summary>
    [JsonPropertyName("page_number")]
    public uint? PageNumber { get; init; }

    /// <summary>
    /// Image width in pixels.
    /// </summary>
    [JsonPropertyName("width")]
    public uint? Width { get; init; }

    /// <summary>
    /// Image height in pixels.
    /// </summary>
    [JsonPropertyName("height")]
    public uint? Height { get; init; }

    /// <summary>
    /// Colorspace information (e.g., "RGB", "CMYK", "Gray").
    /// </summary>
    [JsonPropertyName("colorspace")]
    public string? Colorspace { get; init; }

    /// <summary>
    /// Bits per color component (e.g., 8, 16).
    /// </summary>
    [JsonPropertyName("bits_per_component")]
    public uint? BitsPerComponent { get; init; }

    /// <summary>
    /// Whether this image is a mask image.
    /// </summary>
    [JsonPropertyName("is_mask")]
    public bool IsMask { get; init; }

    /// <summary>
    /// Optional description of the image.
    /// </summary>
    [JsonPropertyName("description")]
    public string? Description { get; init; }

    /// <summary>
    /// Nested OCR extraction result (if the image was OCRed).
    ///
    /// The result is embedded here rather than kept in a separate collection,
    /// which makes the image-to-OCR relationship explicit.
    /// </summary>
    [JsonPropertyName("ocr_result")]
    public ExtractionResult? OcrResult { get; init; }

    /// <summary>
    /// Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom,
    /// x1=right, y1=top). Only populated for PDF-extracted images when position
    /// data is available from the PDF extractor.
    /// </summary>
    [JsonPropertyName("bounding_box")]
    public BoundingBox? BoundingBox { get; init; }

    /// <summary>
    /// Original source path of the image within the document archive (e.g.,
    /// "media/image1.png" in DOCX). Used for rendering image references when the
    /// binary data is not extracted.
    /// </summary>
    [JsonPropertyName("source_path")]
    public string? SourcePath { get; init; }

    /// <summary>
    /// Heuristic classification of what this image likely depicts.
    /// <c>null</c> if classification was disabled or inconclusive.
    /// </summary>
    [JsonPropertyName("image_kind")]
    public ImageKind? ImageKind { get; init; }

    /// <summary>
    /// Confidence score for <c>image_kind</c>, in the range 0.0 to 1.0.
    /// </summary>
    [JsonPropertyName("kind_confidence")]
    public float? KindConfidence { get; init; }

    /// <summary>
    /// Identifier shared across images that form a single logical figure
    /// (e.g. all raster tiles of one technical drawing). <c>null</c> for singletons.
    /// </summary>
    [JsonPropertyName("cluster_id")]
    public uint? ClusterId { get; init; }

    /// <summary>
    /// Parse a <see cref="ExtractedImage"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed image; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExtractedImage FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ExtractedImage>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ExtractedImage from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse ExtractedImage from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A URI extracted from a document.
///
/// Covers any link, reference, or resource pointer found during extraction.
/// <c>kind</c> classifies the URI semantically; <c>label</c> carries optional
/// human-readable display text.
/// </summary>
public sealed record ExtractedUri
{
    /// <summary>
    /// The URL or path string.
    /// </summary>
    [JsonPropertyName("url")]
    public required string Url { get; init; }

    /// <summary>
    /// Optional display text / label for the link.
    /// </summary>
    [JsonPropertyName("label")]
    public string? Label { get; init; }

    /// <summary>
    /// Optional page number where the URI was found (1-indexed).
    /// </summary>
    [JsonPropertyName("page")]
    public uint? Page { get; init; }

    /// <summary>
    /// Semantic classification of the URI.
    /// </summary>
    [JsonPropertyName("kind")]
    public required UriKind Kind { get; init; }

    /// <summary>
    /// Parse a <see cref="ExtractedUri"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to deserialise.</param>
    /// <returns>The parsed URI record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExtractedUri FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ExtractedUri>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ExtractedUri from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap everything that is not already ours, keeping the cause attached.
            throw new KreuzbergException($"Failed to parse ExtractedUri from JSON: {ex.Message}", ex);
        }
    }

    // Options used by FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with nulls that cannot be deserialised), while explicit
    /// false/0 values are still written so the caller's intent is preserved.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,405 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Main extraction configuration.
///
/// This struct contains all configuration options for the extraction process.
/// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
/// </summary>
public sealed record ExtractionConfig
{
    /// <summary>
    /// Enable caching of extraction results
    /// </summary>
    [JsonPropertyName("use_cache")]
    public bool UseCache { get; init; } = true;

    /// <summary>
    /// Enable quality post-processing
    /// </summary>
    [JsonPropertyName("enable_quality_processing")]
    public bool EnableQualityProcessing { get; init; } = true;

    /// <summary>
    /// OCR configuration (null = OCR disabled)
    /// </summary>
    [JsonPropertyName("ocr")]
    public OcrConfig? Ocr { get; init; } = null;

    /// <summary>
    /// Force OCR even for searchable PDFs
    /// </summary>
    [JsonPropertyName("force_ocr")]
    public bool ForceOcr { get; init; } = false;

    /// <summary>
    /// Force OCR on specific pages only (1-indexed page numbers, must be &gt;= 1).
    ///
    /// When set, only the listed pages are OCR'd regardless of text layer quality.
    /// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
    /// Only applies to PDF documents. Duplicates are automatically deduplicated.
    /// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
    /// </summary>
    [JsonPropertyName("force_ocr_pages")]
    public List<uint>? ForceOcrPages { get; init; } = null;

    /// <summary>
    /// Disable OCR entirely, even for images.
    ///
    /// When `true`, OCR is skipped for all document types. Images return metadata
    /// only (dimensions, format, EXIF) without text extraction. PDFs use only
    /// native text extraction without OCR fallback.
    ///
    /// Cannot be `true` simultaneously with `force_ocr`.
    ///
    /// *Added in v4.7.0.*
    /// </summary>
    [JsonPropertyName("disable_ocr")]
    public bool DisableOcr { get; init; } = false;

    /// <summary>
    /// Text chunking configuration (null = chunking disabled)
    /// </summary>
    [JsonPropertyName("chunking")]
    public ChunkingConfig? Chunking { get; init; } = null;

    /// <summary>
    /// Content filtering configuration (null = use extractor defaults).
    ///
    /// Controls whether document "furniture" (headers, footers, watermarks,
    /// repeating text) is included in or stripped from extraction results.
    /// See `ContentFilterConfig` for per-field documentation.
    /// </summary>
    [JsonPropertyName("content_filter")]
    public ContentFilterConfig? ContentFilter { get; init; } = null;

    /// <summary>
    /// Image extraction configuration (null = no image extraction)
    /// </summary>
    [JsonPropertyName("images")]
    public ImageExtractionConfig? Images { get; init; } = null;

    /// <summary>
    /// PDF-specific options (null = use defaults)
    /// </summary>
    [JsonPropertyName("pdf_options")]
    public PdfConfig? PdfOptions { get; init; } = null;

    /// <summary>
    /// Token reduction configuration (null = no token reduction)
    /// </summary>
    [JsonPropertyName("token_reduction")]
    public TokenReductionOptions? TokenReduction { get; init; } = null;

    /// <summary>
    /// Language detection configuration (null = no language detection)
    /// </summary>
    [JsonPropertyName("language_detection")]
    public LanguageDetectionConfig? LanguageDetection { get; init; } = null;

    /// <summary>
    /// Page extraction configuration (null = no page tracking)
    /// </summary>
    [JsonPropertyName("pages")]
    public PageConfig? Pages { get; init; } = null;

    /// <summary>
    /// Keyword extraction configuration (null = no keyword extraction)
    /// </summary>
    [JsonPropertyName("keywords")]
    public KeywordConfig? Keywords { get; init; } = null;

    /// <summary>
    /// Post-processor configuration (null = use defaults)
    /// </summary>
    [JsonPropertyName("postprocessor")]
    public PostProcessorConfig? Postprocessor { get; init; } = null;

    /// <summary>
    /// HTML to Markdown conversion options (null = use defaults)
    ///
    /// Configure how HTML documents are converted to Markdown, including heading styles,
    /// list formatting, code block styles, and preprocessing options.
    /// </summary>
    [JsonPropertyName("html_options")]
    public string? HtmlOptions { get; init; } = null;

    /// <summary>
    /// Styled HTML output configuration.
    ///
    /// When set alongside `output_format = OutputFormat.Html`, the extraction
    /// pipeline uses `StyledHtmlRenderer` (`crate.rendering.StyledHtmlRenderer`)
    /// which emits stable `kb-*` CSS class hooks on every structural element
    /// and optionally embeds theme CSS or user-supplied CSS in a `&lt;style&gt;` block.
    ///
    /// When `None`, the existing plain comrak-based HTML renderer is used.
    /// </summary>
    [JsonPropertyName("html_output")]
    public HtmlOutputConfig? HtmlOutput { get; init; } = null;

    /// <summary>
    /// Default per-file timeout in seconds for batch extraction.
    ///
    /// When set, each file in a batch will be canceled after this duration
    /// unless overridden by `FileExtractionConfig.timeout_secs`.
    ///
    /// Defaults to `Some(60)` to prevent pathological files (e.g. deeply
    /// nested archives, documents with millions of cells) from running
    /// indefinitely and exhausting caller resources. Set to `None` to
    /// disable the timeout for trusted input or long-running workloads.
    /// </summary>
    [JsonPropertyName("extraction_timeout_secs")]
    public ulong? ExtractionTimeoutSecs { get; init; } = null;

    /// <summary>
    /// Maximum concurrent extractions in batch operations (null = (num_cpus × 1.5).ceil()).
    ///
    /// Limits parallelism to prevent resource exhaustion when processing
    /// large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
    /// </summary>
    [JsonPropertyName("max_concurrent_extractions")]
    public ulong? MaxConcurrentExtractions { get; init; } = null;

    /// <summary>
    /// Result structure format
    ///
    /// Controls whether results are returned in unified format (default) with all
    /// content in the `content` field, or element-based format with semantic
    /// elements (for Unstructured-compatible output).
    /// </summary>
    [JsonPropertyName("result_format")]
    public ResultFormat? ResultFormat { get; init; } = null;

    /// <summary>
    /// Security limits for archive extraction.
    ///
    /// Controls maximum archive size, compression ratio, file count, and other
    /// security thresholds to prevent decompression bomb attacks. Also caps
    /// nesting depth, iteration count, entity / token length, total
    /// content size, and table cell count for every extraction path that
    /// ingests user-controlled bytes.
    /// When `None`, default limits are used.
    /// </summary>
    [JsonPropertyName("security_limits")]
    public SecurityLimits? SecurityLimits { get; init; } = null;

    /// <summary>
    /// Maximum uncompressed size in bytes for a single embedded file before
    /// recursive extraction is attempted (default: 50 MiB).
    ///
    /// Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
    /// to email attachments processed via recursive extraction. Files that
    /// exceed this limit are skipped with a `ProcessingWarning` rather than
    /// passed to the extraction pipeline, preventing a single oversized
    /// embedded object from consuming unbounded memory or time.
    ///
    /// Set to `None` to disable the per-embedded-file cap (falls back to
    /// `security_limits.max_archive_size` as the only guard).
    /// </summary>
    [JsonPropertyName("max_embedded_file_bytes")]
    public ulong? MaxEmbeddedFileBytes { get; init; } = null;

    /// <summary>
    /// Content text format (default: Plain).
    ///
    /// Controls the format of the extracted content:
    /// - `Plain`: Raw extracted text (default)
    /// - `Markdown`: Markdown formatted output
    /// - `Djot`: Djot markup format (requires djot feature)
    /// - `Html`: HTML formatted output
    ///
    /// When set to a structured format, extraction results will include
    /// formatted output. The `formatted_content` field may be populated
    /// when format conversion is applied.
    /// </summary>
    [JsonPropertyName("output_format")]
    public OutputFormat OutputFormat { get; init; } = OutputFormat.Plain;

    /// <summary>
    /// Layout detection configuration (null = layout detection disabled).
    ///
    /// When set, PDF pages and images are analyzed for document structure
    /// (headings, code, formulas, tables, figures, etc.) using RT-DETR models
    /// via ONNX Runtime. For PDFs, layout hints override paragraph classification
    /// in the markdown pipeline. For images, per-region OCR is performed with
    /// markdown formatting based on detected layout classes.
    /// Requires the `layout-detection` feature to run inference; the field is
    /// present whenever the `layout-types` feature is active (which includes
    /// `layout-detection` as well as the no-ORT target groups).
    /// </summary>
    [JsonPropertyName("layout")]
    public LayoutDetectionConfig? Layout { get; init; } = null;

    /// <summary>
    /// Run layout detection on the non-OCR PDF markdown path.
    ///
    /// When `true` and `layout` is `Some(_)`, layout regions inform heading,
    /// table, list, and figure detection in the structure pipeline that would
    /// otherwise rely on font-clustering heuristics alone. Significantly
    /// improves SF1 (structural F1) at the cost of inference latency
    /// (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
    /// Requires the `layout-detection` feature.
    /// </summary>
    [JsonPropertyName("use_layout_for_markdown")]
    public bool UseLayoutForMarkdown { get; init; } = false;

    /// <summary>
    /// Enable structured document tree output.
    ///
    /// When true, populates the `document` field on `ExtractionResult` with a
    /// hierarchical `DocumentStructure` containing heading-driven section nesting,
    /// table grids, content layer classification, and inline annotations.
    ///
    /// Independent of `result_format` — can be combined with Unified or ElementBased.
    /// </summary>
    [JsonPropertyName("include_document_structure")]
    public bool IncludeDocumentStructure { get; init; } = false;

    /// <summary>
    /// Hardware acceleration configuration for ONNX Runtime models.
    ///
    /// Controls execution provider selection for layout detection and embedding
    /// models. When `None`, uses platform defaults (CoreML on macOS, CUDA on
    /// Linux, CPU on Windows).
    /// </summary>
    [JsonPropertyName("acceleration")]
    public AccelerationConfig? Acceleration { get; init; } = null;

    /// <summary>
    /// Cache namespace for tenant isolation.
    ///
    /// When set, cache entries are stored under `{cache_dir}/{namespace}/`.
    /// Must be alphanumeric, hyphens, or underscores only (max 64 chars).
    /// Different namespaces have isolated cache spaces on the same filesystem.
    /// </summary>
    [JsonPropertyName("cache_namespace")]
    public string? CacheNamespace { get; init; } = null;

    /// <summary>
    /// Per-request cache TTL in seconds.
    ///
    /// Overrides the global `max_age_days` for this specific extraction.
    /// When `0`, caching is completely skipped (no read or write).
    /// When `None`, the global TTL applies.
    /// </summary>
    [JsonPropertyName("cache_ttl_secs")]
    public ulong? CacheTtlSecs { get; init; } = null;

    /// <summary>
    /// Email extraction configuration (null = use defaults).
    ///
    /// Currently supports configuring the fallback codepage for MSG files
    /// that do not specify one. See `EmailConfig` for details.
    /// </summary>
    [JsonPropertyName("email")]
    public EmailConfig? Email { get; init; } = null;

    /// <summary>
    /// Concurrency limits for constrained environments (null = use defaults).
    ///
    /// Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
    /// (when `max_concurrent_extractions` is unset) the batch concurrency
    /// semaphore. See `ConcurrencyConfig` for details.
    /// </summary>
    [JsonPropertyName("concurrency")]
    public string? Concurrency { get; init; } = null;

    /// <summary>
    /// Maximum recursion depth for archive extraction (default: 3).
    /// Set to 0 to disable recursive extraction (legacy behavior).
    /// </summary>
    /// <remarks>
    /// NOTE(review): the property initializer in this binding is <c>0</c>, not the
    /// documented <c>3</c>, so an instance created with <c>new</c> carries <c>0</c>
    /// unless the caller sets it. <see cref="Default"/> takes its values from the
    /// native library instead. Confirm which default is intended.
    /// </remarks>
    [JsonPropertyName("max_archive_depth")]
    public ulong MaxArchiveDepth { get; init; } = 0;

    /// <summary>
    /// Tree-sitter language pack configuration (null = tree-sitter disabled).
    ///
    /// When set, enables code file extraction using tree-sitter parsers.
    /// Controls grammar download behavior and code analysis options.
    /// </summary>
    [JsonPropertyName("tree_sitter")]
    public TreeSitterConfig? TreeSitter { get; init; } = null;

    /// <summary>
    /// Structured extraction via LLM (null = disabled).
    ///
    /// When set, the extracted document content is sent to an LLM with the
    /// provided JSON schema. The structured response is stored in
    /// `ExtractionResult.structured_output`.
    /// </summary>
    [JsonPropertyName("structured_extraction")]
    public StructuredExtractionConfig? StructuredExtraction { get; init; } = null;

    /// <summary>
    /// Cancellation token for this extraction (null = no external cancellation).
    ///
    /// Pass a `CancellationToken` clone here and call `CancellationToken.cancel`
    /// from another thread / task to abort the extraction in progress. The extractor
    /// checks the token at safe checkpoints (before lock acquisition, between pages,
    /// between batch items) and returns `KreuzbergError.Cancelled` when set.
    ///
    /// The field is excluded from serialization because `CancellationToken` is a
    /// runtime handle, not a configuration value.
    /// </summary>
    /// <remarks>
    /// NOTE(review): in this binding the property is a nullable string mapped to
    /// <c>cancel_token</c> and carries no <c>[JsonIgnore]</c>, so a non-null value
    /// would be written by System.Text.Json. Confirm whether it should be ignored.
    /// </remarks>
    [JsonPropertyName("cancel_token")]
    public string? CancelToken { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="ExtractionConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the configuration.</param>
    /// <returns>The deserialised configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExtractionConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ExtractionConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ExtractionConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ExtractionConfig from JSON: {e.Message}", e);
        }
    }

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Builds the default configuration as reported by the native library.
    ///
    /// Asks the native layer for a default config handle, renders it to JSON,
    /// and deserialises that JSON into a managed <see cref="ExtractionConfig"/>.
    /// Both the native handle and the native JSON string are released even when
    /// an intermediate step throws.
    /// </summary>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native layer returns a null handle or a null JSON string, or when
    /// the JSON deserialises to <c>null</c>.
    /// </exception>
    public static ExtractionConfig Default()
    {
        var nativeResult = NativeMethods.ExtractionConfigDefault();
        if (nativeResult == IntPtr.Zero)
        {
            throw new KreuzbergException("Failed to create default ExtractionConfig: native call returned a null handle");
        }

        string? json;
        try
        {
            var jsonPtr = NativeMethods.ExtractionConfigToJson(nativeResult);
            if (jsonPtr == IntPtr.Zero)
            {
                throw new KreuzbergException("Failed to create default ExtractionConfig: native serializer returned a null string");
            }

            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // The string is owned by the native side; release it on every path.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            NativeMethods.ExtractionConfigFree(nativeResult);
        }

        // Previously a null result was hidden behind the null-forgiving operator.
        return JsonSerializer.Deserialize<ExtractionConfig>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to create default ExtractionConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,102 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Full set of differences between two `ExtractionResult` values, referred to
/// below as `a` and `b`.
/// </summary>
public sealed record ExtractionDiff
{
    /// <summary>
    /// Serializer settings used by <see cref="FromJson"/>: enum values are read as
    /// snake_case strings.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Serializer settings that omit null members but keep explicit false/0 values,
    /// intended for objects handed to the native layer.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Unified-diff hunks for the `content` field.
    ///
    /// Empty when the content is identical.
    /// </summary>
    [JsonPropertyName("content_diff")]
    public List<DiffHunk> ContentDiff { get; init; } = [];

    /// <summary>
    /// Tables that exist in `b` but not in `a` (matched by index position; the
    /// excess right-side tables).
    /// </summary>
    [JsonPropertyName("tables_added")]
    public List<Table> TablesAdded { get; init; } = [];

    /// <summary>
    /// Tables that exist in `a` but not in `b` (matched by index position; the
    /// excess left-side tables).
    /// </summary>
    [JsonPropertyName("tables_removed")]
    public List<Table> TablesRemoved { get; init; } = [];

    /// <summary>
    /// Cell-level changes for table pairs that share the same index and dimensions.
    /// </summary>
    [JsonPropertyName("tables_changed")]
    public List<TableDiff> TablesChanged { get; init; } = [];

    /// <summary>
    /// Metadata difference, encoded as a JSON object with three top-level keys:
    /// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
    /// but not `b`), and `changed` (keys whose values differ — each entry is
    /// `{ "from": &lt;value-in-a&gt;, "to": &lt;value-in-b&gt; }`).
    ///
    /// This is NOT RFC 6902 JSON Patch — a flatter shape was chosen deliberately
    /// to avoid pulling in a json-patch crate. For RFC 6902 semantics (with JSON
    /// Pointer paths), feed `a.metadata` and `b.metadata` to a json-patch
    /// implementation directly.
    /// </summary>
    [JsonPropertyName("metadata_changed")]
    public required string MetadataChanged { get; init; }

    /// <summary>
    /// Changes to embedded archive children.
    /// </summary>
    [JsonPropertyName("embedded_changes")]
    public required EmbeddedChanges EmbeddedChanges { get; init; }

    /// <summary>
    /// Builds an <see cref="ExtractionDiff"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the diff.</param>
    /// <returns>The deserialised instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws or yields <c>null</c>. The underlying exception,
    /// if any, is attached as the inner exception.
    /// </exception>
    public static ExtractionDiff FromJson(string json)
    {
        ExtractionDiff? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ExtractionDiff>(json, JsonOptions);
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Library exceptions pass through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse ExtractionDiff from JSON: {ex.Message}", ex);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse ExtractionDiff from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,53 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Identifies how the extracted text was produced.
/// </summary>
[JsonConverter(typeof(ExtractionMethodJsonConverter))]
public enum ExtractionMethod
{
    /// <summary>Serialized as <c>native</c>.</summary>
    [JsonPropertyName("native")]
    Native = 0,

    /// <summary>Serialized as <c>ocr</c>.</summary>
    [JsonPropertyName("ocr")]
    Ocr = 1,

    /// <summary>Serialized as <c>mixed</c>.</summary>
    [JsonPropertyName("mixed")]
    Mixed = 2,
}

/// <summary>
/// JSON converter that maps each <see cref="ExtractionMethod"/> member to its
/// explicit lowercase wire name and back.
/// </summary>
internal sealed class ExtractionMethodJsonConverter : JsonConverter<ExtractionMethod>
{
    /// <summary>
    /// Reads the current token as a string and maps it to an enum member.
    /// </summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override ExtractionMethod Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();

        if (wireName == "native")
        {
            return ExtractionMethod.Native;
        }

        if (wireName == "ocr")
        {
            return ExtractionMethod.Ocr;
        }

        if (wireName == "mixed")
        {
            return ExtractionMethod.Mixed;
        }

        throw new JsonException($"Unknown ExtractionMethod value: {wireName}");
    }

    /// <summary>
    /// Writes the wire name of <paramref name="value"/> as a JSON string.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ExtractionMethod value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ExtractionMethod.Native:
                wireName = "native";
                break;
            case ExtractionMethod.Ocr:
                wireName = "ocr";
                break;
            case ExtractionMethod.Mixed:
                wireName = "mixed";
                break;
            default:
                throw new JsonException($"Unknown ExtractionMethod value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,332 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// General extraction result used by the core extraction API.
///
/// This is the main result type returned by all extraction functions.
/// </summary>
public sealed record ExtractionResult
{
[JsonPropertyName("content")]
public string Content { get; init; } = "";
[JsonPropertyName("mime_type")]
public string MimeType { get; init; } = "";
[JsonPropertyName("metadata")]
public Metadata Metadata { get; init; } = default!;
/// <summary>
/// Extraction strategy used to produce the returned text.
///
/// Populated when the extractor can reliably distinguish native text extraction,
/// OCR-only extraction, or mixed native/OCR output.
/// </summary>
[JsonPropertyName("extraction_method")]
public ExtractionMethod? ExtractionMethod { get; init; } = null;
[JsonPropertyName("tables")]
public List<Table> Tables { get; init; } = [];
[JsonPropertyName("detected_languages")]
public List<string>? DetectedLanguages { get; init; } = null;
/// <summary>
/// Text chunks when chunking is enabled.
///
/// When chunking configuration is provided, the content is split into
/// overlapping chunks for efficient processing. Each chunk contains the text,
/// optional embeddings (if enabled), and metadata about its position.
/// </summary>
[JsonPropertyName("chunks")]
public List<Chunk>? Chunks { get; init; } = null;
/// <summary>
/// Extracted images from the document.
///
/// When image extraction is enabled via `ImageExtractionConfig`, this field
/// contains all images found in the document with their raw data and metadata.
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
/// </summary>
[JsonPropertyName("images")]
public List<ExtractedImage>? Images { get; init; } = null;
/// <summary>
/// Per-page content when page extraction is enabled.
///
/// When page extraction is configured, the document is split into per-page content
/// with tables and images mapped to their respective pages.
/// </summary>
[JsonPropertyName("pages")]
public List<PageContent>? Pages { get; init; } = null;
/// <summary>
/// Semantic elements when element-based result format is enabled.
///
/// When result_format is set to ElementBased, this field contains semantic
/// elements with type classification, unique identifiers, and metadata for
/// Unstructured-compatible element-based processing.
/// </summary>
[JsonPropertyName("elements")]
public List<Element>? Elements { get; init; } = null;
/// <summary>
/// Rich Djot content structure (when extracting Djot documents).
///
/// When extracting Djot documents with structured extraction enabled,
/// this field contains the full semantic structure including:
/// - Block-level elements with nesting
/// - Inline formatting with attributes
/// - Links, images, footnotes
/// - Math expressions
/// - Complete attribute information
///
/// The `content` field still contains plain text for backward compatibility.
///
/// Always `None` for non-Djot documents.
/// </summary>
[JsonPropertyName("djot_content")]
public DjotContent? DjotContent { get; init; } = null;
/// <summary>
/// OCR elements with full spatial and confidence metadata.
///
/// When OCR is performed with element extraction enabled, this field contains
/// the structured representation of detected text including:
/// - Bounding geometry (rectangles or quadrilaterals)
/// - Confidence scores (detection and recognition)
/// - Rotation information
/// - Hierarchical relationships (Tesseract only)
///
/// This field preserves all metadata that would otherwise be lost when
/// converting to plain text or markdown output formats.
///
/// Only populated when `OcrElementConfig.include_elements` is true.
/// </summary>
[JsonPropertyName("ocr_elements")]
public List<OcrElement>? OcrElements { get; init; } = null;
/// <summary>
/// Structured document tree (when document structure extraction is enabled).
///
/// When `include_document_structure` is true in `ExtractionConfig`, this field
/// contains the full hierarchical representation of the document including:
/// - Heading-driven section nesting
/// - Table grids with cell-level metadata
/// - Content layer classification (body, header, footer, footnote)
/// - Inline text annotations (formatting, links)
/// - Bounding boxes and page numbers
///
/// Independent of `result_format` — can be combined with Unified or ElementBased.
/// </summary>
[JsonPropertyName("document")]
public DocumentStructure? Document { get; init; } = null;
/// <summary>
/// Extracted keywords when keyword extraction is enabled.
///
/// When keyword extraction (RAKE or YAKE) is configured, this field contains
/// the extracted keywords with scores, algorithm info, and position data.
/// Previously stored in `metadata.additional["keywords"]`.
/// </summary>
[JsonPropertyName("extracted_keywords")]
public List<Keyword>? ExtractedKeywords { get; init; } = null;
/// <summary>
/// Document quality score from quality analysis.
///
/// A value between 0.0 and 1.0 indicating the overall text quality.
/// Previously stored in `metadata.additional["quality_score"]`.
/// </summary>
[JsonPropertyName("quality_score")]
public double? QualityScore { get; init; } = null;
/// <summary>
/// Non-fatal warnings collected during processing pipeline stages.
///
/// Captures errors from optional pipeline features (embedding, chunking,
/// language detection, output formatting) that don't prevent extraction
/// but may indicate degraded results.
/// Previously stored as individual keys in `metadata.additional`.
/// </summary>
[JsonPropertyName("processing_warnings")]
public List<ProcessingWarning> ProcessingWarnings { get; init; } = [];
/// <summary>
/// PDF annotations extracted from the document.
///
/// When annotation extraction is enabled via `PdfConfig.extract_annotations`,
/// this field contains text notes, highlights, links, stamps, and other
/// annotations found in PDF documents.
/// </summary>
[JsonPropertyName("annotations")]
public List<PdfAnnotation>? Annotations { get; init; } = null;
/// <summary>
/// Nested extraction results from archive contents.
///
/// When extracting archives, each processable file inside produces its own
/// full extraction result. Set to `None` for non-archive formats.
/// Use `max_archive_depth` in config to control recursion depth.
/// </summary>
[JsonPropertyName("children")]
public List<ArchiveEntry>? Children { get; init; } = null;
/// <summary>
/// URIs/links discovered during document extraction.
///
/// Contains hyperlinks, image references, citations, email addresses, and
/// other URI-like references found in the document. Always extracted when
/// present in the source document.
/// </summary>
[JsonPropertyName("uris")]
public List<ExtractedUri>? Uris { get; init; } = null;
/// <summary>
/// Tracked changes embedded in the source document.
///
/// Populated by per-format extractors that understand change-tracking
/// metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
/// …). Every extractor defaults to `None` until its format-specific
/// implementation is added. Extractors that do populate this field follow
/// the "accepted-changes" convention: inserted text is present in
/// `content`, deleted text is absent — the revision list is the separate
/// audit trail.
/// </summary>
[JsonPropertyName("revisions")]
public List<DocumentRevision>? Revisions { get; init; } = null;
/// <summary>
/// Structured extraction output from LLM-based JSON schema extraction.
///
/// When `structured_extraction` is configured in `ExtractionConfig`, the
/// extracted document content is sent to a VLM with the provided JSON schema.
/// The response is parsed and stored here as a JSON value matching the schema.
/// </summary>
[JsonPropertyName("structured_output")]
public string? StructuredOutput { get; init; } = null;
/// <summary>
/// Code intelligence results from tree-sitter analysis.
///
/// Populated when extracting source code files with the `tree-sitter` feature.
/// Contains metrics, structural analysis, imports/exports, comments,
/// docstrings, symbols, diagnostics, and optionally chunked code segments.
///
/// Stored as an opaque JSON value so that all language bindings (Go, Java,
/// C#, …) can deserialize it as a raw JSON object rather than a typed struct.
/// The underlying type is `tree_sitter_language_pack.ProcessResult`.
/// </summary>
[JsonPropertyName("code_intelligence")]
public string? CodeIntelligence { get; init; } = null;
/// <summary>
/// LLM token usage and cost data for all LLM calls made during this extraction.
///
/// Contains one entry per LLM call. Multiple entries are produced when
/// VLM OCR, structured extraction, or LLM embeddings run during
/// the same extraction.
///
/// `None` when no LLM was used.
/// </summary>
[JsonPropertyName("llm_usage")]
public List<LlmUsage>? LlmUsage { get; init; } = null;
/// <summary>
/// Pre-rendered content in the requested output format.
///
/// Populated during `derive_extraction_result` before tree derivation consumes
/// element data. `apply_output_format` swaps this into `content` at the end
/// of the pipeline, after post-processors have operated on plain text.
/// </summary>
[JsonPropertyName("formatted_content")]
public string? FormattedContent { get; init; } = null;
/// <summary>
/// Structured hOCR document for the OCR+layout pipeline.
///
/// When tesseract produces hOCR output, the parsed `InternalDocument` carries
/// paragraph structure with bounding boxes and confidence scores. The layout
/// classification step enriches these elements before final rendering.
/// </summary>
[JsonPropertyName("ocr_internal_document")]
public string? OcrInternalDocument { get; init; } = null;
/// <summary>
/// Parse a <see cref="ExtractionResult"/> from a JSON string.
/// </summary>
/// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
public static ExtractionResult FromJson(string json)
{
try
{
return JsonSerializer.Deserialize<ExtractionResult>(json, JsonOptions)
?? throw new KreuzbergException($"Failed to parse ExtractionResult from JSON: deserializer returned null");
}
catch (KreuzbergException)
{
throw;
}
catch (Exception e)
{
throw new KreuzbergException($"Failed to parse ExtractionResult from JSON: {e.Message}", e);
}
}
private static readonly JsonSerializerOptions JsonOptions = new()
{
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
/// <summary>Options for serializing config/input objects to FFI. Strips nulls
/// (nullable C# fields default to null and would override required Rust fields with
/// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
private static readonly JsonSerializerOptions JsonSerializationOptions = new()
{
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
/// <summary>
/// Convert from an OCR result.
///
/// The OCR result is serialized to JSON, handed to the native library to build a
/// native OCR handle, converted natively into an extraction result, and the native
/// result is read back as JSON and deserialized into a managed
/// <see cref="ExtractionResult"/>.
/// </summary>
/// <param name="ocr">The OCR result to convert.</param>
/// <returns>The converted <see cref="ExtractionResult"/>; never <c>null</c>.</returns>
/// <exception cref="KreuzbergException">
/// When a native call returns a null handle/pointer, or when the JSON produced by
/// the native library deserialises to null.
/// </exception>
public static ExtractionResult FromOcr(OcrExtractionResult ocr)
{
    // Builds an exception from the native "last error" state. It must be called
    // before any further native call (including the Free functions) runs, so the
    // error state still belongs to the call that failed.
    static KreuzbergException NativeFailure(string fallbackMessage)
    {
        var ec = NativeMethods.LastErrorCode();
        var ctxPtr = NativeMethods.LastErrorContext();
        var msg = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(ctxPtr) ?? fallbackMessage;
        return new KreuzbergException(ec, msg);
    }

    var ocrJson = JsonSerializer.Serialize(ocr, JsonSerializationOptions);
    var ocrHandle = NativeMethods.OcrExtractionResultFromJson(ocrJson);
    if (ocrHandle == IntPtr.Zero)
    {
        throw NativeFailure("OcrExtractionResultFromJson failed");
    }

    try
    {
        var nativeResult = NativeMethods.ExtractionResultFromOcr(ocrHandle);
        if (nativeResult == IntPtr.Zero)
        {
            // Do not pass a null handle on to ToJson/Free.
            throw NativeFailure("ExtractionResultFromOcr failed");
        }

        string? json;
        try
        {
            var jsonPtr = NativeMethods.ExtractionResultToJson(nativeResult);
            if (jsonPtr == IntPtr.Zero)
            {
                throw NativeFailure("ExtractionResultToJson failed");
            }

            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // The native string is released even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // The native result is released on every path once it has been created.
            NativeMethods.ExtractionResultFree(nativeResult);
        }

        // Never hand a null back through a non-nullable return type.
        return JsonSerializer.Deserialize<ExtractionResult>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to parse ExtractionResult from JSON: deserializer returned null");
    }
    finally
    {
        NativeMethods.OcrExtractionResultFree(ocrHandle);
    }
}
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata for FictionBook (FB2) documents.
/// </summary>
public sealed record FictionBookMetadata
{
    /// <summary>Values of the <c>genres</c> JSON field; an empty list by default.</summary>
    [JsonPropertyName("genres")]
    public List<string> Genres { get; init; } = new();

    /// <summary>Values of the <c>sequences</c> JSON field; an empty list by default.</summary>
    [JsonPropertyName("sequences")]
    public List<string> Sequences { get; init; } = new();

    /// <summary>Value of the <c>annotation</c> JSON field; <c>null</c> when absent.</summary>
    [JsonPropertyName("annotation")]
    public string? Annotation { get; init; }

    /// <summary>
    /// Deserialize a <see cref="FictionBookMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the metadata.</param>
    /// <returns>The deserialized instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised, or deserialises to null.</exception>
    public static FictionBookMetadata FromJson(string json)
    {
        FictionBookMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<FictionBookMetadata>(json, JsonOptions);
        }
        // A KreuzbergException propagates untouched; anything else is wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse FictionBookMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse FictionBookMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are mapped through
    /// snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with non-deserialisable nulls), while explicit false/0
    /// values are kept so caller intent is preserved.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,210 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Per-file overrides of the extraction configuration, used in batch processing.
///
/// Every member is optional (`Option&lt;T&gt;` on the Rust side); leaving a member
/// unset (`None` / null) means "use the batch-level default." The type is used with
/// `batch_extract_files` and `batch_extract_bytes` so that a single batch can mix
/// different extraction settings.
///
/// # Excluded Fields
///
/// These `ExtractionConfig` fields apply to the batch as a whole and cannot be
/// overridden per file:
/// - `max_concurrent_extractions` — controls batch parallelism
/// - `use_cache` — global caching policy
/// - `acceleration` — shared ONNX execution provider
/// - `security_limits` — global archive security policy
/// </summary>
public sealed record FileExtractionConfig
{
    /// <summary>
    /// Per-file override for quality post-processing.
    /// </summary>
    [JsonPropertyName("enable_quality_processing")]
    public bool? EnableQualityProcessing { get; init; }

    /// <summary>
    /// Per-file override for the OCR configuration (null = use the batch default).
    /// </summary>
    [JsonPropertyName("ocr")]
    public OcrConfig? Ocr { get; init; }

    /// <summary>
    /// Per-file override for forcing OCR.
    /// </summary>
    [JsonPropertyName("force_ocr")]
    public bool? ForceOcr { get; init; }

    /// <summary>
    /// Per-file override for the pages on which OCR is forced (1-indexed page numbers).
    /// </summary>
    [JsonPropertyName("force_ocr_pages")]
    public List<uint>? ForceOcrPages { get; init; }

    /// <summary>
    /// Per-file override for disabling OCR.
    /// </summary>
    [JsonPropertyName("disable_ocr")]
    public bool? DisableOcr { get; init; }

    /// <summary>
    /// Per-file override for the chunking configuration.
    /// </summary>
    [JsonPropertyName("chunking")]
    public ChunkingConfig? Chunking { get; init; }

    /// <summary>
    /// Per-file override for the content filtering configuration.
    /// </summary>
    [JsonPropertyName("content_filter")]
    public ContentFilterConfig? ContentFilter { get; init; }

    /// <summary>
    /// Per-file override for the image extraction configuration.
    /// </summary>
    [JsonPropertyName("images")]
    public ImageExtractionConfig? Images { get; init; }

    /// <summary>
    /// Per-file override for PDF options.
    /// </summary>
    [JsonPropertyName("pdf_options")]
    public PdfConfig? PdfOptions { get; init; }

    /// <summary>
    /// Per-file override for token reduction.
    /// </summary>
    [JsonPropertyName("token_reduction")]
    public TokenReductionOptions? TokenReduction { get; init; }

    /// <summary>
    /// Per-file override for language detection.
    /// </summary>
    [JsonPropertyName("language_detection")]
    public LanguageDetectionConfig? LanguageDetection { get; init; }

    /// <summary>
    /// Per-file override for page extraction.
    /// </summary>
    [JsonPropertyName("pages")]
    public PageConfig? Pages { get; init; }

    /// <summary>
    /// Per-file override for keyword extraction.
    /// </summary>
    [JsonPropertyName("keywords")]
    public KeywordConfig? Keywords { get; init; }

    /// <summary>
    /// Per-file override for the post-processor.
    /// </summary>
    [JsonPropertyName("postprocessor")]
    public PostProcessorConfig? Postprocessor { get; init; }

    /// <summary>
    /// Per-file override for HTML conversion options.
    /// </summary>
    [JsonPropertyName("html_options")]
    public string? HtmlOptions { get; init; }

    /// <summary>
    /// Per-file override for the result format.
    /// </summary>
    [JsonPropertyName("result_format")]
    public ResultFormat? ResultFormat { get; init; }

    /// <summary>
    /// Per-file override for the output content format.
    /// </summary>
    [JsonPropertyName("output_format")]
    public OutputFormat? OutputFormat { get; init; }

    /// <summary>
    /// Per-file override for document structure output.
    /// </summary>
    [JsonPropertyName("include_document_structure")]
    public bool? IncludeDocumentStructure { get; init; }

    /// <summary>
    /// Per-file override for layout detection.
    /// </summary>
    [JsonPropertyName("layout")]
    public LayoutDetectionConfig? Layout { get; init; }

    /// <summary>
    /// Per-file override for the extraction timeout, in seconds.
    ///
    /// When set, extraction of this file is canceled once the given duration
    /// has elapsed. A file that times out yields an error result and does not
    /// affect the other files in the batch.
    /// </summary>
    [JsonPropertyName("timeout_secs")]
    public ulong? TimeoutSecs { get; init; }

    /// <summary>
    /// Per-file override for the tree-sitter configuration.
    /// </summary>
    [JsonPropertyName("tree_sitter")]
    public TreeSitterConfig? TreeSitter { get; init; }

    /// <summary>
    /// Per-file override for the structured extraction configuration.
    ///
    /// When set, LLM-based structured extraction with a JSON schema is enabled
    /// for this specific file: the extracted content is sent to a VLM/LLM and
    /// the response is parsed according to the provided schema.
    /// </summary>
    [JsonPropertyName("structured_extraction")]
    public StructuredExtractionConfig? StructuredExtraction { get; init; }

    /// <summary>
    /// Deserialize a <see cref="FileExtractionConfig"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the per-file configuration.</param>
    /// <returns>The deserialized instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised, or deserialises to null.</exception>
    public static FileExtractionConfig FromJson(string json)
    {
        FileExtractionConfig? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<FileExtractionConfig>(json, JsonOptions);
        }
        // A KreuzbergException propagates untouched; anything else is wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse FileExtractionConfig from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse FileExtractionConfig from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are mapped through
    /// snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with non-deserialisable nulls), while explicit false/0
    /// values are kept so caller intent is preserved.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A footnote in a Djot document.
/// </summary>
public sealed record Footnote
{
    /// <summary>
    /// Label of the footnote (required).
    /// </summary>
    [JsonPropertyName("label")]
    public required string Label { get; init; }

    /// <summary>
    /// Content blocks of the footnote; an empty list by default.
    /// </summary>
    [JsonPropertyName("content")]
    public List<FormattedBlock> Content { get; init; } = new();

    /// <summary>
    /// Deserialize a <see cref="Footnote"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the footnote.</param>
    /// <returns>The deserialized instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised, or deserialises to null.</exception>
    public static Footnote FromJson(string json)
    {
        Footnote? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<Footnote>(json, JsonOptions);
        }
        // A KreuzbergException propagates untouched; anything else is wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse Footnote from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse Footnote from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are mapped through
    /// snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with non-deserialisable nulls), while explicit false/0
    /// values are kept so caller intent is preserved.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,294 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Format-specific metadata, modelled as a discriminated union.
///
/// An extraction result carries at most one format type, which keeps the
/// metadata type-safe and free of nested optionals. Each nested record is one
/// variant wrapping that format's metadata in <c>Value</c>; the <c>As*</c>
/// properties give null-returning access to a specific variant's payload.
/// </summary>
[JsonConverter(typeof(FormatMetadataJsonConverter))]
public abstract record FormatMetadata
{
    /// <summary>PDF variant.</summary>
    public sealed record Pdf(PdfMetadata Value) : FormatMetadata;

    /// <summary>Docx variant.</summary>
    public sealed record Docx(DocxMetadata Value) : FormatMetadata;

    /// <summary>Excel variant.</summary>
    public sealed record Excel(ExcelMetadata Value) : FormatMetadata;

    /// <summary>Email variant.</summary>
    public sealed record Email(EmailMetadata Value) : FormatMetadata;

    /// <summary>Pptx variant.</summary>
    public sealed record Pptx(PptxMetadata Value) : FormatMetadata;

    /// <summary>Archive variant.</summary>
    public sealed record Archive(ArchiveMetadata Value) : FormatMetadata;

    /// <summary>Image variant.</summary>
    public sealed record Image(ImageMetadata Value) : FormatMetadata;

    /// <summary>Xml variant.</summary>
    public sealed record Xml(XmlMetadata Value) : FormatMetadata;

    /// <summary>Text variant.</summary>
    public sealed record Text(TextMetadata Value) : FormatMetadata;

    /// <summary>Html variant.</summary>
    public sealed record Html(HtmlMetadata Value) : FormatMetadata;

    /// <summary>Ocr variant.</summary>
    public sealed record Ocr(OcrMetadata Value) : FormatMetadata;

    /// <summary>Csv variant.</summary>
    public sealed record Csv(CsvMetadata Value) : FormatMetadata;

    /// <summary>Bibtex variant.</summary>
    public sealed record Bibtex(BibtexMetadata Value) : FormatMetadata;

    /// <summary>Citation variant.</summary>
    public sealed record Citation(CitationMetadata Value) : FormatMetadata;

    /// <summary>FictionBook variant.</summary>
    public sealed record FictionBook(FictionBookMetadata Value) : FormatMetadata;

    /// <summary>Dbf variant.</summary>
    public sealed record Dbf(DbfMetadata Value) : FormatMetadata;

    /// <summary>Jats variant.</summary>
    public sealed record Jats(JatsMetadata Value) : FormatMetadata;

    /// <summary>Epub variant.</summary>
    public sealed record Epub(EpubMetadata Value) : FormatMetadata;

    /// <summary>Pst variant.</summary>
    public sealed record Pst(PstMetadata Value) : FormatMetadata;

    /// <summary>Code variant; its payload is exposed as an untyped <see cref="object"/>.</summary>
    public sealed record Code(object Value) : FormatMetadata;

    /// <summary>Returns the Pdf data if this is a Pdf variant, otherwise null.</summary>
    public PdfMetadata? AsPdf => (this as Pdf)?.Value;

    /// <summary>Returns the Docx data if this is a Docx variant, otherwise null.</summary>
    public DocxMetadata? AsDocx => (this as Docx)?.Value;

    /// <summary>Returns the Excel data if this is an Excel variant, otherwise null.</summary>
    public ExcelMetadata? AsExcel => (this as Excel)?.Value;

    /// <summary>Returns the Email data if this is an Email variant, otherwise null.</summary>
    public EmailMetadata? AsEmail => (this as Email)?.Value;

    /// <summary>Returns the Pptx data if this is a Pptx variant, otherwise null.</summary>
    public PptxMetadata? AsPptx => (this as Pptx)?.Value;

    /// <summary>Returns the Archive data if this is an Archive variant, otherwise null.</summary>
    public ArchiveMetadata? AsArchive => (this as Archive)?.Value;

    /// <summary>Returns the Image data if this is an Image variant, otherwise null.</summary>
    public ImageMetadata? AsImage => (this as Image)?.Value;

    /// <summary>Returns the Xml data if this is an Xml variant, otherwise null.</summary>
    public XmlMetadata? AsXml => (this as Xml)?.Value;

    /// <summary>Returns the Text data if this is a Text variant, otherwise null.</summary>
    public TextMetadata? AsText => (this as Text)?.Value;

    /// <summary>Returns the Html data if this is an Html variant, otherwise null.</summary>
    public HtmlMetadata? AsHtml => (this as Html)?.Value;

    /// <summary>Returns the Ocr data if this is an Ocr variant, otherwise null.</summary>
    public OcrMetadata? AsOcr => (this as Ocr)?.Value;

    /// <summary>Returns the Csv data if this is a Csv variant, otherwise null.</summary>
    public CsvMetadata? AsCsv => (this as Csv)?.Value;

    /// <summary>Returns the Bibtex data if this is a Bibtex variant, otherwise null.</summary>
    public BibtexMetadata? AsBibtex => (this as Bibtex)?.Value;

    /// <summary>Returns the Citation data if this is a Citation variant, otherwise null.</summary>
    public CitationMetadata? AsCitation => (this as Citation)?.Value;

    /// <summary>Returns the FictionBook data if this is a FictionBook variant, otherwise null.</summary>
    public FictionBookMetadata? AsFictionBook => (this as FictionBook)?.Value;

    /// <summary>Returns the Dbf data if this is a Dbf variant, otherwise null.</summary>
    public DbfMetadata? AsDbf => (this as Dbf)?.Value;

    /// <summary>Returns the Jats data if this is a Jats variant, otherwise null.</summary>
    public JatsMetadata? AsJats => (this as Jats)?.Value;

    /// <summary>Returns the Epub data if this is an Epub variant, otherwise null.</summary>
    public EpubMetadata? AsEpub => (this as Epub)?.Value;

    /// <summary>Returns the Pst data if this is a Pst variant, otherwise null.</summary>
    public PstMetadata? AsPst => (this as Pst)?.Value;

    /// <summary>Returns the Code data if this is a Code variant, otherwise null.</summary>
    public object? AsCode => (this as Code)?.Value;
}
/// <summary>
/// Custom converter for the <see cref="FormatMetadata"/> sealed union with flattened variant fields.
/// </summary>
/// <remarks>
/// Handles JSON objects with a discriminator field (format_type) and variant-specific
/// fields at the same level. System.Text.Json's [JsonPolymorphic] cannot handle
/// this layout, so the (de)serialization is done manually here.
/// </remarks>
public sealed class FormatMetadataJsonConverter : JsonConverter<FormatMetadata>
{
    /// <summary>Name of the JSON property that selects the union variant.</summary>
    private const string DiscriminatorField = "format_type";

    /// <summary>
    /// Reads a flattened, tagged JSON object and materializes the matching
    /// <see cref="FormatMetadata"/> variant.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the JSON object.</param>
    /// <param name="typeToConvert">The requested type (unused; the discriminator decides the variant).</param>
    /// <param name="options">Serializer options forwarded when deserializing the variant.</param>
    /// <returns>The deserialized variant.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an object, the discriminator is missing, null, not a
    /// string, or unknown, or when the variant deserializes to null.
    /// </exception>
    public override FormatMetadata Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(DiscriminatorField, out var tagElement))
        {
            throw new JsonException("Missing discriminator field: format_type");
        }

        if (tagElement.ValueKind == JsonValueKind.Null)
        {
            throw new JsonException("Discriminator field is null");
        }

        // GetString() throws InvalidOperationException for non-string tokens;
        // report a malformed discriminator as a JSON error with a clear message.
        if (tagElement.ValueKind != JsonValueKind.String)
        {
            throw new JsonException($"Discriminator field format_type must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString();
        if (tagValue == null)
        {
            throw new JsonException("Discriminator field is null");
        }

        // Every variant is a tuple-style record (`Variant(Inner Value)`), which
        // expects a single "Value" property holding the inner object's JSON. So
        // re-emit all fields except the discriminator nested under "Value".
        using var buffer = new MemoryStream();
        using var wrapper = new Utf8JsonWriter(buffer);
        wrapper.WriteStartObject();
        wrapper.WritePropertyName("Value");
        wrapper.WriteStartObject();
        foreach (var prop in root.EnumerateObject())
        {
            if (prop.Name != DiscriminatorField)
            {
                wrapper.WritePropertyName(prop.Name);
                prop.Value.WriteTo(wrapper);
            }
        }
        wrapper.WriteEndObject();
        wrapper.WriteEndObject();
        wrapper.Flush();
        var wrappedJson = buffer.ToArray();

        return tagValue switch
        {
            "pdf" => ReadVariant<FormatMetadata.Pdf>(wrappedJson, options),
            "docx" => ReadVariant<FormatMetadata.Docx>(wrappedJson, options),
            "excel" => ReadVariant<FormatMetadata.Excel>(wrappedJson, options),
            "email" => ReadVariant<FormatMetadata.Email>(wrappedJson, options),
            "pptx" => ReadVariant<FormatMetadata.Pptx>(wrappedJson, options),
            "archive" => ReadVariant<FormatMetadata.Archive>(wrappedJson, options),
            "image" => ReadVariant<FormatMetadata.Image>(wrappedJson, options),
            "xml" => ReadVariant<FormatMetadata.Xml>(wrappedJson, options),
            "text" => ReadVariant<FormatMetadata.Text>(wrappedJson, options),
            "html" => ReadVariant<FormatMetadata.Html>(wrappedJson, options),
            "ocr" => ReadVariant<FormatMetadata.Ocr>(wrappedJson, options),
            "csv" => ReadVariant<FormatMetadata.Csv>(wrappedJson, options),
            "bibtex" => ReadVariant<FormatMetadata.Bibtex>(wrappedJson, options),
            "citation" => ReadVariant<FormatMetadata.Citation>(wrappedJson, options),
            "fiction_book" => ReadVariant<FormatMetadata.FictionBook>(wrappedJson, options),
            "dbf" => ReadVariant<FormatMetadata.Dbf>(wrappedJson, options),
            "jats" => ReadVariant<FormatMetadata.Jats>(wrappedJson, options),
            "epub" => ReadVariant<FormatMetadata.Epub>(wrappedJson, options),
            "pst" => ReadVariant<FormatMetadata.Pst>(wrappedJson, options),
            "code" => ReadVariant<FormatMetadata.Code>(wrappedJson, options),
            _ => throw new JsonException($"Unknown FormatMetadata discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Writes a <see cref="FormatMetadata"/> variant as a flat JSON object: the
    /// <c>format_type</c> discriminator followed by the inner value's properties.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">The variant to serialize.</param>
    /// <param name="options">Serializer options used for the inner value.</param>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, FormatMetadata value, JsonSerializerOptions options)
    {
        // Emit the discriminator tag plus the inner variant's fields flattened at
        // the same level, e.g. `FormatMetadata.Pdf(PdfMetadata value)` becomes
        // `{"format_type":"pdf", ...pdf fields...}` rather than `{"Value":{...}}`.
        // This is the same layout that Read expects.
        (string Tag, object? Inner) variant = value switch
        {
            FormatMetadata.Pdf v => ("pdf", v.Value),
            FormatMetadata.Docx v => ("docx", v.Value),
            FormatMetadata.Excel v => ("excel", v.Value),
            FormatMetadata.Email v => ("email", v.Value),
            FormatMetadata.Pptx v => ("pptx", v.Value),
            FormatMetadata.Archive v => ("archive", v.Value),
            FormatMetadata.Image v => ("image", v.Value),
            FormatMetadata.Xml v => ("xml", v.Value),
            FormatMetadata.Text v => ("text", v.Value),
            FormatMetadata.Html v => ("html", v.Value),
            FormatMetadata.Ocr v => ("ocr", v.Value),
            FormatMetadata.Csv v => ("csv", v.Value),
            FormatMetadata.Bibtex v => ("bibtex", v.Value),
            FormatMetadata.Citation v => ("citation", v.Value),
            FormatMetadata.FictionBook v => ("fiction_book", v.Value),
            FormatMetadata.Dbf v => ("dbf", v.Value),
            FormatMetadata.Jats v => ("jats", v.Value),
            FormatMetadata.Epub v => ("epub", v.Value),
            FormatMetadata.Pst v => ("pst", v.Value),
            FormatMetadata.Code v => ("code", v.Value),
            _ => throw new JsonException($"Unknown FormatMetadata variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString(DiscriminatorField, variant.Tag);
        if (variant.Inner != null)
        {
            using var doc = JsonSerializer.SerializeToDocument(variant.Inner, variant.Inner.GetType(), options);
            // Only object payloads can be flattened; an inner value that
            // serializes to anything else contributes no properties.
            if (doc.RootElement.ValueKind == JsonValueKind.Object)
            {
                foreach (var prop in doc.RootElement.EnumerateObject())
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
        }
        writer.WriteEndObject();
    }

    /// <summary>Deserializes the wrapped payload into one concrete variant, rejecting a null result.</summary>
    private static TVariant ReadVariant<TVariant>(byte[] wrappedJson, JsonSerializerOptions options)
        where TVariant : FormatMetadata
        => JsonSerializer.Deserialize<TVariant>(wrappedJson, options)
            ?? throw new JsonException("Failed to deserialize variant");
}

View File

@@ -0,0 +1,100 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A block-level element of a Djot document.
///
/// Covers structural elements such as headings, paragraphs, lists, code blocks, etc.
/// </summary>
public sealed record FormattedBlock
{
    /// <summary>
    /// Kind of block element (required).
    /// </summary>
    [JsonPropertyName("block_type")]
    public required BlockType BlockType { get; init; }

    /// <summary>
    /// Heading level (1-6) for headings, or nesting level for lists.
    /// </summary>
    [JsonPropertyName("level")]
    public ulong? Level { get; init; }

    /// <summary>
    /// Inline content contained in the block; an empty list by default.
    /// </summary>
    [JsonPropertyName("inline_content")]
    public List<InlineElement> InlineContent { get; init; } = new();

    /// <summary>
    /// Element attributes (classes, IDs, key-value pairs).
    /// </summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Language identifier, for code blocks.
    /// </summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; }

    /// <summary>
    /// Raw code content, for code blocks.
    /// </summary>
    [JsonPropertyName("code")]
    public string? Code { get; init; }

    /// <summary>
    /// Nested blocks for containers (blockquotes, list items, divs); an empty list by default.
    /// </summary>
    [JsonPropertyName("children")]
    public List<FormattedBlock> Children { get; init; } = new();

    /// <summary>
    /// Deserialize a <see cref="FormattedBlock"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the block.</param>
    /// <returns>The deserialized instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised, or deserialises to null.</exception>
    public static FormattedBlock FromJson(string json)
    {
        FormattedBlock? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<FormattedBlock>(json, JsonOptions);
        }
        // A KreuzbergException propagates untouched; anything else is wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse FormattedBlock from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse FormattedBlock from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are mapped through
    /// snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with non-deserialisable nulls), while explicit false/0
    /// values are kept so caller intent is preserved.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,98 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single grid cell together with its position and span metadata.
/// </summary>
public sealed record GridCell
{
    /// <summary>
    /// Text content of the cell (required).
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// Row position, zero-indexed.
    /// </summary>
    [JsonPropertyName("row")]
    public uint Row { get; init; }

    /// <summary>
    /// Column position, zero-indexed.
    /// </summary>
    [JsonPropertyName("col")]
    public uint Col { get; init; }

    /// <summary>
    /// Number of rows spanned by this cell.
    /// </summary>
    [JsonPropertyName("row_span")]
    public uint RowSpan { get; init; }

    /// <summary>
    /// Number of columns spanned by this cell.
    /// </summary>
    [JsonPropertyName("col_span")]
    public uint ColSpan { get; init; }

    /// <summary>
    /// Whether the cell is a header cell.
    /// </summary>
    [JsonPropertyName("is_header")]
    public bool IsHeader { get; init; }

    /// <summary>
    /// Bounding box of the cell, if available.
    /// </summary>
    [JsonPropertyName("bbox")]
    public BoundingBox? Bbox { get; init; }

    /// <summary>
    /// Deserialize a <see cref="GridCell"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the cell.</param>
    /// <returns>The deserialized instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised, or deserialises to null.</exception>
    public static GridCell FromJson(string json)
    {
        GridCell? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<GridCell>(json, JsonOptions);
        }
        // A KreuzbergException propagates untouched; anything else is wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse GridCell from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse GridCell from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are mapped through
    /// snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with non-deserialisable nulls), while explicit false/0
    /// values are kept so caller intent is preserved.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing a header/heading element.
/// </summary>
public sealed record HeaderMetadata
{
    /// <summary>
    /// Header level, from 1 (h1) through 6 (h6).
    /// </summary>
    [JsonPropertyName("level")]
    public byte Level { get; init; }

    /// <summary>
    /// Normalized text content of the header (required).
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// HTML id attribute, if present.
    /// </summary>
    [JsonPropertyName("id")]
    public string? Id { get; init; }

    /// <summary>
    /// Depth of the header element in the document tree.
    /// </summary>
    [JsonPropertyName("depth")]
    public uint Depth { get; init; }

    /// <summary>
    /// Byte offset of the header in the original HTML document.
    /// </summary>
    [JsonPropertyName("html_offset")]
    public uint HtmlOffset { get; init; }

    /// <summary>
    /// Deserialize a <see cref="HeaderMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the header metadata.</param>
    /// <returns>The deserialized instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised, or deserialises to null.</exception>
    public static HeaderMetadata FromJson(string json)
    {
        HeaderMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HeaderMetadata>(json, JsonOptions);
        }
        // A KreuzbergException propagates untouched; anything else is wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse HeaderMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse HeaderMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are mapped through
    /// snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// omitted (nullable C# members default to null and would otherwise override
    /// required Rust fields with non-deserialisable nulls), while explicit false/0
    /// values are kept so caller intent is preserved.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Heading context for a chunk within a Markdown document.
///
/// Holds the heading hierarchy leading from the document root to the chunk's section.
/// </summary>
public sealed record HeadingContext
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null-valued properties are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are
    /// still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// The heading hierarchy from document root to this chunk's section.
    /// Index 0 is the outermost heading (h1); the last element is the most specific.
    /// </summary>
    [JsonPropertyName("headings")]
    public List<HeadingLevel> Headings { get; init; } = new List<HeadingLevel>();

    /// <summary>
    /// Deserialise a <see cref="HeadingContext"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws, or when the deserializer yields <c>null</c>.
    /// </exception>
    public static HeadingContext FromJson(string json)
    {
        HeadingContext? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HeadingContext>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse HeadingContext from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HeadingContext from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One heading entry within a heading hierarchy.
/// </summary>
public sealed record HeadingLevel
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null-valued properties are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are
    /// still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Heading depth (1 = h1, 2 = h2, etc.).
    /// </summary>
    [JsonPropertyName("level")]
    public byte Level { get; init; }

    /// <summary>
    /// Text content of the heading.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// Deserialise a <see cref="HeadingLevel"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws, or when the deserializer yields <c>null</c>.
    /// </exception>
    public static HeadingLevel FromJson(string json)
    {
        HeadingLevel? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HeadingLevel>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse HeadingLevel from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HeadingLevel from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,94 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A text block with an assigned hierarchy level.
///
/// Represents a block of text together with the semantic heading information
/// extracted from font size clustering and hierarchical analysis.
/// </summary>
public sealed record HierarchicalBlock
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null-valued properties are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are
    /// still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Text content of this block.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// Font size of the text in this block.
    /// </summary>
    [JsonPropertyName("font_size")]
    public float FontSize { get; init; }

    /// <summary>
    /// Hierarchy level of this block (H1-H6 or Body).
    ///
    /// Levels correspond to HTML heading tags:
    /// - "h1": Top-level heading
    /// - "h2": Secondary heading
    /// - "h3": Tertiary heading
    /// - "h4": Quaternary heading
    /// - "h5": Quinary heading
    /// - "h6": Senary heading
    /// - "body": Body text (no heading level)
    /// </summary>
    [JsonPropertyName("level")]
    public required string Level { get; init; }

    /// <summary>
    /// Bounding box information for the block, when available.
    ///
    /// Contains coordinates as (left, top, right, bottom) in PDF units.
    /// </summary>
    [JsonPropertyName("bbox")]
    public List<float>? Bbox { get; init; }

    /// <summary>
    /// Deserialise a <see cref="HierarchicalBlock"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws, or when the deserializer yields <c>null</c>.
    /// </exception>
    public static HierarchicalBlock FromJson(string json)
    {
        HierarchicalBlock? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HierarchicalBlock>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse HierarchicalBlock from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HierarchicalBlock from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,101 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Hierarchy extraction configuration for PDF text structure analysis.
///
/// Enables extraction of document hierarchy levels (H1-H6) based on font size
/// clustering and semantic analysis. When enabled, hierarchical blocks are
/// included in page content.
/// </summary>
public sealed record HierarchyConfig
{
    /// <summary>
    /// Enable hierarchy extraction.
    /// </summary>
    [JsonPropertyName("enabled")]
    public bool Enabled { get; init; } = true;

    // NOTE(review): the summary below states "Default: 6" while the initializer is 3.
    // One of the two is stale; confirm against the native default before relying on either.
    /// <summary>
    /// Number of font size clusters to use for hierarchy levels (1-7)
    ///
    /// Default: 6, which provides H1-H6 heading levels with body text.
    /// Larger values create more fine-grained hierarchy levels.
    /// </summary>
    [JsonPropertyName("k_clusters")]
    public ulong KClusters { get; init; } = 3;

    /// <summary>
    /// Include bounding box information in hierarchy blocks.
    /// </summary>
    [JsonPropertyName("include_bbox")]
    public bool IncludeBbox { get; init; } = true;

    /// <summary>
    /// OCR coverage threshold for smart OCR triggering (0.0-1.0)
    ///
    /// Determines when OCR should be triggered based on text block coverage.
    /// OCR is triggered when text blocks cover less than this fraction of the page.
    /// Default: 0.5 (trigger OCR if less than 50% of page has text)
    /// </summary>
    [JsonPropertyName("ocr_coverage_threshold")]
    public float? OcrCoverageThreshold { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="HierarchyConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static HierarchyConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<HierarchyConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse HierarchyConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse HierarchyConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build a <see cref="HierarchyConfig"/> from the value produced by
    /// <c>NativeMethods.HierarchyConfigDefault</c>, round-tripped through its JSON form.
    /// </summary>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native side yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static HierarchyConfig Default()
    {
        var nativeResult = NativeMethods.HierarchyConfigDefault();
        string? json;
        try
        {
            var jsonPtr = NativeMethods.HierarchyConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config handle on every path.
            NativeMethods.HierarchyConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default HierarchyConfig: native serializer returned a null string");
        }

        return FromJson(json);
    }
}

View File

@@ -0,0 +1,153 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata extracted from HTML documents.
///
/// Covers document-level metadata, Open Graph data, Twitter Card metadata,
/// and extracted structural elements (headers, links, images, structured data).
/// </summary>
public sealed record HtmlMetadata
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Null-valued properties are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are
    /// still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Document title from the `&lt;title&gt;` tag.
    /// </summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>
    /// Document description from the `&lt;meta name="description"&gt;` tag.
    /// </summary>
    [JsonPropertyName("description")]
    public string? Description { get; init; }

    /// <summary>
    /// Document keywords from the `&lt;meta name="keywords"&gt;` tag, split on commas.
    /// </summary>
    [JsonPropertyName("keywords")]
    public List<string> Keywords { get; init; } = new List<string>();

    /// <summary>
    /// Document author from the `&lt;meta name="author"&gt;` tag.
    /// </summary>
    [JsonPropertyName("author")]
    public string? Author { get; init; }

    /// <summary>
    /// Canonical URL from the `&lt;link rel="canonical"&gt;` tag.
    /// </summary>
    [JsonPropertyName("canonical_url")]
    public string? CanonicalUrl { get; init; }

    /// <summary>
    /// Base URL from the `&lt;base href=""&gt;` tag, used for resolving relative URLs.
    /// </summary>
    [JsonPropertyName("base_href")]
    public string? BaseHref { get; init; }

    /// <summary>
    /// Document language from the `lang` attribute.
    /// </summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; }

    /// <summary>
    /// Document text direction from the `dir` attribute.
    /// </summary>
    [JsonConverter(typeof(TextDirectionJsonConverter))]
    [JsonPropertyName("text_direction")]
    public TextDirection? TextDirection { get; init; }

    /// <summary>
    /// Open Graph metadata (og:* properties) for social media.
    /// Keys like "title", "description", "image", "url", etc.
    /// </summary>
    [JsonPropertyName("open_graph")]
    public Dictionary<string, string> OpenGraph { get; init; } = new();

    /// <summary>
    /// Twitter Card metadata (twitter:* properties).
    /// Keys like "card", "site", "creator", "title", "description", "image", etc.
    /// </summary>
    [JsonPropertyName("twitter_card")]
    public Dictionary<string, string> TwitterCard { get; init; } = new();

    /// <summary>
    /// Additional meta tags not covered by specific fields.
    /// Keys are meta name/property attributes, values are content.
    /// </summary>
    [JsonPropertyName("meta_tags")]
    public Dictionary<string, string> MetaTags { get; init; } = new();

    /// <summary>
    /// Extracted header elements with hierarchy.
    /// </summary>
    [JsonPropertyName("headers")]
    public List<HeaderMetadata> Headers { get; init; } = new List<HeaderMetadata>();

    /// <summary>
    /// Extracted hyperlinks with type classification.
    /// </summary>
    [JsonPropertyName("links")]
    public List<LinkMetadata> Links { get; init; } = new List<LinkMetadata>();

    /// <summary>
    /// Extracted images with source and dimensions.
    /// </summary>
    [JsonPropertyName("images")]
    public List<ImageMetadataType> Images { get; init; } = new List<ImageMetadataType>();

    /// <summary>
    /// Extracted structured data blocks.
    /// </summary>
    [JsonPropertyName("structured_data")]
    public List<StructuredData> StructuredData { get; init; } = new List<StructuredData>();

    /// <summary>
    /// Deserialise a <see cref="HtmlMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws, or when the deserializer yields <c>null</c>.
    /// </exception>
    public static HtmlMetadata FromJson(string json)
    {
        HtmlMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HtmlMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse HtmlMetadata from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HtmlMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,110 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Configuration for styled HTML output.
///
/// When set on `ExtractionConfig.html_output` alongside
/// `output_format = OutputFormat.Html`, the pipeline builds a
/// `StyledHtmlRenderer` instead of the plain comrak-based renderer.
/// </summary>
public sealed record HtmlOutputConfig
{
    /// <summary>
    /// Inline CSS string injected into the output after the theme stylesheet.
    /// Concatenated after `css_file` content when both are set.
    /// </summary>
    [JsonPropertyName("css")]
    public string? Css { get; init; } = null;

    /// <summary>
    /// Path to a CSS file loaded once at renderer construction time.
    /// Concatenated before `css` when both are set.
    /// </summary>
    [JsonPropertyName("css_file")]
    public string? CssFile { get; init; } = null;

    /// <summary>
    /// Built-in colour/typography theme. Default: `HtmlTheme.Unstyled`.
    /// </summary>
    [JsonPropertyName("theme")]
    public HtmlTheme Theme { get; init; } = HtmlTheme.Unstyled;

    /// <summary>
    /// CSS class prefix applied to every emitted class name.
    ///
    /// Default: `"kb-"`. Change this if your host application already uses
    /// classes that start with `kb-`.
    /// </summary>
    // The initializer matches the documented default; it was previously "" which
    // contradicted the contract stated above.
    [JsonPropertyName("class_prefix")]
    public string ClassPrefix { get; init; } = "kb-";

    /// <summary>
    /// When `true` (default), write the resolved CSS into a `&lt;style&gt;` block
    /// immediately after the opening `&lt;div class="{prefix}doc"&gt;`.
    ///
    /// Set to `false` to emit only the structural markup and wire up your
    /// own stylesheet targeting the `kb-*` class names.
    /// </summary>
    [JsonPropertyName("embed_css")]
    public bool EmbedCss { get; init; } = true;

    /// <summary>
    /// Parse a <see cref="HtmlOutputConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static HtmlOutputConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<HtmlOutputConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse HtmlOutputConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse HtmlOutputConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build a <see cref="HtmlOutputConfig"/> from the value produced by
    /// <c>NativeMethods.HtmlOutputConfigDefault</c>, round-tripped through its JSON form.
    /// </summary>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native side yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static HtmlOutputConfig Default()
    {
        var nativeResult = NativeMethods.HtmlOutputConfigDefault();
        string? json;
        try
        {
            var jsonPtr = NativeMethods.HtmlOutputConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config handle on every path.
            NativeMethods.HtmlOutputConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default HtmlOutputConfig: native serializer returned a null string");
        }

        return FromJson(json);
    }
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Selects one of the built-in HTML themes.
/// </summary>
[JsonConverter(typeof(HtmlThemeJsonConverter))]
public enum HtmlTheme
{
    /// <summary>
    /// Sensible defaults: system font stack, neutral colours, readable line
    /// measure. All CSS custom properties (`--kb-*`) are defined so user CSS
    /// can override individual values.
    /// </summary>
    [JsonPropertyName("default")]
    Default,

    /// <summary>
    /// Palette and spacing inspired by GitHub Markdown.
    /// </summary>
    [JsonPropertyName("github")]
    GitHub,

    /// <summary>
    /// Dark background with light text.
    /// </summary>
    [JsonPropertyName("dark")]
    Dark,

    /// <summary>
    /// Minimal light theme with generous whitespace.
    /// </summary>
    [JsonPropertyName("light")]
    Light,

    /// <summary>
    /// No built-in stylesheet is emitted. CSS custom properties are still defined
    /// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
    /// </summary>
    [JsonPropertyName("unstyled")]
    Unstyled,
}

/// <summary>
/// JSON converter for <see cref="HtmlTheme"/> that maps each variant to and from
/// its explicit lowercase wire name.
/// </summary>
internal sealed class HtmlThemeJsonConverter : JsonConverter<HtmlTheme>
{
    /// <summary>
    /// Read a JSON string and translate it to the matching <see cref="HtmlTheme"/>.
    /// </summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override HtmlTheme Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var wireName = reader.GetString();
        switch (wireName)
        {
            case "default":
                return HtmlTheme.Default;
            case "github":
                return HtmlTheme.GitHub;
            case "dark":
                return HtmlTheme.Dark;
            case "light":
                return HtmlTheme.Light;
            case "unstyled":
                return HtmlTheme.Unstyled;
            default:
                throw new JsonException($"Unknown HtmlTheme value: {wireName}");
        }
    }

    /// <summary>
    /// Write the wire name of <paramref name="value"/> as a JSON string.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared variant.</exception>
    public override void Write(Utf8JsonWriter writer, HtmlTheme value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case HtmlTheme.Default:
                wireName = "default";
                break;
            case HtmlTheme.GitHub:
                wireName = "github";
                break;
            case HtmlTheme.Dark:
                wireName = "dark";
                break;
            case HtmlTheme.Light:
                wireName = "light";
                break;
            case HtmlTheme.Unstyled:
                wireName = "unstyled";
                break;
            default:
                throw new JsonException($"Unknown HtmlTheme value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,172 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Image extraction configuration.
/// </summary>
public sealed record ImageExtractionConfig
{
    /// <summary>
    /// Extract images from documents.
    /// </summary>
    [JsonPropertyName("extract_images")]
    public bool ExtractImages { get; init; } = true;

    /// <summary>
    /// Target DPI for image normalization.
    /// </summary>
    [JsonPropertyName("target_dpi")]
    public int TargetDpi { get; init; } = 300;

    /// <summary>
    /// Maximum dimension for images (width or height).
    /// </summary>
    [JsonPropertyName("max_image_dimension")]
    public int MaxImageDimension { get; init; } = 4096;

    /// <summary>
    /// Whether to inject image reference placeholders into markdown output.
    /// When `true` (default), image references like `![Image 1](embedded:p1_i0)`
    /// are appended to the markdown. Set to `false` to extract images as data
    /// without polluting the markdown output.
    /// </summary>
    [JsonPropertyName("inject_placeholders")]
    public bool InjectPlaceholders { get; init; } = true;

    /// <summary>
    /// Automatically adjust DPI based on image content.
    /// </summary>
    [JsonPropertyName("auto_adjust_dpi")]
    public bool AutoAdjustDpi { get; init; } = true;

    /// <summary>
    /// Minimum DPI threshold.
    /// </summary>
    [JsonPropertyName("min_dpi")]
    public int MinDpi { get; init; } = 72;

    /// <summary>
    /// Maximum DPI threshold.
    /// </summary>
    [JsonPropertyName("max_dpi")]
    public int MaxDpi { get; init; } = 600;

    /// <summary>
    /// Maximum number of image objects to extract per PDF page.
    ///
    /// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
    /// can trigger extremely long or indefinite extraction times when every image
    /// object on a dense page is decoded individually via the PDF extractor. Setting this
    /// limit causes kreuzberg to stop collecting individual images once the count
    /// per page reaches the cap and emit a warning instead.
    ///
    /// `null` (default) means no limit — all images are extracted.
    /// </summary>
    [JsonPropertyName("max_images_per_page")]
    public uint? MaxImagesPerPage { get; init; } = null;

    /// <summary>
    /// When `true` (default), extracted images are classified by kind and grouped
    /// into clusters where they appear to belong to one figure.
    /// </summary>
    [JsonPropertyName("classify")]
    public bool Classify { get; init; } = true;

    /// <summary>
    /// When `true`, full-page renders produced during OCR preprocessing are captured
    /// and returned as `ImageKind.PageRaster` entries in `ExtractionResult.images`.
    ///
    /// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
    /// document-level OCR bypass is active (whole-document backend). When OCR is
    /// enabled and this flag is set but the active backend skips per-page rendering,
    /// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
    ///
    /// Defaults to `false`. Enable when downstream consumers need page thumbnails
    /// (e.g. citation previews, visual grounding).
    /// </summary>
    [JsonPropertyName("include_page_rasters")]
    public bool IncludePageRasters { get; init; } = false;

    /// <summary>
    /// Run OCR on extracted images and include the recognized text in the document content.
    ///
    /// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
    /// are processed with the configured OCR backend. Set to `false` to extract images
    /// without OCR processing, even when OCR is enabled.
    /// </summary>
    [JsonPropertyName("run_ocr_on_images")]
    public bool RunOcrOnImages { get; init; } = true;

    /// <summary>
    /// When `true`, image OCR results are rendered as plain text without the
    /// `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
    /// is also `true`.
    /// </summary>
    [JsonPropertyName("ocr_text_only")]
    public bool OcrTextOnly { get; init; } = false;

    /// <summary>
    /// When `true` and `ocr_text_only` is `false`, append the OCR text after
    /// the image placeholder in the rendered output.
    /// </summary>
    [JsonPropertyName("append_ocr_text")]
    public bool AppendOcrText { get; init; } = false;

    /// <summary>
    /// Parse a <see cref="ImageExtractionConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImageExtractionConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ImageExtractionConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ImageExtractionConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ImageExtractionConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build an <see cref="ImageExtractionConfig"/> from the value produced by
    /// <c>NativeMethods.ImageExtractionConfigDefault</c>, round-tripped through its JSON form.
    /// </summary>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native side yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static ImageExtractionConfig Default()
    {
        var nativeResult = NativeMethods.ImageExtractionConfigDefault();
        string? json;
        try
        {
            var jsonPtr = NativeMethods.ImageExtractionConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config handle on every path.
            NativeMethods.ImageExtractionConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default ImageExtractionConfig: native serializer returned a null string");
        }

        return FromJson(json);
    }
}

125
packages/csharp/src/Kreuzberg/ImageKind.cs generated Normal file
View File

@@ -0,0 +1,125 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Heuristic classification of what an image most likely depicts.
/// </summary>
[JsonConverter(typeof(ImageKindJsonConverter))]
public enum ImageKind
{
    /// <summary>
    /// Photographic image (natural scene, photograph).
    /// </summary>
    [JsonPropertyName("photograph")]
    Photograph,

    /// <summary>
    /// Technical or schematic diagram.
    /// </summary>
    [JsonPropertyName("diagram")]
    Diagram,

    /// <summary>
    /// Chart, graph, or plot.
    /// </summary>
    [JsonPropertyName("chart")]
    Chart,

    /// <summary>
    /// Freehand or technical drawing.
    /// </summary>
    [JsonPropertyName("drawing")]
    Drawing,

    /// <summary>
    /// Text-heavy image (scanned text, document).
    /// </summary>
    [JsonPropertyName("text_block")]
    TextBlock,

    /// <summary>
    /// Decorative element or border.
    /// </summary>
    [JsonPropertyName("decoration")]
    Decoration,

    /// <summary>
    /// Logo or brand mark.
    /// </summary>
    [JsonPropertyName("logo")]
    Logo,

    /// <summary>
    /// Small icon.
    /// </summary>
    [JsonPropertyName("icon")]
    Icon,

    /// <summary>
    /// Fragment of a larger tiled image (tile of a technical drawing).
    /// </summary>
    [JsonPropertyName("tile_fragment")]
    TileFragment,

    /// <summary>
    /// Mask or transparency map.
    /// </summary>
    [JsonPropertyName("mask")]
    Mask,

    /// <summary>
    /// Full-page render produced during OCR preprocessing; used as a citation thumbnail.
    /// </summary>
    [JsonPropertyName("page_raster")]
    PageRaster,

    /// <summary>
    /// Could not be classified with reasonable confidence.
    /// </summary>
    [JsonPropertyName("unknown")]
    Unknown,
}

/// <summary>
/// JSON converter for <see cref="ImageKind"/> that maps each variant to and from
/// its explicit snake_case wire name.
/// </summary>
internal sealed class ImageKindJsonConverter : JsonConverter<ImageKind>
{
    /// <summary>
    /// Read a JSON string and translate it to the matching <see cref="ImageKind"/>.
    /// </summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override ImageKind Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var wireName = reader.GetString();
        switch (wireName)
        {
            case "photograph":
                return ImageKind.Photograph;
            case "diagram":
                return ImageKind.Diagram;
            case "chart":
                return ImageKind.Chart;
            case "drawing":
                return ImageKind.Drawing;
            case "text_block":
                return ImageKind.TextBlock;
            case "decoration":
                return ImageKind.Decoration;
            case "logo":
                return ImageKind.Logo;
            case "icon":
                return ImageKind.Icon;
            case "tile_fragment":
                return ImageKind.TileFragment;
            case "mask":
                return ImageKind.Mask;
            case "page_raster":
                return ImageKind.PageRaster;
            case "unknown":
                return ImageKind.Unknown;
            default:
                throw new JsonException($"Unknown ImageKind value: {wireName}");
        }
    }

    /// <summary>
    /// Write the wire name of <paramref name="value"/> as a JSON string.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared variant.</exception>
    public override void Write(Utf8JsonWriter writer, ImageKind value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ImageKind.Photograph:
                wireName = "photograph";
                break;
            case ImageKind.Diagram:
                wireName = "diagram";
                break;
            case ImageKind.Chart:
                wireName = "chart";
                break;
            case ImageKind.Drawing:
                wireName = "drawing";
                break;
            case ImageKind.TextBlock:
                wireName = "text_block";
                break;
            case ImageKind.Decoration:
                wireName = "decoration";
                break;
            case ImageKind.Logo:
                wireName = "logo";
                break;
            case ImageKind.Icon:
                wireName = "icon";
                break;
            case ImageKind.TileFragment:
                wireName = "tile_fragment";
                break;
            case ImageKind.Mask:
                wireName = "mask";
                break;
            case ImageKind.PageRaster:
                wireName = "page_raster";
                break;
            case ImageKind.Unknown:
                wireName = "unknown";
                break;
            default:
                throw new JsonException($"Unknown ImageKind value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,82 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata extracted from an image file: pixel dimensions, format name, and EXIF tags.
/// </summary>
public sealed record ImageMetadata
{
    /// <summary>
    /// Width of the image, in pixels.
    /// </summary>
    [JsonPropertyName("width")]
    public uint Width { get; init; }

    /// <summary>
    /// Height of the image, in pixels.
    /// </summary>
    [JsonPropertyName("height")]
    public uint Height { get; init; }

    /// <summary>
    /// Name of the image format (e.g., "PNG", "JPEG", "TIFF"). Defaults to the empty string.
    /// </summary>
    [JsonPropertyName("format")]
    public string Format { get; init; } = string.Empty;

    /// <summary>
    /// EXIF metadata tags as string key/value pairs. Defaults to an empty dictionary.
    /// </summary>
    [JsonPropertyName("exif")]
    public Dictionary<string, string> Exif { get; init; } = new();

    /// <summary>
    /// Deserialises an <see cref="ImageMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the metadata.</param>
    /// <returns>The parsed record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails (the original exception is attached as the inner exception)
    /// or when the deserialiser produces <c>null</c>.
    /// </exception>
    public static ImageMetadata FromJson(string json)
    {
        var parsed = default(ImageMetadata);
        try
        {
            parsed = JsonSerializer.Deserialize<ImageMetadata>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // Anything that is not already a KreuzbergException is wrapped so callers see one error type.
            throw new KreuzbergException($"Failed to parse ImageMetadata from JSON: {cause.Message}", cause);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse ImageMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are dropped
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var configured = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        configured.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return configured;
    }
}

View File

@@ -0,0 +1,93 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing a single image element.
/// </summary>
public sealed record ImageMetadataType
{
    /// <summary>
    /// Source of the image (URL, data URI, or SVG content). Required.
    /// </summary>
    [JsonPropertyName("src")]
    public required string Src { get; init; }

    /// <summary>
    /// Alternative text taken from the alt attribute; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("alt")]
    public string? Alt { get; init; }

    /// <summary>
    /// Value of the title attribute; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>
    /// Image dimensions as (width, height) when available; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("dimensions")]
    public List<uint>? Dimensions { get; init; }

    /// <summary>
    /// Classification of the image type. Required. Converted with
    /// <see cref="ImageTypeJsonConverter"/>, which is attached directly to this property.
    /// </summary>
    [JsonConverter(typeof(ImageTypeJsonConverter))]
    [JsonPropertyName("image_type")]
    public required ImageType ImageType { get; init; }

    /// <summary>
    /// Additional attributes as key-value pairs, each held in an inner list of strings.
    /// Defaults to an empty list.
    /// </summary>
    [JsonPropertyName("attributes")]
    public List<List<string>> Attributes { get; init; } = new();

    /// <summary>
    /// Deserialises an <see cref="ImageMetadataType"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the image element.</param>
    /// <returns>The parsed record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails (the original exception is attached as the inner exception)
    /// or when the deserialiser produces <c>null</c>.
    /// </exception>
    public static ImageMetadataType FromJson(string json)
    {
        var parsed = default(ImageMetadataType);
        try
        {
            parsed = JsonSerializer.Deserialize<ImageMetadataType>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // Anything that is not already a KreuzbergException is wrapped so callers see one error type.
            throw new KreuzbergException($"Failed to parse ImageMetadataType from JSON: {cause.Message}", cause);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse ImageMetadataType from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are dropped
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var configured = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        configured.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return configured;
    }
}

View File

@@ -0,0 +1,112 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Image preprocessing configuration for OCR.
///
/// These settings control how images are preprocessed before OCR to improve
/// text recognition quality. Different preprocessing strategies work better
/// for different document types.
/// </summary>
public sealed record ImagePreprocessingConfig
{
    /// <summary>
    /// Target DPI for the image (300 is standard, 600 for small text). Defaults to 300.
    /// </summary>
    [JsonPropertyName("target_dpi")]
    public int TargetDpi { get; init; } = 300;

    /// <summary>
    /// Auto-detect and correct image rotation. Defaults to <c>true</c>.
    /// </summary>
    [JsonPropertyName("auto_rotate")]
    public bool AutoRotate { get; init; } = true;

    /// <summary>
    /// Correct skew (tilted images). Defaults to <c>true</c>.
    /// </summary>
    [JsonPropertyName("deskew")]
    public bool Deskew { get; init; } = true;

    /// <summary>
    /// Remove noise from the image. Defaults to <c>false</c>.
    /// </summary>
    [JsonPropertyName("denoise")]
    public bool Denoise { get; init; } = false;

    /// <summary>
    /// Enhance contrast for better text visibility. Defaults to <c>false</c>.
    /// </summary>
    [JsonPropertyName("contrast_enhance")]
    public bool ContrastEnhance { get; init; } = false;

    /// <summary>
    /// Binarization method: "otsu", "sauvola", "adaptive". Defaults to "otsu".
    /// </summary>
    [JsonPropertyName("binarization_method")]
    public string BinarizationMethod { get; init; } = "otsu";

    /// <summary>
    /// Invert colors (white text on black → black on white). Defaults to <c>false</c>.
    /// </summary>
    [JsonPropertyName("invert_colors")]
    public bool InvertColors { get; init; } = false;

    /// <summary>
    /// Parse a <see cref="ImagePreprocessingConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON document describing the configuration.</param>
    /// <returns>The parsed configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImagePreprocessingConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ImagePreprocessingConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ImagePreprocessingConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ImagePreprocessingConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when reading JSON; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Builds the default configuration by asking the native library for its default instance,
    /// converting that instance to JSON, and deserialising the JSON into a managed record.
    /// </summary>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON text, or the JSON deserialises to <c>null</c>.
    /// </exception>
    public static ImagePreprocessingConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.ImagePreprocessingConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ImagePreprocessingConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config handle on every path, including failures above.
            NativeMethods.ImagePreprocessingConfigFree(nativeResult);
        }

        // Previously a missing string was turned into the JSON literal "null" and the resulting
        // null object was returned through a null-forgiving operator; fail loudly instead.
        if (json is null)
        {
            throw new KreuzbergException("Failed to load default ImagePreprocessingConfig: native library returned no JSON");
        }

        return JsonSerializer.Deserialize<ImagePreprocessingConfig>(json, JsonOptions)
            ?? throw new KreuzbergException("Failed to load default ImagePreprocessingConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,131 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata recorded about image preprocessing.
///
/// Describes the transformations applied to an image during OCR preprocessing,
/// including DPI normalization, resizing, and resampling.
/// </summary>
public sealed record ImagePreprocessingMetadata
{
    /// <summary>
    /// Original image dimensions (width, height) in pixels. Defaults to an empty list.
    /// </summary>
    [JsonPropertyName("original_dimensions")]
    public List<ulong> OriginalDimensions { get; init; } = new();

    /// <summary>
    /// Original image DPI (horizontal, vertical). Defaults to an empty list.
    /// </summary>
    [JsonPropertyName("original_dpi")]
    public List<double> OriginalDpi { get; init; } = new();

    /// <summary>
    /// Target DPI taken from the configuration.
    /// </summary>
    [JsonPropertyName("target_dpi")]
    public int TargetDpi { get; init; }

    /// <summary>
    /// Scaling factor that was applied to the image.
    /// </summary>
    [JsonPropertyName("scale_factor")]
    public double ScaleFactor { get; init; }

    /// <summary>
    /// Whether the DPI was auto-adjusted based on content.
    /// </summary>
    [JsonPropertyName("auto_adjusted")]
    public bool AutoAdjusted { get; init; }

    /// <summary>
    /// Final DPI after processing.
    /// </summary>
    [JsonPropertyName("final_dpi")]
    public int FinalDpi { get; init; }

    /// <summary>
    /// New dimensions after resizing (if resized); <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("new_dimensions")]
    public List<ulong>? NewDimensions { get; init; }

    /// <summary>
    /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.). Required.
    /// </summary>
    [JsonPropertyName("resample_method")]
    public required string ResampleMethod { get; init; }

    /// <summary>
    /// Whether the dimensions were clamped to max_image_dimension.
    /// </summary>
    [JsonPropertyName("dimension_clamped")]
    public bool DimensionClamped { get; init; }

    /// <summary>
    /// Calculated optimal DPI (if auto_adjust_dpi enabled); <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("calculated_dpi")]
    public int? CalculatedDpi { get; init; }

    /// <summary>
    /// Whether the resize was skipped (dimensions already optimal).
    /// </summary>
    [JsonPropertyName("skipped_resize")]
    public bool SkippedResize { get; init; }

    /// <summary>
    /// Error message if the resize failed; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("resize_error")]
    public string? ResizeError { get; init; }

    /// <summary>
    /// Deserialises an <see cref="ImagePreprocessingMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the preprocessing metadata.</param>
    /// <returns>The parsed record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails (the original exception is attached as the inner exception)
    /// or when the deserialiser produces <c>null</c>.
    /// </exception>
    public static ImagePreprocessingMetadata FromJson(string json)
    {
        var parsed = default(ImagePreprocessingMetadata);
        try
        {
            parsed = JsonSerializer.Deserialize<ImagePreprocessingMetadata>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // Anything that is not already a KreuzbergException is wrapped so callers see one error type.
            throw new KreuzbergException($"Failed to parse ImagePreprocessingMetadata from JSON: {cause.Message}", cause);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse ImagePreprocessingMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are dropped
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var configured = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        configured.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return configured;
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception type for image-processing errors; derives from <see cref="KreuzbergErrorException"/>.
/// </summary>
public class ImageProcessingException : KreuzbergErrorException
{
    /// <summary>Creates the exception with a descriptive message.</summary>
    /// <param name="message">Text describing the failure.</param>
    public ImageProcessingException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a descriptive message and the exception that caused it.</summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="innerException">Underlying exception that triggered this one.</param>
    public ImageProcessingException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,69 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Classification of how an image is referenced.
/// </summary>
[JsonConverter(typeof(ImageTypeJsonConverter))]
public enum ImageType
{
    /// <summary>
    /// Image supplied as a data URI; wire name <c>data-uri</c>.
    /// </summary>
    [JsonPropertyName("data-uri")]
    DataUri,

    /// <summary>
    /// Inline SVG; wire name <c>inline-svg</c>.
    /// </summary>
    [JsonPropertyName("inline-svg")]
    InlineSvg,

    /// <summary>
    /// External image URL; wire name <c>external</c>.
    /// </summary>
    [JsonPropertyName("external")]
    External,

    /// <summary>
    /// Image referenced by a relative path; wire name <c>relative</c>.
    /// </summary>
    [JsonPropertyName("relative")]
    Relative,
}

/// <summary>
/// JSON converter for <see cref="ImageType"/> that maps every member to its explicit
/// wire name (for example <c>data-uri</c>) instead of relying on a naming policy.
/// </summary>
internal sealed class ImageTypeJsonConverter : JsonConverter<ImageType>
{
    /// <summary>
    /// Reads the current token as a string and translates it to the matching <see cref="ImageType"/>.
    /// </summary>
    /// <exception cref="JsonException">When the string is not one of the known wire names.</exception>
    public override ImageType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
        => reader.GetString() switch
        {
            "data-uri" => ImageType.DataUri,
            "inline-svg" => ImageType.InlineSvg,
            "external" => ImageType.External,
            "relative" => ImageType.Relative,
            var unrecognised => throw new JsonException($"Unknown ImageType value: {unrecognised}"),
        };

    /// <summary>
    /// Writes <paramref name="value"/> as a JSON string using its wire name.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ImageType value, JsonSerializerOptions options)
        => writer.WriteStringValue(WireNameOf(value));

    /// <summary>Returns the wire name for a declared <see cref="ImageType"/> member.</summary>
    private static string WireNameOf(ImageType kind)
    {
        switch (kind)
        {
            case ImageType.DataUri: return "data-uri";
            case ImageType.InlineSvg: return "inline-svg";
            case ImageType.External: return "external";
            case ImageType.Relative: return "relative";
            default: throw new JsonException($"Unknown ImageType value: {kind}");
        }
    }
}

View File

@@ -0,0 +1,82 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// An inline element inside a block.
///
/// Represents text with formatting, links, images, etc.
/// </summary>
public sealed record InlineElement
{
    /// <summary>
    /// Kind of inline element. Required.
    /// </summary>
    [JsonPropertyName("element_type")]
    public required InlineType ElementType { get; init; }

    /// <summary>
    /// Text content of the element. Required.
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// Element attributes; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Additional metadata (e.g., href for links, src/alt for images); <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("metadata")]
    public Dictionary<string, string>? Metadata { get; init; }

    /// <summary>
    /// Deserialises an <see cref="InlineElement"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the inline element.</param>
    /// <returns>The parsed record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails (the original exception is attached as the inner exception)
    /// or when the deserialiser produces <c>null</c>.
    /// </exception>
    public static InlineElement FromJson(string json)
    {
        var parsed = default(InlineElement);
        try
        {
            parsed = JsonSerializer.Deserialize<InlineElement>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // Anything that is not already a KreuzbergException is wrapped so callers see one error type.
            throw new KreuzbergException($"Failed to parse InlineElement from JSON: {cause.Message}", cause);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse InlineElement from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are dropped
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var configured = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        configured.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return configured;
    }
}

View File

@@ -0,0 +1,105 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Kinds of inline elements in Djot.
/// </summary>
[JsonConverter(typeof(InlineTypeJsonConverter))]
public enum InlineType
{
    /// <summary>Wire name <c>text</c>.</summary>
    [JsonPropertyName("text")]
    Text,

    /// <summary>Wire name <c>strong</c>.</summary>
    [JsonPropertyName("strong")]
    Strong,

    /// <summary>Wire name <c>emphasis</c>.</summary>
    [JsonPropertyName("emphasis")]
    Emphasis,

    /// <summary>Wire name <c>highlight</c>.</summary>
    [JsonPropertyName("highlight")]
    Highlight,

    /// <summary>Wire name <c>subscript</c>.</summary>
    [JsonPropertyName("subscript")]
    Subscript,

    /// <summary>Wire name <c>superscript</c>.</summary>
    [JsonPropertyName("superscript")]
    Superscript,

    /// <summary>Wire name <c>insert</c>.</summary>
    [JsonPropertyName("insert")]
    Insert,

    /// <summary>Wire name <c>delete</c>.</summary>
    [JsonPropertyName("delete")]
    Delete,

    /// <summary>Wire name <c>code</c>.</summary>
    [JsonPropertyName("code")]
    Code,

    /// <summary>Wire name <c>link</c>.</summary>
    [JsonPropertyName("link")]
    Link,

    /// <summary>Wire name <c>image</c>.</summary>
    [JsonPropertyName("image")]
    Image,

    /// <summary>Wire name <c>span</c>.</summary>
    [JsonPropertyName("span")]
    Span,

    /// <summary>Wire name <c>math</c>.</summary>
    [JsonPropertyName("math")]
    Math,

    /// <summary>Wire name <c>raw_inline</c>.</summary>
    [JsonPropertyName("raw_inline")]
    RawInline,

    /// <summary>Wire name <c>footnote_ref</c>.</summary>
    [JsonPropertyName("footnote_ref")]
    FootnoteRef,

    /// <summary>Wire name <c>symbol</c>.</summary>
    [JsonPropertyName("symbol")]
    Symbol,
}

/// <summary>
/// JSON converter for <see cref="InlineType"/> that maps every member to its explicit
/// wire name (for example <c>raw_inline</c>) instead of relying on a naming policy.
/// </summary>
internal sealed class InlineTypeJsonConverter : JsonConverter<InlineType>
{
    /// <summary>
    /// Reads the current token as a string and translates it to the matching <see cref="InlineType"/>.
    /// </summary>
    /// <exception cref="JsonException">When the string is not one of the known wire names.</exception>
    public override InlineType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
        => reader.GetString() switch
        {
            "text" => InlineType.Text,
            "strong" => InlineType.Strong,
            "emphasis" => InlineType.Emphasis,
            "highlight" => InlineType.Highlight,
            "subscript" => InlineType.Subscript,
            "superscript" => InlineType.Superscript,
            "insert" => InlineType.Insert,
            "delete" => InlineType.Delete,
            "code" => InlineType.Code,
            "link" => InlineType.Link,
            "image" => InlineType.Image,
            "span" => InlineType.Span,
            "math" => InlineType.Math,
            "raw_inline" => InlineType.RawInline,
            "footnote_ref" => InlineType.FootnoteRef,
            "symbol" => InlineType.Symbol,
            var unrecognised => throw new JsonException($"Unknown InlineType value: {unrecognised}"),
        };

    /// <summary>
    /// Writes <paramref name="value"/> as a JSON string using its wire name.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, InlineType value, JsonSerializerOptions options)
        => writer.WriteStringValue(WireNameOf(value));

    /// <summary>Returns the wire name for a declared <see cref="InlineType"/> member.</summary>
    private static string WireNameOf(InlineType kind)
    {
        switch (kind)
        {
            case InlineType.Text: return "text";
            case InlineType.Strong: return "strong";
            case InlineType.Emphasis: return "emphasis";
            case InlineType.Highlight: return "highlight";
            case InlineType.Subscript: return "subscript";
            case InlineType.Superscript: return "superscript";
            case InlineType.Insert: return "insert";
            case InlineType.Delete: return "delete";
            case InlineType.Code: return "code";
            case InlineType.Link: return "link";
            case InlineType.Image: return "image";
            case InlineType.Span: return "span";
            case InlineType.Math: return "math";
            case InlineType.RawInline: return "raw_inline";
            case InlineType.FootnoteRef: return "footnote_ref";
            case InlineType.Symbol: return "symbol";
            default: throw new JsonException($"Unknown InlineType value: {kind}");
        }
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception type for I/O errors; derives from <see cref="KreuzbergErrorException"/>.
/// </summary>
public class IoException : KreuzbergErrorException
{
    /// <summary>Creates the exception with a descriptive message.</summary>
    /// <param name="message">Text describing the failure.</param>
    public IoException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a descriptive message and the exception that caused it.</summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="innerException">Underlying exception that triggered this one.</param>
    public IoException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata for JATS (Journal Article Tag Suite) documents.
/// </summary>
public sealed record JatsMetadata
{
    /// <summary>
    /// Value of the <c>copyright</c> field; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("copyright")]
    public string? Copyright { get; init; }

    /// <summary>
    /// Value of the <c>license</c> field; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("license")]
    public string? License { get; init; }

    /// <summary>
    /// String key/value pairs from the <c>history_dates</c> field. Defaults to an empty dictionary.
    /// </summary>
    [JsonPropertyName("history_dates")]
    public Dictionary<string, string> HistoryDates { get; init; } = new();

    /// <summary>
    /// Entries from the <c>contributor_roles</c> field. Defaults to an empty list.
    /// </summary>
    [JsonPropertyName("contributor_roles")]
    public List<ContributorRole> ContributorRoles { get; init; } = new();

    /// <summary>
    /// Deserialises a <see cref="JatsMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the JATS metadata.</param>
    /// <returns>The parsed record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails (the original exception is attached as the inner exception)
    /// or when the deserialiser produces <c>null</c>.
    /// </exception>
    public static JatsMetadata FromJson(string json)
    {
        var parsed = default(JatsMetadata);
        try
        {
            parsed = JsonSerializer.Deserialize<JatsMetadata>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // Anything that is not already a KreuzbergException is wrapped so callers see one error type.
            throw new KreuzbergException($"Failed to parse JatsMetadata from JSON: {cause.Message}", cause);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse JatsMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are dropped
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var configured = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        configured.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return configured;
    }
}

80
packages/csharp/src/Kreuzberg/Keyword.cs generated Normal file
View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A keyword produced by extraction, together with its metadata.
/// </summary>
public sealed record Keyword
{
    /// <summary>
    /// Text of the keyword. Required.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// Relevance score (higher is better, algorithm-specific range).
    /// </summary>
    [JsonPropertyName("score")]
    public float Score { get; init; }

    /// <summary>
    /// Algorithm that extracted this keyword. Required.
    /// </summary>
    [JsonPropertyName("algorithm")]
    public required KeywordAlgorithm Algorithm { get; init; }

    /// <summary>
    /// Optional positions where the keyword appears in the text (character offsets);
    /// <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("positions")]
    public List<ulong>? Positions { get; init; }

    /// <summary>
    /// Deserialises a <see cref="Keyword"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the keyword.</param>
    /// <returns>The parsed record; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails (the original exception is attached as the inner exception)
    /// or when the deserialiser produces <c>null</c>.
    /// </exception>
    public static Keyword FromJson(string json)
    {
        var parsed = default(Keyword);
        try
        {
            parsed = JsonSerializer.Deserialize<Keyword>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // Anything that is not already a KreuzbergException is wrapped so callers see one error type.
            throw new KreuzbergException($"Failed to parse Keyword from JSON: {cause.Message}", cause);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse Keyword from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are dropped
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Creates serializer options with the given ignore condition and the snake_case enum converter.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var configured = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        configured.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return configured;
    }
}

View File

@@ -0,0 +1,55 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Selects the keyword extraction algorithm.
/// </summary>
[JsonConverter(typeof(KeywordAlgorithmJsonConverter))]
public enum KeywordAlgorithm
{
    /// <summary>
    /// YAKE (Yet Another Keyword Extractor) - statistical approach; wire name <c>yake</c>.
    /// </summary>
    [JsonPropertyName("yake")]
    Yake,

    /// <summary>
    /// RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based; wire name <c>rake</c>.
    /// </summary>
    [JsonPropertyName("rake")]
    Rake,
}

/// <summary>
/// JSON converter for <see cref="KeywordAlgorithm"/> that maps every member to its explicit
/// wire name instead of relying on a naming policy.
/// </summary>
internal sealed class KeywordAlgorithmJsonConverter : JsonConverter<KeywordAlgorithm>
{
    /// <summary>
    /// Reads the current token as a string and translates it to the matching <see cref="KeywordAlgorithm"/>.
    /// </summary>
    /// <exception cref="JsonException">When the string is not one of the known wire names.</exception>
    public override KeywordAlgorithm Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
        => reader.GetString() switch
        {
            "yake" => KeywordAlgorithm.Yake,
            "rake" => KeywordAlgorithm.Rake,
            var unrecognised => throw new JsonException($"Unknown KeywordAlgorithm value: {unrecognised}"),
        };

    /// <summary>
    /// Writes <paramref name="value"/> as a JSON string using its wire name.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, KeywordAlgorithm value, JsonSerializerOptions options)
        => writer.WriteStringValue(WireNameOf(value));

    /// <summary>Returns the wire name for a declared <see cref="KeywordAlgorithm"/> member.</summary>
    private static string WireNameOf(KeywordAlgorithm algorithm)
    {
        switch (algorithm)
        {
            case KeywordAlgorithm.Yake: return "yake";
            case KeywordAlgorithm.Rake: return "rake";
            default: throw new JsonException($"Unknown KeywordAlgorithm value: {algorithm}");
        }
    }
}

View File

@@ -0,0 +1,117 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Keyword extraction configuration.
/// </summary>
public sealed record KeywordConfig
{
    /// <summary>
    /// Algorithm to use for extraction; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("algorithm")]
    public KeywordAlgorithm? Algorithm { get; init; } = null;

    /// <summary>
    /// Maximum number of keywords to extract (default: 10).
    /// </summary>
    [JsonPropertyName("max_keywords")]
    public ulong MaxKeywords { get; init; } = 10;

    /// <summary>
    /// Minimum score threshold (0.0-1.0, default: 0.0).
    ///
    /// Keywords with scores below this threshold are filtered out.
    /// Note: Score ranges differ between algorithms.
    /// </summary>
    [JsonPropertyName("min_score")]
    public float MinScore { get; init; } = 0.0f;

    /// <summary>
    /// N-gram range for keyword extraction (min, max).
    ///
    /// (1, 1) = unigrams only
    /// (1, 2) = unigrams and bigrams
    /// (1, 3) = unigrams, bigrams, and trigrams (default)
    ///
    /// This property itself is <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("ngram_range")]
    public List<ulong>? NgramRange { get; init; } = null;

    /// <summary>
    /// Language code for stopword filtering (e.g., "en", "de", "fr").
    ///
    /// If null, no stopword filtering is applied.
    /// </summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; } = null;

    /// <summary>
    /// YAKE-specific tuning parameters; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("yake_params")]
    public YakeParams? YakeParams { get; init; } = null;

    /// <summary>
    /// RAKE-specific tuning parameters; <c>null</c> unless set.
    /// </summary>
    [JsonPropertyName("rake_params")]
    public RakeParams? RakeParams { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="KeywordConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON document describing the configuration.</param>
    /// <returns>The parsed configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static KeywordConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<KeywordConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse KeywordConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse KeywordConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when reading JSON; enums are converted as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Builds the default configuration by asking the native library for its default instance,
    /// converting that instance to JSON, and deserialising the JSON into a managed record.
    /// </summary>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON text, or the JSON deserialises to <c>null</c>.
    /// </exception>
    public static KeywordConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.KeywordConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.KeywordConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config handle on every path, including failures above.
            NativeMethods.KeywordConfigFree(nativeResult);
        }

        // Previously a missing string was turned into the JSON literal "null" and the resulting
        // null object was returned through a null-forgiving operator; fail loudly instead.
        if (json is null)
        {
            throw new KreuzbergException("Failed to load default KeywordConfig: native library returned no JSON");
        }

        return JsonSerializer.Deserialize<KeywordConfig>(json, JsonOptions)
            ?? throw new KreuzbergException("Failed to load default KeywordConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,35 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Main error type for all Kreuzberg operations.
/// </summary>
/// <remarks>
/// <para>
/// All errors in Kreuzberg use this type, which preserves error chains
/// and provides context for debugging.
/// </para>
/// <para>Variants:</para>
/// <list type="bullet">
/// <item><description><c>Io</c> - File system and I/O errors (always bubble up)</description></item>
/// <item><description><c>Parsing</c> - Document parsing errors (corrupt files, unsupported features)</description></item>
/// <item><description><c>Ocr</c> - OCR processing errors</description></item>
/// <item><description><c>Validation</c> - Input validation errors (invalid paths, config, parameters)</description></item>
/// <item><description><c>Cache</c> - Cache operation errors (non-fatal, can be ignored)</description></item>
/// <item><description><c>ImageProcessing</c> - Image manipulation errors</description></item>
/// <item><description><c>Serialization</c> - JSON/MessagePack serialization errors</description></item>
/// <item><description><c>MissingDependency</c> - Missing optional dependencies (tesseract, etc.)</description></item>
/// <item><description><c>Plugin</c> - Plugin-specific errors</description></item>
/// <item><description><c>LockPoisoned</c> - Mutex/RwLock poisoning (should not happen in normal operation)</description></item>
/// <item><description><c>UnsupportedFormat</c> - Unsupported MIME type or file format</description></item>
/// <item><description><c>Other</c> - Catch-all for uncommon errors</description></item>
/// </list>
/// </remarks>
public class KreuzbergErrorException : KreuzbergException
{
    /// <summary>Creates the exception with a descriptive message.</summary>
    /// <param name="message">Text describing the failure.</param>
    public KreuzbergErrorException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a descriptive message and the exception that caused it.</summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="innerException">Underlying exception that triggered this one.</param>
    public KreuzbergErrorException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

Some files were not shown because too many files have changed in this diff Show More