Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,10 @@
# Top-most EditorConfig file: editors stop looking in parent directories here.
root = true
# Formatting rules applied to C# source files only.
[*.cs]
indent_style = space
indent_size = 4
max_line_length = 120
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

View File

@@ -0,0 +1,9 @@
<!-- auto-generated by alef (generate_bindings) -->
<!-- alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 -->
<Project>
<!-- Compiler settings: nullable reference types enabled, newest C# language
     version the toolchain supports, and every warning fails the build. -->
<PropertyGroup>
<Nullable>enable</Nullable>
<LangVersion>latest</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
</Project>

View File

@@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- NuGet package identity and compiler settings for the Kreuzberg .NET binding. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<RootNamespace>Kreuzberg</RootNamespace>
<PackageId>Kreuzberg</PackageId>
<Version>5.0.0-rc.3</Version>
<Description>High-performance document intelligence library</Description>
<PackageLicenseFile>LICENSE</PackageLicenseFile>
<RepositoryUrl>https://github.com/kreuzberg-dev/kreuzberg</RepositoryUrl>
<Authors>Na&apos;aman Hirschfeld &lt;naaman@kreuzberg.dev&gt;</Authors>
<!-- Permits `unsafe` code in the compiled sources. NOTE(review): presumably needed
     by the native interop layer; confirm against the files under ../src. -->
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<Nullable>enable</Nullable>
<!-- The SDK does not generate assembly-level attributes for this project. -->
<GenerateAssemblyInfo>false</GenerateAssemblyInfo>
</PropertyGroup>
<!-- Files packed into the NuGet package: the LICENSE three directories up (the file
     named by PackageLicenseFile) at the package root, and everything under runtimes/,
     which is also copied to the build output. -->
<ItemGroup>
<None Include="../../../LICENSE" Pack="true" PackagePath="/" />
<None Include="runtimes/**" Pack="true" PackagePath="runtimes/" CopyToOutputDirectory="PreserveNewest" />
</ItemGroup>
<!-- The C# sources live outside the project directory, so they are included explicitly. -->
<ItemGroup>
<Compile Include="../src/**/*.cs" />
</ItemGroup>
</Project>

93
packages/csharp/LICENSE Normal file
View File

@@ -0,0 +1,93 @@
Elastic License 2.0 (ELv2)
Copyright 2025-2026 Kreuzberg, Inc.
Acceptance
By using the software, you agree to all of the terms and conditions below.
Copyright License
The licensor grants you a non-exclusive, royalty-free, worldwide,
non-sublicensable, non-transferable license to use, copy, distribute, make
available, and prepare derivative works of the software, in each case subject to
the limitations and conditions below.
Limitations
You may not provide the software to third parties as a hosted or managed
service, where the service provides users with access to any substantial set of
the features or functionality of the software.
You may not move, change, disable, or circumvent the license key functionality
in the software, and you may not remove or obscure any functionality in the
software that is protected by the license key.
You may not alter, remove, or obscure any licensing, copyright, or other notices
of the licensor in the software. Any use of the licensor's trademarks is subject
to applicable law.
Patents
The licensor grants you a license, under any patent claims the licensor can
license, or becomes able to license, to make, have made, use, sell, offer for
sale, import and have imported the software, in each case subject to the
limitations and conditions in this license. This license does not cover any
patent claims that you cause to be infringed by modifications or additions to the
software. If you or your company make any written claim that the software
infringes or contributes to infringement of any patent, your patent license for
the software granted under these terms ends immediately. If your company makes
such a claim, your patent license ends immediately for work on behalf of your
company.
Notices
You must ensure that anyone who gets a copy of any part of the software from you
also gets a copy of these terms.
If you modify the software, you must include in any modified copies of the
software prominent notices stating that you have modified the software.
No Other Rights
These terms do not imply any licenses other than those expressly granted in
these terms.
Termination
If you use the software in violation of these terms, such use is not licensed,
and your licenses will automatically terminate. If the licensor provides you with
a notice of your violation, and you cease all violation of this license no later
than 30 days after you receive that notice, your licenses will be reinstated
retroactively. However, if you violate these terms after such reinstatement, any
additional violation of these terms will cause your licenses to terminate
automatically and permanently.
No Liability
As far as the law allows, the software comes as is, without any warranty or
condition, and the licensor will not be liable to you for any damages arising out
of these terms or the use or nature of the software, under any kind of legal
claim.
Definitions
The licensor is the entity offering these terms, and the software is the
software the licensor makes available under these terms, including any portion
of it.
you refers to the individual or entity agreeing to these terms.
your company is any legal entity, sole proprietorship, or other kind of
organization that you work for, plus all organizations that have control over,
are under the control of, or are under common control with that organization.
control means ownership of substantially all the assets of an entity, or the
power to direct its management and policies by vote, contract, or otherwise.
Control can be direct or indirect.
your licenses are all the licenses granted to you for the software under these
terms.
use means anything you do with the software requiring one of your licenses.
trademark means trademarks, service marks, and similar rights.

518
packages/csharp/README.md Normal file
View File

@@ -0,0 +1,518 @@
# C#
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
<a href="https://github.com/kreuzberg-dev/alef">
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
</a>
<!-- Language Bindings -->
<a href="https://crates.io/crates/kreuzberg">
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
</a>
<a href="https://pypi.org/project/kreuzberg/">
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/node">
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
</a>
<a href="https://www.nuget.org/packages/Kreuzberg/">
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
</a>
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
</a>
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://hex.pm/packages/kreuzberg">
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
</a>
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
</a>
<a href="https://pub.dev/packages/kreuzberg">
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
</a>
<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
</a>
<a href="https://docs.kreuzberg.dev">
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
</a>
<a href="https://huggingface.co/Kreuzberg">
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
</a>
</div>
<div align="center" style="margin: 24px 0 0;">
<a href="https://kreuzberg.dev">
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
</a>
</div>
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
<a href="https://discord.gg/xt9WY3GnKR">
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
</a>
<a href="https://docs.kreuzberg.dev/demo.html">
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
</a>
</div>
Extract text, tables, images, and metadata from 90+ file formats and 300+ programming languages including PDF, Office documents, and images. .NET bindings with full type safety, async/await support, and .NET 10.0+ compatibility.
## What This Package Provides
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
- **.NET package** — async/await API with nullable-aware result types.
## Installation
### Package Installation
Install via NuGet:
```bash
dotnet add package Kreuzberg
```
Or via NuGet Package Manager:
```
Install-Package Kreuzberg
```
### System Requirements
- **.NET 10.0+** required
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
## Quick Start
### Basic Extraction
Extract text, metadata, and structure from any supported document format:
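```cs
using Kreuzberg;

var result = await KreuzbergLib.ExtractFileAsync("document.pdf");
Console.WriteLine(result.Content);
Console.WriteLine($"MIME type: {result.MimeType}");
```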
### Common Use Cases
#### Extract with Custom Configuration
Most use cases benefit from configuration to control extraction behavior:
**With OCR (for scanned documents):**
```cs
using Kreuzberg;
var config = new ExtractionConfig
{
Ocr = new OcrConfig
{
Backend = "tesseract",
Language = "eng+deu+fra",
TesseractConfig = new TesseractConfig
{
Psm = 3
}
}
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
Console.WriteLine(result.Content);
```
#### Table Extraction
See [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for table extraction options.
#### Processing Multiple Files
```cs
using Kreuzberg;
using System.Collections.Generic;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
UseCache = true,
EnableQualityProcessing = true
};
var filePaths = new[]
{
"document1.pdf",
"document2.pdf",
"document3.pdf"
};
try
{
var batchResults = new List<ExtractionResult>();
foreach (var filePath in filePaths)
{
var result = await KreuzbergLib.ExtractFileAsync(filePath, config);
batchResults.Add(result);
Console.WriteLine($"Processed {filePath}: {result.Content.Length} chars");
}
var tasks = filePaths.Select(path =>
KreuzbergLib.ExtractFileAsync(path, config)
).ToArray();
var results = await Task.WhenAll(tasks);
var totalChars = results.Sum(r => r.Content.Length);
Console.WriteLine($"Total extracted: {totalChars} characters");
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Batch processing error: {ex.Message}");
}
}
}
```
#### Async Processing
For non-blocking document processing:
```cs
using Kreuzberg;
class Program
{
static async Task Main()
{
try
{
var result = await KreuzbergLib.ExtractFileAsync("document.pdf");
Console.WriteLine($"Content length: {result.Content.Length}");
Console.WriteLine($"MIME type: {result.MimeType}");
var tasks = new[]
{
KreuzbergLib.ExtractFileAsync("file1.pdf"),
KreuzbergLib.ExtractFileAsync("file2.pdf"),
KreuzbergLib.ExtractFileAsync("file3.pdf")
};
var results = await Task.WhenAll(tasks);
foreach (var r in results)
{
Console.WriteLine($"Extracted {r.Content.Length} characters");
}
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Extraction failed: {ex.Message}");
}
}
}
```
### Next Steps
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-csharp/)** - Complete API reference
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
## Features
### Supported File Formats (90+)
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
#### Office Documents
| Category | Formats | Capabilities |
|----------|---------|--------------|
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
| **Database** | `.dbf` | Table data extraction, field type support |
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
#### Images (OCR-Enabled)
| Category | Formats | Features |
|----------|---------|----------|
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
#### Web & Data
| Category | Formats | Features |
|----------|---------|----------|
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
#### Email & Archives
| Category | Formats | Features |
|----------|---------|----------|
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
#### Academic & Scientific
| Category | Formats | Features |
|----------|---------|----------|
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
#### Code Intelligence (300+ Languages)
| Feature | Description |
|---------|-------------|
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
| **Symbol Extraction** | Variables, constants, type aliases, properties |
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
| **Diagnostics** | Parse errors with line/column positions |
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
### Key Capabilities
- **Text Extraction** - Extract all text content with position and formatting information
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
- **Table Extraction** - Parse tables with structure and cell content preservation
- **Image Extraction** - Extract embedded images and render page previews
- **OCR Support** - Integrate multiple OCR backends for scanned documents
- **Async/Await** - Non-blocking document processing with concurrent operations
- **Plugin System** - Extensible post-processing for custom text transformation
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
- **Batch Processing** - Efficiently process multiple documents in parallel
- **Memory Efficient** - Stream large files without loading entirely into memory
- **Language Detection** - Detect and support multiple languages in documents
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
- **Configuration** - Fine-grained control over extraction behavior
### Performance Characteristics
| Format | Speed | Memory | Notes |
|--------|-------|--------|-------|
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
## OCR Support
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
- **Tesseract**
- **PaddleOCR**
### OCR Configuration Example
```cs
using Kreuzberg;
var config = new ExtractionConfig
{
Ocr = new OcrConfig
{
Backend = "tesseract",
Language = "eng+deu+fra",
TesseractConfig = new TesseractConfig
{
Psm = 3
}
}
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
Console.WriteLine(result.Content);
```
## Async Support
This binding provides full async/await support for non-blocking document processing:
```cs
using Kreuzberg;
class Program
{
static async Task Main()
{
try
{
var result = await KreuzbergLib.ExtractFileAsync("document.pdf");
Console.WriteLine($"Content length: {result.Content.Length}");
Console.WriteLine($"MIME type: {result.MimeType}");
var tasks = new[]
{
KreuzbergLib.ExtractFileAsync("file1.pdf"),
KreuzbergLib.ExtractFileAsync("file2.pdf"),
KreuzbergLib.ExtractFileAsync("file3.pdf")
};
var results = await Task.WhenAll(tasks);
foreach (var r in results)
{
Console.WriteLine($"Extracted {r.Content.Length} characters");
}
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Extraction failed: {ex.Message}");
}
}
}
```
## Plugin System
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
## Embeddings Support
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
## Batch Processing
Process multiple documents efficiently:
```cs
using Kreuzberg;
using System.Collections.Generic;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
UseCache = true,
EnableQualityProcessing = true
};
var filePaths = new[]
{
"document1.pdf",
"document2.pdf",
"document3.pdf"
};
try
{
var batchResults = new List<ExtractionResult>();
foreach (var filePath in filePaths)
{
var result = await KreuzbergLib.ExtractFileAsync(filePath, config);
batchResults.Add(result);
Console.WriteLine($"Processed {filePath}: {result.Content.Length} chars");
}
var tasks = filePaths.Select(path =>
KreuzbergLib.ExtractFileAsync(path, config)
).ToArray();
var results = await Task.WhenAll(tasks);
var totalChars = results.Sum(r => r.Content.Length);
Console.WriteLine($"Total extracted: {totalChars} characters");
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Batch processing error: {ex.Message}");
}
}
}
```
## Configuration
For advanced configuration options including language detection, table extraction, OCR settings, and more:
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
## Documentation
- **[Official Documentation](https://docs.kreuzberg.dev/)**
- **[API Reference](https://docs.kreuzberg.dev/reference/api-csharp/)**
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
## Contributing
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
## Part of Kreuzberg.dev
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
## License
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
## Support
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Selects the hardware acceleration used when running ONNX Runtime models.
///
/// The chosen execution provider (CPU, CoreML, CUDA, TensorRT) applies to
/// inference for layout detection and embedding generation.
/// </summary>
public sealed record AccelerationConfig
{
    /// <summary>
    /// Options used by <see cref="FromJson"/>: enum values are read as snake_case strings.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable
    /// C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Execution provider to use for ONNX inference. Defaults to <c>null</c>.
    /// </summary>
    [JsonPropertyName("provider")]
    public ExecutionProviderType? Provider { get; init; }

    /// <summary>
    /// GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. Defaults to 0.
    /// </summary>
    [JsonPropertyName("device_id")]
    public uint DeviceId { get; init; }

    /// <summary>
    /// Builds an <see cref="AccelerationConfig"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the configuration.</param>
    /// <returns>The deserialised configuration.</returns>
    /// <exception cref="KreuzbergException">
    /// When the deserialiser yields <c>null</c> or fails for any other reason; the
    /// underlying exception, if any, is attached as the inner exception.
    /// </exception>
    public static AccelerationConfig FromJson(string json)
    {
        const string FailurePrefix = "Failed to parse AccelerationConfig from JSON: ";
        try
        {
            var parsed = JsonSerializer.Deserialize<AccelerationConfig>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException(FailurePrefix + "deserializer returned null");
            }

            return parsed;
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Library exceptions pass through untouched; everything else is wrapped.
            throw new KreuzbergException(FailurePrefix + e.Message, e);
        }
    }
}

View File

@@ -0,0 +1,205 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Types of inline text annotations.
/// </summary>
/// <remarks>
/// Every variant is a nested sealed record deriving from this type. JSON conversion is
/// delegated to <see cref="AnnotationKindJsonConverter"/>, which identifies the variant
/// through an <c>annotation_type</c> discriminator field.
/// </remarks>
[JsonConverter(typeof(AnnotationKindJsonConverter))]
public abstract record AnnotationKind
{
/// <summary>Bold text; no payload fields (discriminator <c>bold</c>).</summary>
public sealed record Bold() : AnnotationKind;
/// <summary>Italic text; no payload fields (discriminator <c>italic</c>).</summary>
public sealed record Italic() : AnnotationKind;
/// <summary>Underlined text; no payload fields (discriminator <c>underline</c>).</summary>
public sealed record Underline() : AnnotationKind;
/// <summary>Struck-through text; no payload fields (discriminator <c>strikethrough</c>).</summary>
public sealed record Strikethrough() : AnnotationKind;
/// <summary>Code text; no payload fields (discriminator <c>code</c>).</summary>
public sealed record Code() : AnnotationKind;
/// <summary>Subscript text; no payload fields (discriminator <c>subscript</c>).</summary>
public sealed record Subscript() : AnnotationKind;
/// <summary>Superscript text; no payload fields (discriminator <c>superscript</c>).</summary>
public sealed record Superscript() : AnnotationKind;
/// <summary>
/// Link annotation (discriminator <c>link</c>).
/// </summary>
/// <param name="Url">Link target; JSON field <c>url</c>.</param>
/// <param name="Title">Optional link title; JSON field <c>title</c>, may be <c>null</c>.</param>
public sealed record Link(
[property: JsonPropertyName("url")] string Url,
[property: JsonPropertyName("title")] string? Title
) : AnnotationKind;
/// <summary>
/// Highlighted text (PDF highlights, HTML `&lt;mark&gt;`).
/// </summary>
public sealed record Highlight() : AnnotationKind;
/// <summary>
/// Text color (CSS-compatible value, e.g. "#ff0000", "red").
/// </summary>
/// <param name="Value">The color value; JSON field <c>value</c>.</param>
public sealed record Color(
[property: JsonPropertyName("value")] string Value
) : AnnotationKind;
/// <summary>
/// Font size with units (e.g. "12pt", "1.2em", "16px").
/// </summary>
/// <param name="Value">The size including its unit; JSON field <c>value</c>.</param>
public sealed record FontSize(
[property: JsonPropertyName("value")] string Value
) : AnnotationKind;
/// <summary>
/// Extensible annotation for format-specific styling.
/// </summary>
/// <param name="Name">Annotation name; JSON field <c>name</c>.</param>
/// <param name="Value">Optional annotation value; JSON field <c>value</c>, may be <c>null</c>.</param>
public sealed record Custom(
[property: JsonPropertyName("name")] string Name,
[property: JsonPropertyName("value")] string? Value
) : AnnotationKind;
}
/// <summary>
/// JSON converter for the <see cref="AnnotationKind"/> union.
/// </summary>
/// <remarks>
/// Each variant is represented as a JSON object holding an <c>annotation_type</c>
/// discriminator with the variant's own fields flattened at the same level, e.g.
/// <c>{"annotation_type":"link","url":"...","title":null}</c>. System.Text.Json's
/// [JsonPolymorphic] cannot handle this layout, so the mapping is done by hand.
/// </remarks>
public sealed class AnnotationKindJsonConverter : JsonConverter<AnnotationKind>
{
    /// <summary>Name of the JSON property that selects the variant.</summary>
    private const string DiscriminatorName = "annotation_type";

    /// <summary>
    /// Reads one <see cref="AnnotationKind"/> variant from a JSON object.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the object.</param>
    /// <param name="typeToConvert">Unused; the variant is chosen by the discriminator.</param>
    /// <param name="options">Options forwarded when deserialising variants that carry fields.</param>
    /// <returns>The variant named by the discriminator.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an object, the discriminator is missing, null, not a string,
    /// or unknown, or when the variant payload deserialises to <c>null</c>.
    /// </exception>
    public override AnnotationKind Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(DiscriminatorName, out var tagElement))
        {
            throw new JsonException("Missing discriminator field: annotation_type");
        }

        // GetString() throws InvalidOperationException for non-string, non-null values;
        // report malformed input as a JsonException like every other failure here.
        if (tagElement.ValueKind is not (JsonValueKind.String or JsonValueKind.Null))
        {
            throw new JsonException($"Discriminator field must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString() ?? throw new JsonException("Discriminator field is null");

        // Payload-free variants are constructed directly; variants with fields are
        // deserialised from the object with the discriminator stripped.
        return tagValue switch
        {
            "bold" => new AnnotationKind.Bold(),
            "italic" => new AnnotationKind.Italic(),
            "underline" => new AnnotationKind.Underline(),
            "strikethrough" => new AnnotationKind.Strikethrough(),
            "code" => new AnnotationKind.Code(),
            "subscript" => new AnnotationKind.Subscript(),
            "superscript" => new AnnotationKind.Superscript(),
            "link" => ReadPayload<AnnotationKind.Link>(root, options),
            "highlight" => new AnnotationKind.Highlight(),
            "color" => ReadPayload<AnnotationKind.Color>(root, options),
            "font_size" => ReadPayload<AnnotationKind.FontSize>(root, options),
            "custom" => ReadPayload<AnnotationKind.Custom>(root, options),
            _ => throw new JsonException($"Unknown AnnotationKind discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Writes <paramref name="value"/> as a JSON object: the discriminator first, followed
    /// by the variant's own serialised fields at the same level.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Options forwarded when serialising the variant's fields.</param>
    /// <exception cref="ArgumentNullException">When <paramref name="value"/> is <c>null</c>.</exception>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, AnnotationKind value, JsonSerializerOptions options)
    {
        ArgumentNullException.ThrowIfNull(value);

        // Resolve the tag before emitting anything so an unknown variant leaves the
        // writer untouched.
        var (tag, hasPayload) = value switch
        {
            AnnotationKind.Bold => ("bold", false),
            AnnotationKind.Italic => ("italic", false),
            AnnotationKind.Underline => ("underline", false),
            AnnotationKind.Strikethrough => ("strikethrough", false),
            AnnotationKind.Code => ("code", false),
            AnnotationKind.Subscript => ("subscript", false),
            AnnotationKind.Superscript => ("superscript", false),
            AnnotationKind.Link => ("link", true),
            AnnotationKind.Highlight => ("highlight", false),
            AnnotationKind.Color => ("color", true),
            AnnotationKind.FontSize => ("font_size", true),
            AnnotationKind.Custom => ("custom", true),
            _ => throw new JsonException($"Unknown AnnotationKind variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString(DiscriminatorName, tag);
        if (hasPayload)
        {
            // Serialise under the concrete variant type and copy its properties next to
            // the discriminator, producing the flattened layout instead of a nested object.
            using var payload = JsonSerializer.SerializeToDocument(value, value.GetType(), options);
            if (payload.RootElement.ValueKind == JsonValueKind.Object)
            {
                foreach (var prop in payload.RootElement.EnumerateObject())
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
        }

        writer.WriteEndObject();
    }

    /// <summary>
    /// Deserialises a field-carrying variant from <paramref name="root"/> after removing
    /// the discriminator property.
    /// </summary>
    private static TVariant ReadPayload<TVariant>(JsonElement root, JsonSerializerOptions options)
        where TVariant : AnnotationKind
    {
        using var buffer = new MemoryStream();
        using (var flatWriter = new Utf8JsonWriter(buffer))
        {
            flatWriter.WriteStartObject();
            foreach (var prop in root.EnumerateObject())
            {
                if (prop.Name != DiscriminatorName)
                {
                    flatWriter.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(flatWriter);
                }
            }

            flatWriter.WriteEndObject();
        } // disposing the writer flushes it into the buffer

        return JsonSerializer.Deserialize<TVariant>(buffer.ToArray(), options)
            ?? throw new JsonException("Failed to deserialize variant");
    }
}

View File

@@ -0,0 +1,77 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single file extracted from an archive.
///
/// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
/// enabled, each processable file produces its own full `ExtractionResult`.
/// </summary>
public sealed record ArchiveEntry
{
    /// <summary>
    /// Path of the file relative to the archive root, e.g. "folder/document.pdf".
    /// </summary>
    [JsonPropertyName("path")]
    public required string Path { get; init; }

    /// <summary>
    /// MIME type that was detected for the file.
    /// </summary>
    [JsonPropertyName("mime_type")]
    public required string MimeType { get; init; }

    /// <summary>
    /// The complete extraction result for the file.
    /// </summary>
    [JsonPropertyName("result")]
    public required ExtractionResult Result { get; init; }

    /// <summary>
    /// Deserialises a JSON string into an <see cref="ArchiveEntry"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static ArchiveEntry FromJson(string json)
    {
        ArchiveEntry? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ArchiveEntry>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse ArchiveEntry from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse ArchiveEntry from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,88 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Archive (ZIP/TAR/7Z) metadata.
///
/// Extracted from compressed archive files containing file lists and size information.
/// </summary>
public sealed record ArchiveMetadata
{
    /// <summary>
    /// Name of the archive format, such as "ZIP", "TAR" or "7Z".
    /// </summary>
    [JsonPropertyName("format")]
    public string Format { get; init; } = string.Empty;

    /// <summary>
    /// How many files the archive contains in total.
    /// </summary>
    [JsonPropertyName("file_count")]
    public uint FileCount { get; init; } = 0;

    /// <summary>
    /// Paths of the files contained in the archive.
    /// </summary>
    [JsonPropertyName("file_list")]
    public List<string> FileList { get; init; } = [];

    /// <summary>
    /// Total size of the uncompressed content, in bytes.
    /// </summary>
    [JsonPropertyName("total_size")]
    public ulong TotalSize { get; init; } = 0;

    /// <summary>
    /// Size of the compressed data in bytes, or null when it is not available.
    /// </summary>
    [JsonPropertyName("compressed_size")]
    public ulong? CompressedSize { get; init; }

    /// <summary>
    /// Deserialises a JSON string into an <see cref="ArchiveMetadata"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static ArchiveMetadata FromJson(string json)
    {
        ArchiveMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ArchiveMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse ArchiveMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse ArchiveMetadata from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

68
packages/csharp/src/Kreuzberg/BBox.cs generated Normal file
View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
/// </summary>
public sealed record BBox
{
    /// <summary>X component of the (x1, y1) corner; mapped to JSON field <c>x1</c>.</summary>
    [JsonPropertyName("x1")]
    public float X1 { get; init; } = 0.0f;

    /// <summary>Y component of the (x1, y1) corner; mapped to JSON field <c>y1</c>.</summary>
    [JsonPropertyName("y1")]
    public float Y1 { get; init; } = 0.0f;

    /// <summary>X component of the (x2, y2) corner; mapped to JSON field <c>x2</c>.</summary>
    [JsonPropertyName("x2")]
    public float X2 { get; init; } = 0.0f;

    /// <summary>Y component of the (x2, y2) corner; mapped to JSON field <c>y2</c>.</summary>
    [JsonPropertyName("y2")]
    public float Y2 { get; init; } = 0.0f;

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BBox"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static BBox FromJson(string json)
    {
        BBox? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<BBox>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse BBox from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse BBox from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,78 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Batch item for byte array extraction.
///
/// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
/// to represent a single item in a batch extraction job.
/// </summary>
public sealed record BatchBytesItem
{
    /// <summary>
    /// Raw bytes of the content to extract from. Serialised through
    /// <see cref="ByteArrayToIntArrayConverter"/> instead of the default byte[] handling.
    /// </summary>
    [JsonPropertyName("content")]
    [JsonConverter(typeof(ByteArrayToIntArrayConverter))]
    public byte[] Content { get; init; } = [];

    /// <summary>
    /// MIME type of the content, for example "application/pdf" or "text/html".
    /// </summary>
    [JsonPropertyName("mime_type")]
    public required string MimeType { get; init; }

    /// <summary>
    /// Configuration overrides for this item only; null means the batch-level defaults apply.
    /// </summary>
    [JsonPropertyName("config")]
    public FileExtractionConfig? Config { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BatchBytesItem"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static BatchBytesItem FromJson(string json)
    {
        BatchBytesItem? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<BatchBytesItem>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse BatchBytesItem from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse BatchBytesItem from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Batch item for file extraction.
///
/// Used with `batch_extract_files` and `batch_extract_files_sync`
/// to represent a single file in a batch extraction job.
/// </summary>
public sealed record BatchFileItem
{
    /// <summary>
    /// Path of the file to extract from.
    /// </summary>
    [JsonPropertyName("path")]
    public required string Path { get; init; }

    /// <summary>
    /// Configuration overrides for this file only; null means the batch-level defaults apply.
    /// </summary>
    [JsonPropertyName("config")]
    public FileExtractionConfig? Config { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BatchFileItem"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static BatchFileItem FromJson(string json)
    {
        BatchFileItem? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<BatchFileItem>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse BatchFileItem from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse BatchFileItem from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// BibTeX bibliography metadata.
/// </summary>
public sealed record BibtexMetadata
{
    /// <summary>
    /// How many entries the bibliography contains.
    /// </summary>
    [JsonPropertyName("entry_count")]
    public ulong EntryCount { get; init; } = 0;

    /// <summary>Mapped to JSON field <c>citation_keys</c>; empty by default.</summary>
    [JsonPropertyName("citation_keys")]
    public List<string> CitationKeys { get; init; } = [];

    /// <summary>Mapped to JSON field <c>authors</c>; empty by default.</summary>
    [JsonPropertyName("authors")]
    public List<string> Authors { get; init; } = [];

    /// <summary>Mapped to JSON field <c>year_range</c>; null by default.</summary>
    [JsonPropertyName("year_range")]
    public YearRange? YearRange { get; init; }

    /// <summary>
    /// Mapped to JSON field <c>entry_types</c>; null by default.
    /// NOTE(review): presumably a count per BibTeX entry type — confirm against the Rust definition.
    /// </summary>
    [JsonPropertyName("entry_types")]
    public Dictionary<string, ulong>? EntryTypes { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="BibtexMetadata"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static BibtexMetadata FromJson(string json)
    {
        BibtexMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<BibtexMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse BibtexMetadata from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse BibtexMetadata from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

105
packages/csharp/src/Kreuzberg/BlockType.cs generated Normal file
View File

@@ -0,0 +1,105 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Types of block-level elements in Djot.
/// </summary>
/// <remarks>
/// JSON conversion is delegated to <see cref="BlockTypeJsonConverter"/> via the
/// <see cref="JsonConverterAttribute"/> on this enum. That converter reads and writes
/// the same snake_case strings that appear in each member's
/// <see cref="JsonPropertyNameAttribute"/>.
/// </remarks>
[JsonConverter(typeof(BlockTypeJsonConverter))]
public enum BlockType
{
/// <summary>JSON name <c>"paragraph"</c>.</summary>
[JsonPropertyName("paragraph")]
Paragraph,
/// <summary>JSON name <c>"heading"</c>.</summary>
[JsonPropertyName("heading")]
Heading,
/// <summary>JSON name <c>"blockquote"</c>.</summary>
[JsonPropertyName("blockquote")]
Blockquote,
/// <summary>JSON name <c>"code_block"</c>.</summary>
[JsonPropertyName("code_block")]
CodeBlock,
/// <summary>JSON name <c>"list_item"</c>.</summary>
[JsonPropertyName("list_item")]
ListItem,
/// <summary>JSON name <c>"ordered_list"</c>.</summary>
[JsonPropertyName("ordered_list")]
OrderedList,
/// <summary>JSON name <c>"bullet_list"</c>.</summary>
[JsonPropertyName("bullet_list")]
BulletList,
/// <summary>JSON name <c>"task_list"</c>.</summary>
[JsonPropertyName("task_list")]
TaskList,
/// <summary>JSON name <c>"definition_list"</c>.</summary>
[JsonPropertyName("definition_list")]
DefinitionList,
/// <summary>JSON name <c>"definition_term"</c>.</summary>
[JsonPropertyName("definition_term")]
DefinitionTerm,
/// <summary>JSON name <c>"definition_description"</c>.</summary>
[JsonPropertyName("definition_description")]
DefinitionDescription,
/// <summary>JSON name <c>"div"</c>.</summary>
[JsonPropertyName("div")]
Div,
/// <summary>JSON name <c>"section"</c>.</summary>
[JsonPropertyName("section")]
Section,
/// <summary>JSON name <c>"thematic_break"</c>.</summary>
[JsonPropertyName("thematic_break")]
ThematicBreak,
/// <summary>JSON name <c>"raw_block"</c>.</summary>
[JsonPropertyName("raw_block")]
RawBlock,
/// <summary>JSON name <c>"math_display"</c>.</summary>
[JsonPropertyName("math_display")]
MathDisplay,
}
/// <summary>
/// JSON converter for <see cref="BlockType"/> that maps every member to and from an
/// explicitly listed snake_case string.
/// </summary>
internal sealed class BlockTypeJsonConverter : JsonConverter<BlockType>
{
    /// <summary>
    /// Reads the current token as a string and returns the matching <see cref="BlockType"/>.
    /// </summary>
    /// <exception cref="JsonException">The string is not one of the listed names.</exception>
    public override BlockType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) =>
        reader.GetString() switch
        {
            "paragraph" => BlockType.Paragraph,
            "heading" => BlockType.Heading,
            "blockquote" => BlockType.Blockquote,
            "code_block" => BlockType.CodeBlock,
            "list_item" => BlockType.ListItem,
            "ordered_list" => BlockType.OrderedList,
            "bullet_list" => BlockType.BulletList,
            "task_list" => BlockType.TaskList,
            "definition_list" => BlockType.DefinitionList,
            "definition_term" => BlockType.DefinitionTerm,
            "definition_description" => BlockType.DefinitionDescription,
            "div" => BlockType.Div,
            "section" => BlockType.Section,
            "thematic_break" => BlockType.ThematicBreak,
            "raw_block" => BlockType.RawBlock,
            "math_display" => BlockType.MathDisplay,
            var unrecognised => throw new JsonException($"Unknown BlockType value: {unrecognised}"),
        };

    /// <summary>
    /// Writes <paramref name="value"/> as its snake_case string.
    /// </summary>
    /// <exception cref="JsonException">The value is not a declared <see cref="BlockType"/> member.</exception>
    public override void Write(Utf8JsonWriter writer, BlockType value, JsonSerializerOptions options) =>
        writer.WriteStringValue(value switch
        {
            BlockType.Paragraph => "paragraph",
            BlockType.Heading => "heading",
            BlockType.Blockquote => "blockquote",
            BlockType.CodeBlock => "code_block",
            BlockType.ListItem => "list_item",
            BlockType.OrderedList => "ordered_list",
            BlockType.BulletList => "bullet_list",
            BlockType.TaskList => "task_list",
            BlockType.DefinitionList => "definition_list",
            BlockType.DefinitionTerm => "definition_term",
            BlockType.DefinitionDescription => "definition_description",
            BlockType.Div => "div",
            BlockType.Section => "section",
            BlockType.ThematicBreak => "thematic_break",
            BlockType.RawBlock => "raw_block",
            BlockType.MathDisplay => "math_display",
            _ => throw new JsonException($"Unknown BlockType value: {value}"),
        });
}

View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Bounding box coordinates for element positioning.
/// </summary>
public sealed record BoundingBox
{
/// <summary>
/// Left x-coordinate
/// </summary>
[JsonPropertyName("x0")]
public double X0 { get; init; } = 0.0;
/// <summary>
/// Bottom y-coordinate
/// </summary>
[JsonPropertyName("y0")]
public double Y0 { get; init; } = 0.0;
/// <summary>
/// Right x-coordinate
/// </summary>
[JsonPropertyName("x1")]
public double X1 { get; init; } = 0.0;
/// <summary>
/// Top y-coordinate
/// </summary>
[JsonPropertyName("y1")]
public double Y1 { get; init; } = 0.0;
/// <summary>
/// Parse a <see cref="BoundingBox"/> from a JSON string.
/// </summary>
/// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
public static BoundingBox FromJson(string json)
{
try
{
return JsonSerializer.Deserialize<BoundingBox>(json, JsonOptions)
?? throw new KreuzbergException($"Failed to parse BoundingBox from JSON: deserializer returned null");
}
catch (KreuzbergException)
{
throw;
}
catch (Exception e)
{
throw new KreuzbergException($"Failed to parse BoundingBox from JSON: {e.Message}", e);
}
}
private static readonly JsonSerializerOptions JsonOptions = new()
{
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
/// <summary>Options for serializing config/input objects to FFI. Strips nulls
/// (nullable C# fields default to null and would override required Rust fields with
/// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
private static readonly JsonSerializerOptions JsonSerializationOptions = new()
{
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Converts byte arrays to and from JSON integer arrays.
/// </summary>
/// <remarks>
/// System.Text.Json serializes byte[] as base64 strings by default, but Rust's serde
/// for Vec&lt;u8&gt; expects JSON arrays of integers [72, 101, 108, ...].
/// Apply this converter to byte[] fields that are serialized to FFI with
/// [JsonConverter(typeof(ByteArrayToIntArrayConverter))].
/// </remarks>
public sealed class ByteArrayToIntArrayConverter : JsonConverter<byte[]>
{
    /// <summary>
    /// Reads a JSON array of integers and converts it to a byte array.
    /// </summary>
    /// <param name="reader">Reader positioned on the array's start token.</param>
    /// <param name="typeToConvert">Type being converted; not used.</param>
    /// <param name="options">Serializer options; not used.</param>
    /// <returns>The bytes in array order; an empty array for <c>[]</c>.</returns>
    /// <exception cref="JsonException">
    /// The current token is not an array start, an element is not a number, or an element
    /// is not an integer in the range 0-255.
    /// </exception>
    public override byte[]? Read(
        ref Utf8JsonReader reader,
        Type typeToConvert,
        JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartArray)
        {
            throw new JsonException("Expected JSON array for byte[]");
        }

        var bytes = new List<byte>();
        while (reader.Read())
        {
            if (reader.TokenType == JsonTokenType.EndArray)
            {
                break;
            }

            if (reader.TokenType != JsonTokenType.Number)
            {
                throw new JsonException($"Unexpected token type: {reader.TokenType}");
            }

            // Reject values that do not fit in a byte. A plain (byte) cast would wrap
            // silently (256 -> 0, -1 -> 255) and corrupt the payload without any error.
            if (!reader.TryGetByte(out byte element))
            {
                throw new JsonException("Expected an integer between 0 and 255 for byte[] element");
            }

            bytes.Add(element);
        }

        return bytes.ToArray();
    }

    /// <summary>
    /// Writes a byte array as a JSON array of integers.
    /// </summary>
    /// <param name="writer">Writer that receives the array.</param>
    /// <param name="value">Bytes to write, one JSON number per byte.</param>
    /// <param name="options">Serializer options; not used.</param>
    public override void Write(
        Utf8JsonWriter writer,
        byte[] value,
        JsonSerializerOptions options)
    {
        writer.WriteStartArray();
        foreach (var b in value)
        {
            writer.WriteNumberValue(b);
        }
        writer.WriteEndArray();
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// A <see cref="KreuzbergErrorException"/> subtype that adds no members of its own.
/// </summary>
/// <remarks>
/// NOTE(review): by its name this represents cache-related failures; where it is raised
/// is not visible in this file — confirm against the error mapping.
/// </remarks>
public class CacheException : KreuzbergErrorException
{
    /// <summary>Creates the exception with the given message.</summary>
    /// <param name="message">Message forwarded to the base exception.</param>
    public CacheException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    /// <param name="message">Message forwarded to the base exception.</param>
    /// <param name="innerException">Underlying cause, forwarded to the base exception.</param>
    public CacheException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Cache statistics parsed from JSON.
/// NOTE(review): units are taken from the field names (<c>_mb</c>, <c>_days</c>) — confirm
/// against the Rust definition.
/// </summary>
public sealed record CacheStats
{
    /// <summary>Mapped to JSON field <c>total_files</c>.</summary>
    [JsonPropertyName("total_files")]
    public ulong TotalFiles { get; init; } = 0;

    /// <summary>Mapped to JSON field <c>total_size_mb</c>.</summary>
    [JsonPropertyName("total_size_mb")]
    public double TotalSizeMb { get; init; } = 0.0;

    /// <summary>Mapped to JSON field <c>available_space_mb</c>.</summary>
    [JsonPropertyName("available_space_mb")]
    public double AvailableSpaceMb { get; init; } = 0.0;

    /// <summary>Mapped to JSON field <c>oldest_file_age_days</c>.</summary>
    [JsonPropertyName("oldest_file_age_days")]
    public double OldestFileAgeDays { get; init; } = 0.0;

    /// <summary>Mapped to JSON field <c>newest_file_age_days</c>.</summary>
    [JsonPropertyName("newest_file_age_days")]
    public double NewestFileAgeDays { get; init; } = 0.0;

    /// <summary>
    /// Deserialises a JSON string into a <see cref="CacheStats"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static CacheStats FromJson(string json)
    {
        CacheStats? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CacheStats>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse CacheStats from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse CacheStats from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// A <see cref="KreuzbergErrorException"/> subtype that adds no members of its own.
/// </summary>
/// <remarks>
/// NOTE(review): by its name this represents a cancelled operation; where it is raised
/// is not visible in this file — confirm against the error mapping.
/// </remarks>
public class CancelledException : KreuzbergErrorException
{
    /// <summary>Creates the exception with the given message.</summary>
    /// <param name="message">Message forwarded to the base exception.</param>
    public CancelledException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    /// <param name="message">Message forwarded to the base exception.</param>
    /// <param name="innerException">Underlying cause, forwarded to the base exception.</param>
    public CancelledException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single changed cell within a table.
///
/// Defined here (rather than only in `crate.diff`) so `RevisionDelta` can
/// reference it unconditionally, without requiring the `diff` Cargo feature.
/// `crate.diff` re-exports this type verbatim.
/// </summary>
public sealed record CellChange
{
    /// <summary>
    /// Row index of the cell, counted from zero.
    /// </summary>
    [JsonPropertyName("row")]
    public ulong Row { get; init; } = 0;

    /// <summary>
    /// Column index of the cell, counted from zero.
    /// </summary>
    [JsonPropertyName("col")]
    public ulong Col { get; init; } = 0;

    /// <summary>
    /// The cell's value before the change.
    /// </summary>
    [JsonPropertyName("from")]
    public required string From { get; init; }

    /// <summary>
    /// The cell's value after the change.
    /// </summary>
    [JsonPropertyName("to")]
    public required string To { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="CellChange"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static CellChange FromJson(string json)
    {
        CellChange? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CellChange>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse CellChange from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse CellChange from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

90
packages/csharp/src/Kreuzberg/Chunk.cs generated Normal file
View File

@@ -0,0 +1,90 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A text chunk with optional embedding and metadata.
///
/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
/// contains the text content, optional embedding vector (if embedding generation
/// is configured), and metadata about its position in the document.
/// </summary>
public sealed record Chunk
{
    /// <summary>
    /// Text contained in this chunk.
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// Semantic structural classification of the chunk.
    ///
    /// Set by the heuristic classifier from content patterns and heading context;
    /// the classifier falls back to `ChunkType.Unknown` when no rule matches.
    /// On this record the property itself defaults to null.
    /// </summary>
    [JsonPropertyName("chunk_type")]
    public ChunkType? ChunkType { get; init; }

    /// <summary>
    /// Embedding vector for the chunk, if one was generated.
    ///
    /// Populated only when an `EmbeddingConfig` is supplied in the chunking configuration;
    /// its length depends on the embedding model that was chosen.
    /// </summary>
    [JsonPropertyName("embedding")]
    public List<float>? Embedding { get; init; }

    /// <summary>
    /// Position and other properties of the chunk.
    /// </summary>
    [JsonPropertyName("metadata")]
    public required ChunkMetadata Metadata { get; init; }

    /// <summary>
    /// Deserialises a JSON string into a <see cref="Chunk"/>.
    /// </summary>
    /// <param name="json">JSON text to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// The deserializer returned null or threw. Exceptions that are not already a
    /// <see cref="KreuzbergException"/> are wrapped and kept as the inner exception.
    /// </exception>
    public static Chunk FromJson(string json)
    {
        Chunk? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<Chunk>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse Chunk from JSON: {e.Message}", e);
        }

        return parsed
            ?? throw new KreuzbergException("Failed to parse Chunk from JSON: deserializer returned null");
    }

    // Options passed to the deserializer in FromJson.
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Options for serialising config/input objects to FFI: null values are omitted
    /// (a null would otherwise be sent for fields Rust cannot deserialise from null),
    /// while explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,123 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Describes where a chunk sits inside the document it was cut from.
/// </summary>
public sealed record ChunkMetadata
{
    /// <summary>
    /// Options used by <see cref="FromJson"/>. Registers a string-enum converter
    /// with the snake_case-lower naming policy.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serializing config/input objects to FFI: null values are
    /// omitted (a nullable C# member left at null would otherwise override a required
    /// Rust field with a null it cannot deserialise), while explicit false/0 values are
    /// still written. Not referenced by any other member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Byte offset at which the chunk begins in the original text (UTF-8 valid boundary).
    /// </summary>
    [JsonPropertyName("byte_start")]
    public ulong ByteStart { get; init; }

    /// <summary>
    /// Byte offset at which the chunk ends in the original text (UTF-8 valid boundary).
    /// </summary>
    [JsonPropertyName("byte_end")]
    public ulong ByteEnd { get; init; }

    /// <summary>
    /// Token count of the chunk, when available.
    ///
    /// Calculated by the embedding model's tokenizer if embeddings are enabled.
    /// </summary>
    [JsonPropertyName("token_count")]
    public ulong? TokenCount { get; init; }

    /// <summary>
    /// Zero-based position of the chunk within the document.
    /// </summary>
    [JsonPropertyName("chunk_index")]
    public ulong ChunkIndex { get; init; }

    /// <summary>
    /// Total number of chunks produced for the document.
    /// </summary>
    [JsonPropertyName("total_chunks")]
    public ulong TotalChunks { get; init; }

    /// <summary>
    /// First page the chunk spans (1-indexed).
    ///
    /// Only populated when page tracking is enabled in the extraction configuration.
    /// </summary>
    [JsonPropertyName("first_page")]
    public uint? FirstPage { get; init; }

    /// <summary>
    /// Last page the chunk spans (1-indexed; equal to <c>first_page</c> for single-page chunks).
    ///
    /// Only populated when page tracking is enabled in the extraction configuration.
    /// </summary>
    [JsonPropertyName("last_page")]
    public uint? LastPage { get; init; }

    /// <summary>
    /// Heading hierarchy the chunk falls under.
    ///
    /// Only populated when `ChunkerType.Markdown` is used.
    /// </summary>
    [JsonPropertyName("heading_context")]
    public HeadingContext? HeadingContext { get; init; }

    /// <summary>
    /// Zero-based indices into `ExtractionResult.images` for every image whose
    /// `page_number` falls within `[first_page, last_page]`.
    ///
    /// Empty when image extraction is disabled or the chunk spans no pages with images.
    /// </summary>
    [JsonPropertyName("image_indices")]
    public List<uint> ImageIndices { get; init; } = [];

    /// <summary>
    /// Parse a <see cref="ChunkMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing a single chunk-metadata object.</param>
    /// <returns>The deserialised instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ChunkMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ChunkMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ChunkMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Anything that is not already our own exception type gets wrapped,
            // keeping the original failure as the inner exception.
            throw new KreuzbergException($"Failed to parse ChunkMetadata from JSON: {ex.Message}", ex);
        }
    }
}

View File

@@ -0,0 +1,155 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// How chunk size is measured.
///
/// Defaults to `Characters` (Unicode character count). When using token-based sizing,
/// chunks are sized by token count according to the specified tokenizer.
///
/// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
/// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
/// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
/// </summary>
/// <remarks>
/// Union with two variants, <see cref="ChunkSizing.Characters"/> and
/// <see cref="ChunkSizing.Tokenizer"/>. JSON conversion is delegated to
/// <see cref="ChunkSizingJsonConverter"/>, which uses a <c>type</c> discriminator
/// (<c>"characters"</c> or <c>"tokenizer"</c>) next to the variant's own fields.
/// </remarks>
[JsonConverter(typeof(ChunkSizingJsonConverter))]
public abstract record ChunkSizing
{
/// <summary>
/// Size measured in Unicode characters (default).
/// </summary>
public sealed record Characters() : ChunkSizing;
/// <summary>
/// Size measured in tokens from a HuggingFace tokenizer.
/// </summary>
/// <param name="Model">
/// Tokenizer model identifier, serialised as <c>model</c>
/// (the type-level docs give `Xenova/gpt-4o` as an example).
/// </param>
/// <param name="CacheDir">
/// Optional value serialised as <c>cache_dir</c>. Presumably the directory used to
/// cache downloaded tokenizer files — TODO confirm against the native library.
/// </param>
public sealed record Tokenizer(
[property: JsonPropertyName("model")] string Model,
[property: JsonPropertyName("cache_dir")] string? CacheDir
) : ChunkSizing;
}
/// <summary>
/// Custom converter for the <see cref="ChunkSizing"/> union with flattened variant fields.
/// </summary>
/// <remarks>
/// The JSON shape is a single object holding a <c>type</c> discriminator plus the
/// variant's own fields at the same level, e.g.
/// <c>{"type":"tokenizer","model":"...","cache_dir":"..."}</c> or
/// <c>{"type":"characters"}</c>. System.Text.Json's [JsonPolymorphic] cannot handle
/// this layout, so reading and writing are done by hand here.
/// </remarks>
public sealed class ChunkSizingJsonConverter : JsonConverter<ChunkSizing>
{
    /// <summary>
    /// Reads a <see cref="ChunkSizing"/> variant from a discriminated JSON object.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the JSON object.</param>
    /// <param name="typeToConvert">Requested target type (not inspected).</param>
    /// <param name="options">Options forwarded when deserialising the variant payload.</param>
    /// <returns>The variant selected by the <c>type</c> field.</returns>
    /// <exception cref="JsonException">
    /// When the value is not an object, the discriminator is missing, null, not a
    /// string or unknown, or the variant payload deserialises to null.
    /// </exception>
    public override ChunkSizing Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty("type", out var tagElement))
        {
            throw new JsonException($"Missing discriminator field: type");
        }

        if (tagElement.ValueKind == JsonValueKind.Null)
        {
            throw new JsonException("Discriminator field is null");
        }

        // GetString() is only valid on string/null values; reject anything else with a
        // descriptive JsonException instead of letting GetString() fail.
        if (tagElement.ValueKind != JsonValueKind.String)
        {
            throw new JsonException($"Discriminator field must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString();
        switch (tagValue)
        {
            case "characters":
                // No fields to read for this variant.
                return new ChunkSizing.Characters();
            case "tokenizer":
                // The variant is a positional record whose components carry
                // [JsonPropertyName], so the remaining fields are passed through flat.
                return JsonSerializer.Deserialize<ChunkSizing.Tokenizer>(WithoutDiscriminator(root), options)
                    ?? throw new JsonException("Failed to deserialize variant");
            default:
                throw new JsonException($"Unknown ChunkSizing discriminator: {tagValue}");
        }
    }

    /// <summary>
    /// Writes <paramref name="value"/> as a JSON object containing the <c>type</c>
    /// discriminator followed by the variant's own fields at the same level.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Options forwarded when serialising the variant payload.</param>
    /// <exception cref="JsonException">When the runtime type is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, ChunkSizing value, JsonSerializerOptions options)
    {
        var tag = value switch
        {
            ChunkSizing.Characters => "characters",
            ChunkSizing.Tokenizer => "tokenizer",
            _ => throw new JsonException($"Unknown ChunkSizing variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString("type", tag);

        // Only the tokenizer variant carries fields; serialise it on its own and copy
        // its properties next to the discriminator rather than nesting them.
        if (value is ChunkSizing.Tokenizer tokenizer)
        {
            using var payload = JsonSerializer.SerializeToDocument(tokenizer, typeof(ChunkSizing.Tokenizer), options);
            if (payload.RootElement.ValueKind == JsonValueKind.Object)
            {
                foreach (var prop in payload.RootElement.EnumerateObject())
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
        }

        writer.WriteEndObject();
    }

    /// <summary>
    /// Returns the UTF-8 JSON of <paramref name="root"/> with its <c>type</c> property removed.
    /// </summary>
    private static byte[] WithoutDiscriminator(JsonElement root)
    {
        using var buffer = new MemoryStream();
        using (var writer = new Utf8JsonWriter(buffer))
        {
            writer.WriteStartObject();
            foreach (var prop in root.EnumerateObject())
            {
                if (prop.Name != "type")
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }

            writer.WriteEndObject();
        }

        // Disposing the writer above flushed everything into the buffer.
        return buffer.ToArray();
    }
}

136
packages/csharp/src/Kreuzberg/ChunkType.cs generated Normal file
View File

@@ -0,0 +1,136 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Semantic structural classification of a text chunk.
///
/// Assigned by the heuristic classifier in `chunking.classifier`.
/// Defaults to `Unknown` when no rule matches.
/// Designed to be extended in future versions without breaking changes.
/// </summary>
/// <remarks>
/// Converted to and from JSON by <see cref="ChunkTypeJsonConverter"/>, which hard-codes
/// one snake_case string per member. The <c>[JsonPropertyName]</c> attributes on the
/// members repeat those same strings.
/// NOTE(review): when adding a member, update the converter's read and write mappings
/// too — an unmapped value makes the converter throw a <c>JsonException</c>.
/// </remarks>
[JsonConverter(typeof(ChunkTypeJsonConverter))]
public enum ChunkType
{
/// <summary>
/// Section heading or document title.
/// </summary>
[JsonPropertyName("heading")]
Heading,
/// <summary>
/// Party list: names, addresses, and signatories.
/// </summary>
[JsonPropertyName("party_list")]
PartyList,
/// <summary>
/// Definition clause ("X means…", "X shall mean…").
/// </summary>
[JsonPropertyName("definitions")]
Definitions,
/// <summary>
/// Operative clause containing legal/contractual action verbs.
/// </summary>
[JsonPropertyName("operative_clause")]
OperativeClause,
/// <summary>
/// Signature block with signatures, names, and dates.
/// </summary>
[JsonPropertyName("signature_block")]
SignatureBlock,
/// <summary>
/// Schedule, annex, appendix, or exhibit section.
/// </summary>
[JsonPropertyName("schedule")]
Schedule,
/// <summary>
/// Table-like content with aligned columns or repeated patterns.
/// </summary>
[JsonPropertyName("table_like")]
TableLike,
/// <summary>
/// Mathematical formula or equation.
/// </summary>
[JsonPropertyName("formula")]
Formula,
/// <summary>
/// Code block or preformatted content.
/// </summary>
[JsonPropertyName("code_block")]
CodeBlock,
/// <summary>
/// Embedded or referenced image content.
/// </summary>
[JsonPropertyName("image")]
Image,
/// <summary>
/// Organizational chart or hierarchy diagram.
/// </summary>
[JsonPropertyName("org_chart")]
OrgChart,
/// <summary>
/// Diagram, figure, or visual illustration.
/// </summary>
[JsonPropertyName("diagram")]
Diagram,
/// <summary>
/// Unclassified or mixed content.
/// </summary>
[JsonPropertyName("unknown")]
Unknown,
}
/// <summary>
/// JSON converter for <see cref="ChunkType"/> that maps each member to a fixed
/// snake_case string, independent of any naming policy on the serializer options.
/// </summary>
internal sealed class ChunkTypeJsonConverter : JsonConverter<ChunkType>
{
    /// <summary>Single source of truth pairing each member with its wire name.</summary>
    private static readonly (ChunkType Variant, string Name)[] WireNames =
    [
        (ChunkType.Heading, "heading"),
        (ChunkType.PartyList, "party_list"),
        (ChunkType.Definitions, "definitions"),
        (ChunkType.OperativeClause, "operative_clause"),
        (ChunkType.SignatureBlock, "signature_block"),
        (ChunkType.Schedule, "schedule"),
        (ChunkType.TableLike, "table_like"),
        (ChunkType.Formula, "formula"),
        (ChunkType.CodeBlock, "code_block"),
        (ChunkType.Image, "image"),
        (ChunkType.OrgChart, "org_chart"),
        (ChunkType.Diagram, "diagram"),
        (ChunkType.Unknown, "unknown"),
    ];

    /// <summary>Reads a wire name and returns the matching member.</summary>
    /// <exception cref="JsonException">When the string matches no member.</exception>
    public override ChunkType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var text = reader.GetString();
        foreach (var (variant, name) in WireNames)
        {
            if (name == text)
            {
                return variant;
            }
        }

        throw new JsonException($"Unknown ChunkType value: {text}");
    }

    /// <summary>Writes the wire name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ChunkType value, JsonSerializerOptions options)
    {
        foreach (var (variant, name) in WireNames)
        {
            if (variant == value)
            {
                writer.WriteStringValue(name);
                return;
            }
        }

        throw new JsonException($"Unknown ChunkType value: {value}");
    }
}

View File

@@ -0,0 +1,70 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Type of text chunker to use.
/// </summary>
/// <remarks>
/// Converted to and from JSON by <see cref="ChunkerTypeJsonConverter"/>, which maps the
/// members to the strings <c>text</c>, <c>markdown</c>, <c>yaml</c> and <c>semantic</c>.
/// </remarks>
[JsonConverter(typeof(ChunkerTypeJsonConverter))]
public enum ChunkerType
{
/// <summary>
/// Generic text splitter; splits on whitespace and punctuation.
/// </summary>
[JsonPropertyName("text")]
Text,
/// <summary>
/// Markdown-aware splitter; preserves formatting and structure.
/// </summary>
[JsonPropertyName("markdown")]
Markdown,
/// <summary>
/// YAML-aware splitter; creates one chunk per top-level key.
/// </summary>
[JsonPropertyName("yaml")]
Yaml,
/// <summary>
/// Topic-aware chunker. With an `EmbeddingConfig`, splits at embedding-based topic
/// shifts tuned by `topic_threshold` (default 0.75, lower = more splits). Without an
/// embedding, falls back to a structural-boundary heuristic (ALL-CAPS headers,
/// numbered sections, blank-line paragraphs) and merges groups into chunks capped at
/// `max_characters` (default 1000). `topic_threshold` has no effect in the fallback
/// path. For best results, pair with an embedding model.
/// </summary>
[JsonPropertyName("semantic")]
Semantic,
}
/// <summary>
/// JSON converter for <see cref="ChunkerType"/> that maps each member to a fixed
/// lowercase string.
/// </summary>
internal sealed class ChunkerTypeJsonConverter : JsonConverter<ChunkerType>
{
    /// <summary>Reads a wire name and returns the matching member.</summary>
    /// <exception cref="JsonException">When the string matches no member.</exception>
    public override ChunkerType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var wireName = reader.GetString();
        switch (wireName)
        {
            case "text":
                return ChunkerType.Text;
            case "markdown":
                return ChunkerType.Markdown;
            case "yaml":
                return ChunkerType.Yaml;
            case "semantic":
                return ChunkerType.Semantic;
            default:
                throw new JsonException($"Unknown ChunkerType value: {wireName}");
        }
    }

    /// <summary>Writes the wire name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ChunkerType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ChunkerType.Text:
                wireName = "text";
                break;
            case ChunkerType.Markdown:
                wireName = "markdown";
                break;
            case ChunkerType.Yaml:
                wireName = "yaml";
                break;
            case ChunkerType.Semantic:
                wireName = "semantic";
                break;
            default:
                throw new JsonException($"Unknown ChunkerType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,151 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Chunking configuration.
///
/// Configures text chunking for document content, including chunk size,
/// overlap, trimming behavior, and optional embeddings.
/// </summary>
/// <remarks>
/// Every property has a default, so set only the ones you need through an object
/// initializer or a <c>with</c> expression. <see cref="Default"/> returns the defaults
/// as reported by the native library.
/// </remarks>
public sealed record ChunkingConfig
{
    /// <summary>
    /// Maximum size per chunk (in units determined by `sizing`).
    ///
    /// When `sizing` is `Characters` (default), this is the max character count.
    /// When using token-based sizing, this is the max token count.
    /// Serialised as <c>max_chars</c>.
    ///
    /// Default: 1000
    /// </summary>
    [JsonPropertyName("max_chars")]
    public ulong MaxCharacters { get; init; } = 1000;

    /// <summary>
    /// Overlap between chunks (in units determined by `sizing`).
    /// Serialised as <c>max_overlap</c>.
    ///
    /// Default: 200
    /// </summary>
    [JsonPropertyName("max_overlap")]
    public ulong Overlap { get; init; } = 200;

    /// <summary>
    /// Whether to trim whitespace from chunk boundaries.
    ///
    /// Default: true
    /// </summary>
    [JsonPropertyName("trim")]
    public bool Trim { get; init; } = true;

    /// <summary>
    /// Type of chunker to use (<c>Text</c>, <c>Markdown</c>, <c>Yaml</c> or <c>Semantic</c>).
    ///
    /// Default: Text
    /// </summary>
    [JsonPropertyName("chunker_type")]
    public ChunkerType ChunkerType { get; init; } = ChunkerType.Text;

    /// <summary>
    /// Optional embedding configuration for chunk embeddings.
    /// </summary>
    [JsonPropertyName("embedding")]
    public EmbeddingConfig? Embedding { get; init; } = null;

    /// <summary>
    /// Use a preset configuration (overrides individual settings if provided).
    /// </summary>
    [JsonPropertyName("preset")]
    public string? Preset { get; init; } = null;

    /// <summary>
    /// How to measure chunk size.
    ///
    /// Default: `Characters` (Unicode character count).
    /// Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
    /// </summary>
    [JsonPropertyName("sizing")]
    public ChunkSizing? Sizing { get; init; } = null;

    /// <summary>
    /// When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
    /// path (e.g. `"# Title &gt; ## Section\n\n"`) to each chunk's content string.
    ///
    /// This is useful for RAG pipelines where each chunk needs self-contained
    /// context about its position in the document structure.
    ///
    /// Default: `false`
    /// </summary>
    [JsonPropertyName("prepend_heading_context")]
    public bool PrependHeadingContext { get; init; } = false;

    /// <summary>
    /// Optional cosine similarity threshold for semantic topic boundary detection.
    ///
    /// Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
    /// provided. You almost never need to set this. When omitted, defaults to
    /// `0.75` which works well for most documents. Lower values detect more
    /// topic boundaries (more, smaller chunks); higher values detect fewer.
    /// Range: `0.0..=1.0`.
    /// </summary>
    [JsonPropertyName("topic_threshold")]
    public float? TopicThreshold { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="ChunkingConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing a single chunking configuration object.</param>
    /// <returns>The deserialised configuration; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ChunkingConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ChunkingConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ChunkingConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ChunkingConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>
    /// Options used when deserialising in <see cref="FromJson"/> and <see cref="Default"/>.
    /// Registers a string-enum converter with the snake_case-lower naming policy.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Returns the default configuration as reported by the native library.
    /// </summary>
    /// <remarks>
    /// Asks the native side for a default config object, has it rendered to JSON and
    /// deserialises that JSON. The native string and the native config object are
    /// released even when an intermediate step throws.
    /// </remarks>
    /// <returns>The default configuration; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON, or the JSON deserialises to null.
    /// </exception>
    public static ChunkingConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.ChunkingConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ChunkingConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            NativeMethods.ChunkingConfigFree(nativeResult);
        }

        // Fail loudly instead of handing a null back through a non-nullable return type.
        if (json is null)
        {
            throw new KreuzbergException("Failed to load default ChunkingConfig: native library returned no JSON");
        }

        return JsonSerializer.Deserialize<ChunkingConfig>(json, JsonOptions)
            ?? throw new KreuzbergException("Failed to load default ChunkingConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata extracted from a citation file (RIS, PubMed, EndNote).
/// </summary>
public sealed record CitationMetadata
{
    /// <summary>
    /// Options used by <see cref="FromJson"/>. Registers a string-enum converter
    /// with the snake_case-lower naming policy.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serializing config/input objects to FFI: null values are
    /// omitted (a nullable C# member left at null would otherwise override a required
    /// Rust field with a null it cannot deserialise), while explicit false/0 values are
    /// still written. Not referenced by any other member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Serialised as <c>citation_count</c>. Presumably the number of citation entries
    /// in the file — TODO confirm.
    /// </summary>
    [JsonPropertyName("citation_count")]
    public ulong CitationCount { get; init; }

    /// <summary>
    /// Serialised as <c>format</c>. Presumably names the citation format
    /// (the type covers RIS, PubMed and EndNote) — TODO confirm.
    /// </summary>
    [JsonPropertyName("format")]
    public string? Format { get; init; }

    /// <summary>Author names; serialised as <c>authors</c>. Empty by default.</summary>
    [JsonPropertyName("authors")]
    public List<string> Authors { get; init; } = [];

    /// <summary>Optional year range; serialised as <c>year_range</c>.</summary>
    [JsonPropertyName("year_range")]
    public YearRange? YearRange { get; init; }

    /// <summary>DOI strings; serialised as <c>dois</c>. Empty by default.</summary>
    [JsonPropertyName("dois")]
    public List<string> Dois { get; init; } = [];

    /// <summary>Keyword strings; serialised as <c>keywords</c>. Empty by default.</summary>
    [JsonPropertyName("keywords")]
    public List<string> Keywords { get; init; } = [];

    /// <summary>
    /// Parse a <see cref="CitationMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing a single citation-metadata object.</param>
    /// <returns>The deserialised instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static CitationMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<CitationMetadata>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse CitationMetadata from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap foreign failures, keeping the original as the inner exception.
            throw new KreuzbergException($"Failed to parse CitationMetadata from JSON: {ex.Message}", ex);
        }
    }
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Content rendering mode for code extraction.
///
/// Controls how extracted code content is represented in the `content` field
/// of `ExtractionResult`.
/// </summary>
/// <remarks>
/// Converted to and from JSON by <see cref="CodeContentModeJsonConverter"/>, which maps
/// the members to the strings <c>chunks</c>, <c>raw</c> and <c>structure</c>.
/// </remarks>
[JsonConverter(typeof(CodeContentModeJsonConverter))]
public enum CodeContentMode
{
/// <summary>
/// Use TSLP semantic chunks as content (default).
/// </summary>
[JsonPropertyName("chunks")]
Chunks,
/// <summary>
/// Use raw source code as content.
/// </summary>
[JsonPropertyName("raw")]
Raw,
/// <summary>
/// Emit function/class headings + docstrings (no code bodies).
/// </summary>
[JsonPropertyName("structure")]
Structure,
}
/// <summary>
/// JSON converter for <see cref="CodeContentMode"/> that maps each member to a fixed
/// lowercase string.
/// </summary>
internal sealed class CodeContentModeJsonConverter : JsonConverter<CodeContentMode>
{
    /// <summary>Single source of truth pairing each member with its wire name.</summary>
    private static readonly (CodeContentMode Variant, string Name)[] WireNames =
    [
        (CodeContentMode.Chunks, "chunks"),
        (CodeContentMode.Raw, "raw"),
        (CodeContentMode.Structure, "structure"),
    ];

    /// <summary>Reads a wire name and returns the matching member.</summary>
    /// <exception cref="JsonException">When the string matches no member.</exception>
    public override CodeContentMode Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var text = reader.GetString();
        foreach (var (variant, name) in WireNames)
        {
            if (name == text)
            {
                return variant;
            }
        }

        throw new JsonException($"Unknown CodeContentMode value: {text}");
    }

    /// <summary>Writes the wire name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, CodeContentMode value, JsonSerializerOptions options)
    {
        foreach (var (variant, name) in WireNames)
        {
            if (variant == value)
            {
                writer.WriteStringValue(name);
                return;
            }
        }

        throw new JsonException($"Unknown CodeContentMode value: {value}");
    }
}

View File

@@ -0,0 +1,132 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Cross-extractor content filtering configuration.
///
/// Controls whether "furniture" content (headers, footers, page numbers,
/// watermarks, repeating text) is included in or stripped from extraction
/// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
/// with format-specific implementation.
///
/// When `None` on `ExtractionConfig`, each extractor uses its current
/// default behavior unchanged.
/// </summary>
public sealed record ContentFilterConfig
{
    /// <summary>
    /// Include running headers in extraction output.
    ///
    /// - PDF: Disables top-margin furniture stripping and prevents the layout
    /// model from treating `PageHeader`-classified regions as furniture.
    /// - DOCX: Includes document headers in text output.
    /// - RTF/ODT: Headers already included; this is a no-op when true.
    /// - HTML/EPUB: Keeps `&lt;header&gt;` element content.
    ///
    /// Default: `false` (headers are stripped or excluded).
    /// </summary>
    [JsonPropertyName("include_headers")]
    public bool IncludeHeaders { get; init; } = false;

    /// <summary>
    /// Include running footers in extraction output.
    ///
    /// - PDF: Disables bottom-margin furniture stripping and prevents the layout
    /// model from treating `PageFooter`-classified regions as furniture.
    /// - DOCX: Includes document footers in text output.
    /// - RTF/ODT: Footers already included; this is a no-op when true.
    /// - HTML/EPUB: Keeps `&lt;footer&gt;` element content.
    ///
    /// Default: `false` (footers are stripped or excluded).
    /// </summary>
    [JsonPropertyName("include_footers")]
    public bool IncludeFooters { get; init; } = false;

    /// <summary>
    /// Enable the heuristic cross-page repeating text detector.
    ///
    /// When `true` (default), text that repeats verbatim across a supermajority
    /// of pages is classified as furniture and stripped. Disable this if brand
    /// names or repeated headings are being incorrectly removed by the heuristic.
    ///
    /// Note: when a layout-detection model is active, the model may independently
    /// classify page-header / page-footer regions as furniture on a per-page basis.
    /// To preserve those regions, set `include_headers = true`, `include_footers = true`,
    /// or both, in addition to disabling this flag.
    ///
    /// Primarily affects PDF extraction.
    ///
    /// Default: `true`.
    /// </summary>
    [JsonPropertyName("strip_repeating_text")]
    public bool StripRepeatingText { get; init; } = true;

    /// <summary>
    /// Include watermark text in extraction output.
    ///
    /// - PDF: Keeps watermark artifacts and arXiv identifiers.
    /// - Other formats: No effect currently.
    ///
    /// Default: `false` (watermarks are stripped).
    /// </summary>
    [JsonPropertyName("include_watermarks")]
    public bool IncludeWatermarks { get; init; } = false;

    /// <summary>
    /// Parse a <see cref="ContentFilterConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing a single content-filter configuration object.</param>
    /// <returns>The deserialised configuration; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ContentFilterConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ContentFilterConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ContentFilterConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ContentFilterConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>
    /// Options used when deserialising in <see cref="FromJson"/> and <see cref="Default"/>.
    /// Registers a string-enum converter with the snake_case-lower naming policy.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Returns the default configuration as reported by the native library.
    /// </summary>
    /// <remarks>
    /// Asks the native side for a default config object, has it rendered to JSON and
    /// deserialises that JSON. The native string and the native config object are
    /// released even when an intermediate step throws.
    /// </remarks>
    /// <returns>The default configuration; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON, or the JSON deserialises to null.
    /// </exception>
    public static ContentFilterConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.ContentFilterConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ContentFilterConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            NativeMethods.ContentFilterConfigFree(nativeResult);
        }

        // Fail loudly instead of handing a null back through a non-nullable return type.
        if (json is null)
        {
            throw new KreuzbergException("Failed to load default ContentFilterConfig: native library returned no JSON");
        }

        return JsonSerializer.Deserialize<ContentFilterConfig>(json, JsonOptions)
            ?? throw new KreuzbergException("Failed to load default ContentFilterConfig: deserializer returned null");
    }
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Content layer classification for document nodes.
///
/// Replaces separate body/furniture arrays with per-node granularity.
/// </summary>
/// <remarks>
/// Converted to and from JSON by <see cref="ContentLayerJsonConverter"/>, which maps the
/// members to the strings <c>body</c>, <c>header</c>, <c>footer</c> and <c>footnote</c>.
/// </remarks>
[JsonConverter(typeof(ContentLayerJsonConverter))]
public enum ContentLayer
{
/// <summary>
/// Main document body content.
/// </summary>
[JsonPropertyName("body")]
Body,
/// <summary>
/// Page/section header (running header).
/// </summary>
[JsonPropertyName("header")]
Header,
/// <summary>
/// Page/section footer (running footer).
/// </summary>
[JsonPropertyName("footer")]
Footer,
/// <summary>
/// Footnote content.
/// </summary>
[JsonPropertyName("footnote")]
Footnote,
}
/// <summary>
/// JSON converter for <see cref="ContentLayer"/> that maps each member to a fixed
/// lowercase string.
/// </summary>
internal sealed class ContentLayerJsonConverter : JsonConverter<ContentLayer>
{
    /// <summary>Reads a wire name and returns the matching member.</summary>
    /// <exception cref="JsonException">When the string matches no member.</exception>
    public override ContentLayer Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var text = reader.GetString();
        if (text == "body")
        {
            return ContentLayer.Body;
        }

        if (text == "header")
        {
            return ContentLayer.Header;
        }

        if (text == "footer")
        {
            return ContentLayer.Footer;
        }

        if (text == "footnote")
        {
            return ContentLayer.Footnote;
        }

        throw new JsonException($"Unknown ContentLayer value: {text}");
    }

    /// <summary>Writes the wire name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ContentLayer value, JsonSerializerOptions options)
    {
        string text;
        switch (value)
        {
            case ContentLayer.Body:
                text = "body";
                break;
            case ContentLayer.Header:
                text = "header";
                break;
            case ContentLayer.Footer:
                text = "footer";
                break;
            case ContentLayer.Footnote:
                text = "footnote";
                break;
            default:
                throw new JsonException($"Unknown ContentLayer value: {value}");
        }

        writer.WriteStringValue(text);
    }
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A JATS contributor together with an optional role.
/// </summary>
public sealed record ContributorRole
{
    /// <summary>
    /// Options used by <see cref="FromJson"/>. Registers a string-enum converter
    /// with the snake_case-lower naming policy.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serializing config/input objects to FFI: null values are
    /// omitted (a nullable C# member left at null would otherwise override a required
    /// Rust field with a null it cannot deserialise), while explicit false/0 values are
    /// still written. Not referenced by any other member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Contributor name; required. Serialised as <c>name</c>.</summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>Optional role of the contributor; serialised as <c>role</c>.</summary>
    [JsonPropertyName("role")]
    public string? Role { get; init; }

    /// <summary>
    /// Parse a <see cref="ContributorRole"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing a single contributor object.</param>
    /// <returns>The deserialised instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ContributorRole FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<ContributorRole>(json, JsonOptions);
            if (parsed is null)
            {
                throw new KreuzbergException("Failed to parse ContributorRole from JSON: deserializer returned null");
            }

            return parsed;
        }
        catch (Exception ex) when (ex is not KreuzbergException)
        {
            // Wrap foreign failures, keeping the original as the inner exception.
            throw new KreuzbergException($"Failed to parse ContributorRole from JSON: {ex.Message}", ex);
        }
    }
}

View File

@@ -0,0 +1,149 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Dublin Core metadata read from docProps/core.xml.
///
/// Holds the standard fields defined by the Dublin Core standard together with
/// Office-specific extensions. Every member is optional and null when absent.
/// </summary>
public sealed record CoreProperties
{
    /// <summary>Title of the document.</summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>Subject or topic of the document.</summary>
    [JsonPropertyName("subject")]
    public string? Subject { get; init; }

    /// <summary>Creator (author) of the document.</summary>
    [JsonPropertyName("creator")]
    public string? Creator { get; init; }

    /// <summary>Keywords or tags.</summary>
    [JsonPropertyName("keywords")]
    public string? Keywords { get; init; }

    /// <summary>Description or abstract of the document.</summary>
    [JsonPropertyName("description")]
    public string? Description { get; init; }

    /// <summary>User who last modified the document.</summary>
    [JsonPropertyName("last_modified_by")]
    public string? LastModifiedBy { get; init; }

    /// <summary>Revision number.</summary>
    [JsonPropertyName("revision")]
    public string? Revision { get; init; }

    /// <summary>Creation timestamp (ISO 8601).</summary>
    [JsonPropertyName("created")]
    public string? Created { get; init; }

    /// <summary>Timestamp of the last modification (ISO 8601).</summary>
    [JsonPropertyName("modified")]
    public string? Modified { get; init; }

    /// <summary>Category of the document.</summary>
    [JsonPropertyName("category")]
    public string? Category { get; init; }

    /// <summary>Content status (Draft, Final, etc.).</summary>
    [JsonPropertyName("content_status")]
    public string? ContentStatus { get; init; }

    /// <summary>Language of the document.</summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; }

    /// <summary>Unique identifier.</summary>
    [JsonPropertyName("identifier")]
    public string? Identifier { get; init; }

    /// <summary>Version of the document.</summary>
    [JsonPropertyName("version")]
    public string? Version { get; init; }

    /// <summary>Timestamp of the last print (ISO 8601).</summary>
    [JsonPropertyName("last_printed")]
    public string? LastPrinted { get; init; }

    /// <summary>
    /// Deserialise a <see cref="CoreProperties"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document holding the core properties.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static CoreProperties FromJson(string json)
    {
        CoreProperties? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CoreProperties>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse CoreProperties from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse CoreProperties from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing a CSV/TSV file.
/// </summary>
public sealed record CsvMetadata
{
    /// <summary>Bound to the <c>row_count</c> JSON property; 0 when absent.</summary>
    [JsonPropertyName("row_count")]
    public uint RowCount { get; init; }

    /// <summary>Bound to the <c>column_count</c> JSON property; 0 when absent.</summary>
    [JsonPropertyName("column_count")]
    public uint ColumnCount { get; init; }

    /// <summary>Bound to the <c>delimiter</c> JSON property; null when absent.</summary>
    [JsonPropertyName("delimiter")]
    public string? Delimiter { get; init; }

    /// <summary>Bound to the <c>has_header</c> JSON property; false when absent.</summary>
    [JsonPropertyName("has_header")]
    public bool HasHeader { get; init; }

    /// <summary>Bound to the <c>column_types</c> JSON property; null when absent.</summary>
    [JsonPropertyName("column_types")]
    public List<string>? ColumnTypes { get; init; }

    /// <summary>
    /// Deserialise a <see cref="CsvMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document holding the CSV metadata.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static CsvMetadata FromJson(string json)
    {
        CsvMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<CsvMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse CsvMetadata from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse CsvMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Information about a single dBASE field.
/// </summary>
public sealed record DbfFieldInfo
{
    /// <summary>Required member bound to the <c>name</c> JSON property.</summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>Required member bound to the <c>field_type</c> JSON property.</summary>
    [JsonPropertyName("field_type")]
    public required string FieldType { get; init; }

    /// <summary>
    /// Deserialise a <see cref="DbfFieldInfo"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the field.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DbfFieldInfo FromJson(string json)
    {
        DbfFieldInfo? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DbfFieldInfo>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DbfFieldInfo from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DbfFieldInfo from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing a dBASE (DBF) file.
/// </summary>
public sealed record DbfMetadata
{
    /// <summary>Bound to the <c>record_count</c> JSON property; 0 when absent.</summary>
    [JsonPropertyName("record_count")]
    public ulong RecordCount { get; init; }

    /// <summary>Bound to the <c>field_count</c> JSON property; 0 when absent.</summary>
    [JsonPropertyName("field_count")]
    public ulong FieldCount { get; init; }

    /// <summary>Bound to the <c>fields</c> JSON property; an empty list when absent.</summary>
    [JsonPropertyName("fields")]
    public List<DbfFieldInfo> Fields { get; init; } = [];

    /// <summary>
    /// Deserialise a <see cref="DbfMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document holding the DBF metadata.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DbfMetadata FromJson(string json)
    {
        DbfMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DbfMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DbfMetadata from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DbfMetadata from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Response produced by MIME type detection.
/// </summary>
public sealed record DetectResponse
{
    /// <summary>The MIME type that was detected.</summary>
    [JsonPropertyName("mime_type")]
    public required string MimeType { get; init; }

    /// <summary>The original filename, if one was provided; otherwise null.</summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; }

    /// <summary>
    /// Deserialise a <see cref="DetectResponse"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document holding the detection response.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DetectResponse FromJson(string json)
    {
        DetectResponse? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DetectResponse>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DetectResponse from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DetectResponse from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Detection result for one page: every detection plus the page metadata.
/// </summary>
public sealed record DetectionResult
{
    /// <summary>Bound to the <c>page_width</c> JSON property; 0 when absent.</summary>
    [JsonPropertyName("page_width")]
    public uint PageWidth { get; init; }

    /// <summary>Bound to the <c>page_height</c> JSON property; 0 when absent.</summary>
    [JsonPropertyName("page_height")]
    public uint PageHeight { get; init; }

    /// <summary>Bound to the <c>detections</c> JSON property; an empty list when absent.</summary>
    [JsonPropertyName("detections")]
    public List<LayoutDetection> Detections { get; init; } = [];

    /// <summary>
    /// Deserialise a <see cref="DetectionResult"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document holding the detection result.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DetectionResult FromJson(string json)
    {
        DetectionResult? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DetectionResult>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DetectionResult from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DetectionResult from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One contiguous hunk of a unified diff.
/// </summary>
public sealed record DiffHunk
{
    /// <summary>First line of the hunk in the old content (0-indexed).</summary>
    [JsonPropertyName("from_line")]
    public ulong FromLine { get; init; }

    /// <summary>How many lines of the old content this hunk covers.</summary>
    [JsonPropertyName("from_count")]
    public ulong FromCount { get; init; }

    /// <summary>First line of the hunk in the new content (0-indexed).</summary>
    [JsonPropertyName("to_line")]
    public ulong ToLine { get; init; }

    /// <summary>How many lines of the new content this hunk covers.</summary>
    [JsonPropertyName("to_count")]
    public ulong ToCount { get; init; }

    /// <summary>The lines making up this hunk; an empty list when absent.</summary>
    [JsonPropertyName("lines")]
    public List<DiffLine> Lines { get; init; } = [];

    /// <summary>
    /// Deserialise a <see cref="DiffHunk"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the hunk.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DiffHunk FromJson(string json)
    {
        DiffHunk? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DiffHunk>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DiffHunk from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DiffHunk from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

168
packages/csharp/src/Kreuzberg/DiffLine.cs generated Normal file
View File

@@ -0,0 +1,168 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single line in a unified-diff hunk.
///
/// Defined here (rather than only in `crate.diff`) so `RevisionDelta` can
/// reference it unconditionally, without requiring the `diff` Cargo feature.
/// `crate.diff` re-exports this type verbatim.
/// </summary>
/// <remarks>
/// (De)serialised by <see cref="DiffLineJsonConverter"/>: a <c>kind</c> discriminator
/// plus one sibling property carrying the line text.
/// </remarks>
[JsonConverter(typeof(DiffLineJsonConverter))]
public abstract record DiffLine
{
    /// <summary>
    /// Unchanged context line.
    /// </summary>
    public sealed record Context(
        string Value
    ) : DiffLine;

    /// <summary>
    /// Line added in the "after" version.
    /// </summary>
    public sealed record Added(
        string Value
    ) : DiffLine;

    /// <summary>
    /// Line removed from the "before" version.
    /// </summary>
    public sealed record Removed(
        string Value
    ) : DiffLine;

    /// <summary>Returns the Context data if this is a Context variant, otherwise null.</summary>
    public string? AsContext => this is Context e ? e.Value : null;

    /// <summary>Returns the Added data if this is an Added variant, otherwise null.</summary>
    public string? AsAdded => this is Added e ? e.Value : null;

    /// <summary>Returns the Removed data if this is a Removed variant, otherwise null.</summary>
    public string? AsRemoved => this is Removed e ? e.Value : null;
}

/// <summary>
/// Custom converter for the <see cref="DiffLine"/> sealed union.
/// </summary>
/// <remarks>
/// The wire form is a JSON object with a <c>kind</c> discriminator
/// (<c>context</c>, <c>added</c> or <c>removed</c>) and one sibling property holding
/// the line text. Every variant wraps a plain <see cref="string"/>, so the payload is
/// read and written directly instead of being routed through a nested object
/// deserialisation (which can never bind an object to a string member).
/// </remarks>
public sealed class DiffLineJsonConverter : JsonConverter<DiffLine>
{
    /// <summary>Name of the discriminator property.</summary>
    private const string DiscriminatorPropertyName = "kind";

    // NOTE(review): "value" is an assumption. The key the Rust side uses for the
    // payload is not visible in this file; confirm it against the serde attributes
    // of the Rust `DiffLine` enum. Read() accepts any key, only Write() depends on it.
    private const string PayloadPropertyName = "value";

    /// <summary>
    /// Reads a <see cref="DiffLine"/> from a JSON object of the form
    /// <c>{"kind": "...", "&lt;payload&gt;": "text"}</c>.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the object.</param>
    /// <param name="typeToConvert">Unused; always <see cref="DiffLine"/>.</param>
    /// <param name="options">Unused; the payload is a plain string.</param>
    /// <returns>The variant selected by the discriminator.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an object, the discriminator is missing, null, not a
    /// string or unknown, or the payload is missing, ambiguous or not a string.
    /// </exception>
    public override DiffLine Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(DiscriminatorPropertyName, out var tagElement))
        {
            throw new JsonException("Missing discriminator field: kind");
        }

        // GetString() throws InvalidOperationException for non-string tokens; report
        // that as a JsonException like every other malformed-input case.
        if (tagElement.ValueKind is not (JsonValueKind.String or JsonValueKind.Null))
        {
            throw new JsonException($"Discriminator field must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString() ?? throw new JsonException("Discriminator field is null");

        // Only the matching arm is evaluated, so an unknown tag is reported before
        // the payload is inspected.
        return tagValue switch
        {
            "context" => new DiffLine.Context(ReadPayload(root)),
            "added" => new DiffLine.Added(ReadPayload(root)),
            "removed" => new DiffLine.Removed(ReadPayload(root)),
            _ => throw new JsonException($"Unknown DiffLine discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Writes <paramref name="value"/> as <c>{"kind": "...", "value": "text"}</c>.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Unused; the payload is a plain string.</param>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, DiffLine value, JsonSerializerOptions options)
    {
        var (tag, text) = value switch
        {
            DiffLine.Context context => ("context", context.Value),
            DiffLine.Added added => ("added", added.Value),
            DiffLine.Removed removed => ("removed", removed.Value),
            _ => throw new JsonException($"Unknown DiffLine variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString(DiscriminatorPropertyName, tag);

        // A null payload (only reachable by bypassing nullable annotations) is
        // omitted, as before; a real string is now emitted instead of being dropped.
        if (text is not null)
        {
            writer.WriteString(PayloadPropertyName, text);
        }

        writer.WriteEndObject();
    }

    /// <summary>Returns the single string property that accompanies the discriminator.</summary>
    private static string ReadPayload(JsonElement root)
    {
        string? payload = null;
        var seen = false;
        foreach (var prop in root.EnumerateObject())
        {
            if (prop.Name == DiscriminatorPropertyName)
            {
                continue;
            }

            if (seen)
            {
                throw new JsonException("Expected exactly one payload field alongside the discriminator");
            }

            if (prop.Value.ValueKind != JsonValueKind.String)
            {
                throw new JsonException(
                    $"Expected string payload in field '{prop.Name}', got {prop.Value.ValueKind}");
            }

            payload = prop.Value.GetString();
            seen = true;
        }

        return payload ?? throw new JsonException("Missing payload field for DiffLine variant");
    }
}

View File

@@ -0,0 +1,87 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Options controlling how two `ExtractionResult` values are compared.
/// </summary>
public sealed record DiffOptions
{
    /// <summary>
    /// Include metadata changes in the diff. Default: `true`.
    /// </summary>
    [JsonPropertyName("include_metadata")]
    public bool IncludeMetadata { get; init; } = true;

    /// <summary>
    /// Include embedded-children changes in the diff. Default: `true`.
    /// </summary>
    [JsonPropertyName("include_embedded")]
    public bool IncludeEmbedded { get; init; } = true;

    /// <summary>
    /// Truncate content to this many characters before diffing.
    ///
    /// Useful for very large documents where only the first N characters matter.
    /// `null` means no truncation.
    /// </summary>
    [JsonPropertyName("max_content_chars")]
    public ulong? MaxContentChars { get; init; }

    /// <summary>
    /// Parse a <see cref="DiffOptions"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON document holding the options.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DiffOptions FromJson(string json)
    {
        DiffOptions? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DiffOptions>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DiffOptions from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DiffOptions from JSON: deserializer returned null");
    }

    /// <summary>Options used when deserialising; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Builds the default options as defined by the native library: obtains a native
    /// default instance, asks the native side for its JSON form and deserialises that.
    /// </summary>
    /// <returns>The default options; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native side yields no JSON text or the JSON deserialises to null.
    /// </exception>
    /// <exception cref="JsonException">When the JSON text returned by the native side is malformed.</exception>
    public static DiffOptions Default()
    {
        var nativeResult = NativeMethods.DiffOptionsDefault();
        string? json;
        try
        {
            var jsonPtr = NativeMethods.DiffOptionsToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Free the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Free the native options instance on every path, success or failure.
            NativeMethods.DiffOptionsFree(nativeResult);
        }

        // Previously a missing string was deserialised as the literal "null" and the
        // resulting null reference was returned through a null-forgiving operator.
        if (json is null)
        {
            throw new KreuzbergException("Failed to load default DiffOptions: native layer returned no JSON");
        }

        return JsonSerializer.Deserialize<DiffOptions>(json, JsonOptions)
            ?? throw new KreuzbergException("Failed to load default DiffOptions: deserializer returned null");
    }
}

View File

@@ -0,0 +1,114 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Comprehensive Djot document structure with semantic preservation.
///
/// Captures the full richness of Djot markup, including:
/// - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
/// - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
/// - Attributes (classes, IDs, key-value pairs)
/// - Links, images, footnotes
/// - Math expressions (inline and display)
/// - Tables with full structure
///
/// Available when the `djot` feature is enabled.
/// </summary>
public sealed record DjotContent
{
    /// <summary>Plain-text representation, kept for backwards compatibility.</summary>
    [JsonPropertyName("plain_text")]
    public required string PlainText { get; init; }

    /// <summary>Structured block-level content; an empty list when absent.</summary>
    [JsonPropertyName("blocks")]
    public List<FormattedBlock> Blocks { get; init; } = [];

    /// <summary>Metadata taken from the YAML frontmatter.</summary>
    [JsonPropertyName("metadata")]
    public required Metadata Metadata { get; init; }

    /// <summary>Tables extracted as structured data; an empty list when absent.</summary>
    [JsonPropertyName("tables")]
    public List<Table> Tables { get; init; } = [];

    /// <summary>Images extracted together with their metadata; an empty list when absent.</summary>
    [JsonPropertyName("images")]
    public List<DjotImage> Images { get; init; } = [];

    /// <summary>Links extracted together with their URLs; an empty list when absent.</summary>
    [JsonPropertyName("links")]
    public List<DjotLink> Links { get; init; } = [];

    /// <summary>Footnote definitions; an empty list when absent.</summary>
    [JsonPropertyName("footnotes")]
    public List<Footnote> Footnotes { get; init; } = [];

    /// <summary>Attributes mapped by element identifier (if present); otherwise null.</summary>
    [JsonPropertyName("attributes")]
    public List<string>? Attributes { get; init; }

    /// <summary>
    /// Deserialise a <see cref="DjotContent"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document holding the Djot content.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DjotContent FromJson(string json)
    {
        DjotContent? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DjotContent>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DjotContent from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DjotContent from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// An image element in a Djot document.
/// </summary>
public sealed record DjotImage
{
    /// <summary>Source URL or path of the image.</summary>
    [JsonPropertyName("src")]
    public required string Src { get; init; }

    /// <summary>Alternative text.</summary>
    [JsonPropertyName("alt")]
    public required string Alt { get; init; }

    /// <summary>Optional title; null when absent.</summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>Element attributes; null when absent.</summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Deserialise a <see cref="DjotImage"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the image.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DjotImage FromJson(string json)
    {
        DjotImage? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DjotImage>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DjotImage from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DjotImage from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,80 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A link element in a Djot document.
/// </summary>
public sealed record DjotLink
{
    /// <summary>URL the link points to.</summary>
    [JsonPropertyName("url")]
    public required string Url { get; init; }

    /// <summary>Text content of the link.</summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>Optional title; null when absent.</summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>Element attributes; null when absent.</summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Deserialise a <see cref="DjotLink"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the link.</param>
    /// <returns>The parsed record; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserializer yields null.
    /// </exception>
    public static DjotLink FromJson(string json)
    {
        DjotLink? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<DjotLink>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // A KreuzbergException is left untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse DjotLink from JSON: {e.Message}", e);
        }
        return parsed
            ?? throw new KreuzbergException("Failed to parse DjotLink from JSON: deserializer returned null");
    }

    /// <summary>Options used by <see cref="FromJson"/>; enums are mapped as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are omitted
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };
}

View File

@@ -0,0 +1,124 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One node of the document tree.
///
/// A node carries a deterministic <c>id</c>, typed <c>content</c>, optional
/// <c>parent</c>/<c>children</c> links that form the tree, and metadata such as
/// page number, bounding box, and content layer.
/// </summary>
public sealed record DocumentNode
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Deterministic identifier (hash of content + position).</summary>
    [JsonPropertyName("id")]
    public required string Id { get; init; }

    /// <summary>Node content: a tagged enum holding type-specific data only.</summary>
    [JsonPropertyName("content")]
    public required NodeContent Content { get; init; }

    /// <summary>Index of the parent node; <c>null</c> for a root-level node.</summary>
    [JsonPropertyName("parent")]
    public uint? Parent { get; init; }

    /// <summary>Indices of the child nodes, in reading order. Empty by default.</summary>
    [JsonPropertyName("children")]
    public List<uint> Children { get; init; } = new();

    /// <summary>Content layer classification, when known.</summary>
    [JsonPropertyName("content_layer")]
    public ContentLayer? ContentLayer { get; init; }

    /// <summary>Page on which this node starts (1-indexed).</summary>
    [JsonPropertyName("page")]
    public uint? Page { get; init; }

    /// <summary>Page on which this node ends (for multi-page tables/sections).</summary>
    [JsonPropertyName("page_end")]
    public uint? PageEnd { get; init; }

    /// <summary>Bounding box in document coordinates.</summary>
    [JsonPropertyName("bbox")]
    public BoundingBox? Bbox { get; init; }

    /// <summary>
    /// Inline annotations (formatting, links) applied to this node's text content.
    ///
    /// Only meaningful for text-carrying nodes; empty for containers.
    /// </summary>
    [JsonPropertyName("annotations")]
    public List<TextAnnotation> Annotations { get; init; } = new();

    /// <summary>
    /// Format-specific key-value attributes.
    ///
    /// An extensible bag for data that has no dedicated typed field: CSS classes,
    /// LaTeX environment names, Excel cell formulas, slide layout names, etc.
    /// </summary>
    [JsonPropertyName("attributes")]
    public Dictionary<string, string>? Attributes { get; init; }

    /// <summary>
    /// Builds a <see cref="DocumentNode"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the node.</param>
    /// <returns>The deserialised node.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentNode FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<DocumentNode>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse DocumentNode from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse DocumentNode from JSON: deserializer returned null");
    }
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A resolved relationship linking two nodes of the document tree.
/// </summary>
public sealed record DocumentRelationship
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Index of the source node (the referencing node). Defaults to 0.</summary>
    [JsonPropertyName("source")]
    public uint Source { get; init; }

    /// <summary>Index of the target node (the referenced node). Defaults to 0.</summary>
    [JsonPropertyName("target")]
    public uint Target { get; init; }

    /// <summary>Semantic kind of the relationship.</summary>
    [JsonPropertyName("kind")]
    public required RelationshipKind Kind { get; init; }

    /// <summary>
    /// Builds a <see cref="DocumentRelationship"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the relationship.</param>
    /// <returns>The deserialised relationship.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentRelationship FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<DocumentRelationship>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse DocumentRelationship from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse DocumentRelationship from JSON: deserializer returned null");
    }
}

View File

@@ -0,0 +1,110 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One tracked change embedded in a document.
///
/// Filled in by per-format extractors that understand change-tracking metadata
/// (DOCX <c>w:ins</c>/<c>w:del</c>/<c>w:rPrChange</c>, ODT <c>text:change-*</c>, …).
/// Every extractor defaults to <c>ExtractionResult.revisions = None</c> until a
/// format-specific implementation is added.
/// </summary>
public sealed record DocumentRevision
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Format-specific revision identifier.
    ///
    /// For DOCX this is the value of the <c>w:id</c> attribute on the change element
    /// (e.g. <c>"42"</c>). When the attribute is absent a synthetic fallback is
    /// generated (<c>"docx-ins-0"</c>, <c>"docx-del-3"</c>, …).
    /// </summary>
    [JsonPropertyName("revision_id")]
    public required string RevisionId { get; init; }

    /// <summary>Display name of the author of this change, when available.</summary>
    [JsonPropertyName("author")]
    public string? Author { get; init; }

    /// <summary>
    /// ISO-8601 timestamp of the change, when available.
    ///
    /// Kept as a plain string so the type stays FFI-friendly and is available
    /// without the optional <c>chrono</c> dependency. DOCX populates it from the
    /// <c>w:date</c> attribute (e.g. <c>"2024-03-15T10:30:00Z"</c>).
    /// </summary>
    [JsonPropertyName("timestamp")]
    public string? Timestamp { get; init; }

    /// <summary>Semantic kind of this revision.</summary>
    [JsonPropertyName("kind")]
    public required RevisionKind Kind { get; init; }

    /// <summary>
    /// Best-effort location of this revision within the document.
    ///
    /// Resolution depends on the format and may be <c>null</c> when the location
    /// cannot be determined (e.g. changes inside table cells before table-cell
    /// anchor support is added).
    /// </summary>
    [JsonPropertyName("anchor")]
    public RevisionAnchor? Anchor { get; init; }

    /// <summary>The content changes that make up this revision.</summary>
    [JsonPropertyName("delta")]
    public required RevisionDelta Delta { get; init; }

    /// <summary>
    /// Builds a <see cref="DocumentRevision"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the revision.</param>
    /// <returns>The deserialised revision.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentRevision FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<DocumentRevision>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse DocumentRevision from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse DocumentRevision from JSON: deserializer returned null");
    }
}

View File

@@ -0,0 +1,112 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Top-level structured document representation.
///
/// A flat array of nodes with index-based parent/child references forming a tree.
/// Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
/// to iterate over top-level content by layer.
///
/// # Validation
///
/// Call `validate()` after construction to verify all node indices are in bounds
/// and parent-child relationships are bidirectionally consistent.
///
/// NOTE(review): `body_roots()`, `furniture_roots()` and `validate()` are not declared on
/// this record; they presumably refer to the native library's API. Confirm before relying on them.
/// </summary>
public sealed record DocumentStructure
{
    /// <summary>
    /// All nodes in document/reading order.
    /// </summary>
    [JsonPropertyName("nodes")]
    public List<DocumentNode> Nodes { get; init; } = [];
    /// <summary>
    /// Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
    ///
    /// Allows renderers to apply format-aware heuristics when converting
    /// the document tree to output formats.
    /// </summary>
    [JsonPropertyName("source_format")]
    public string? SourceFormat { get; init; } = null;
    /// <summary>
    /// Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
    ///
    /// Populated during derivation from the internal document representation.
    /// Empty when no relationships are detected.
    /// </summary>
    [JsonPropertyName("relationships")]
    public List<DocumentRelationship> Relationships { get; init; } = [];
    /// <summary>
    /// Sorted, deduplicated list of node type names present in this document.
    ///
    /// Each value is the snake_case `node_type` tag of the corresponding
    /// `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
    ///
    /// Computed from `nodes` via `DocumentStructure.finalize_node_types`.
    /// Empty until that method is called (internal construction paths call it
    /// at the end of derivation).
    /// </summary>
    [JsonPropertyName("node_types")]
    public List<string> NodeTypes { get; init; } = [];
    /// <summary>
    /// Parse a <see cref="DocumentStructure"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the document structure.</param>
    /// <returns>The deserialised structure.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocumentStructure FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<DocumentStructure>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse DocumentStructure from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse DocumentStructure from JSON: {e.Message}", e);
        }
    }
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
    /// <summary>
    /// Obtain the default <see cref="DocumentStructure"/> from the native library.
    ///
    /// The native default value is converted to JSON and deserialised into a managed record.
    /// The native value and the native JSON string are released before returning, including
    /// when an intermediate step throws.
    /// </summary>
    /// <returns>The default structure; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native layer yields no JSON, or the JSON deserialises to <c>null</c>.
    /// </exception>
    public static DocumentStructure Default()
    {
        string? json;
        var nativeResult = NativeMethods.DocumentStructureDefault();
        try
        {
            var jsonPtr = NativeMethods.DocumentStructureToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native value even if JSON conversion throws.
            NativeMethods.DocumentStructureFree(nativeResult);
        }

        // A missing string is treated as the JSON literal null, which deserialises to null;
        // surface that as an error instead of returning null from a non-nullable method.
        return JsonSerializer.Deserialize<DocumentStructure>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to obtain default DocumentStructure: native layer returned no value");
    }
}

View File

@@ -0,0 +1,154 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// DOCX application properties read from docProps/app.xml.
///
/// Holds Word-specific document statistics and metadata.
/// </summary>
public sealed record DocxAppProperties
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Application name (e.g., "Microsoft Office Word").</summary>
    [JsonPropertyName("application")]
    public string? Application { get; init; }

    /// <summary>Application version.</summary>
    [JsonPropertyName("app_version")]
    public string? AppVersion { get; init; }

    /// <summary>Template filename.</summary>
    [JsonPropertyName("template")]
    public string? Template { get; init; }

    /// <summary>Total editing time, in minutes.</summary>
    [JsonPropertyName("total_time")]
    public int? TotalTime { get; init; }

    /// <summary>Number of pages.</summary>
    [JsonPropertyName("pages")]
    public int? Pages { get; init; }

    /// <summary>Number of words.</summary>
    [JsonPropertyName("words")]
    public int? Words { get; init; }

    /// <summary>Number of characters, excluding spaces.</summary>
    [JsonPropertyName("characters")]
    public int? Characters { get; init; }

    /// <summary>Number of characters, including spaces.</summary>
    [JsonPropertyName("characters_with_spaces")]
    public int? CharactersWithSpaces { get; init; }

    /// <summary>Number of lines.</summary>
    [JsonPropertyName("lines")]
    public int? Lines { get; init; }

    /// <summary>Number of paragraphs.</summary>
    [JsonPropertyName("paragraphs")]
    public int? Paragraphs { get; init; }

    /// <summary>Company name.</summary>
    [JsonPropertyName("company")]
    public string? Company { get; init; }

    /// <summary>Document security level.</summary>
    [JsonPropertyName("doc_security")]
    public int? DocSecurity { get; init; }

    /// <summary>Scale crop flag.</summary>
    [JsonPropertyName("scale_crop")]
    public bool? ScaleCrop { get; init; }

    /// <summary>Links-up-to-date flag.</summary>
    [JsonPropertyName("links_up_to_date")]
    public bool? LinksUpToDate { get; init; }

    /// <summary>Shared document flag.</summary>
    [JsonPropertyName("shared_doc")]
    public bool? SharedDoc { get; init; }

    /// <summary>Hyperlinks-changed flag.</summary>
    [JsonPropertyName("hyperlinks_changed")]
    public bool? HyperlinksChanged { get; init; }

    /// <summary>
    /// Builds a <see cref="DocxAppProperties"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the application properties.</param>
    /// <returns>The deserialised properties.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocxAppProperties FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<DocxAppProperties>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse DocxAppProperties from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse DocxAppProperties from JSON: deserializer returned null");
    }
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata of a Word document.
///
/// Extracted from DOCX files through the shared Office Open XML metadata extraction.
/// Integrates with the <c>office_metadata</c> module for core/app/custom properties.
/// </summary>
public sealed record DocxMetadata
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Core properties from docProps/core.xml (Dublin Core metadata).
    ///
    /// Holds title, creator, subject, keywords, dates, etc.
    /// The format is shared across DOCX/PPTX/XLSX documents.
    /// </summary>
    [JsonPropertyName("core_properties")]
    public CoreProperties? CoreProperties { get; init; }

    /// <summary>
    /// Application properties from docProps/app.xml (Word-specific statistics).
    ///
    /// Holds word count, page count, paragraph count, editing time, etc.
    /// This is the DOCX-specific variant of Office application properties.
    /// </summary>
    [JsonPropertyName("app_properties")]
    public DocxAppProperties? AppProperties { get; init; }

    /// <summary>
    /// Custom properties from docProps/custom.xml (user-defined properties).
    ///
    /// Holds key-value pairs defined by users or applications.
    /// Values can be strings, numbers, booleans, or dates.
    /// </summary>
    [JsonPropertyName("custom_properties")]
    public Dictionary<string, string>? CustomProperties { get; init; }

    /// <summary>
    /// Builds a <see cref="DocxMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the metadata.</param>
    /// <returns>The deserialised metadata.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static DocxMetadata FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<DocxMetadata>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse DocxMetadata from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse DocxMetadata from JSON: deserializer returned null");
    }
}

83
packages/csharp/src/Kreuzberg/Element.cs generated Normal file
View File

@@ -0,0 +1,83 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A semantic element extracted from a document.
///
/// Models a logical unit of content together with its semantic classification,
/// a unique identifier, and metadata tracking its origin and position.
/// </summary>
public sealed record Element
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Unique identifier of the element.</summary>
    [JsonPropertyName("element_id")]
    public required string ElementId { get; init; }

    /// <summary>Semantic type of the element.</summary>
    [JsonPropertyName("element_type")]
    public required ElementType ElementType { get; init; }

    /// <summary>Text content of the element.</summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>Metadata describing the element.</summary>
    [JsonPropertyName("metadata")]
    public required ElementMetadata Metadata { get; init; }

    /// <summary>
    /// Builds an <see cref="Element"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the element.</param>
    /// <returns>The deserialised element.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static Element FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<Element>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse Element from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse Element from JSON: deserializer returned null");
    }
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata for a semantic element.
/// </summary>
public sealed record ElementMetadata
{
    /// <summary>
    /// Page number (1-indexed)
    /// </summary>
    [JsonPropertyName("page_number")]
    public uint? PageNumber { get; init; } = null;
    /// <summary>
    /// Source filename or document name
    /// </summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; } = null;
    /// <summary>
    /// Bounding box coordinates if available
    /// </summary>
    [JsonPropertyName("coordinates")]
    public BoundingBox? Coordinates { get; init; } = null;
    /// <summary>
    /// Position index in the element sequence
    /// </summary>
    [JsonPropertyName("element_index")]
    public ulong? ElementIndex { get; init; } = null;
    /// <summary>
    /// Additional custom metadata.
    ///
    /// Defaults to an empty dictionary so the non-nullable property is never <c>null</c>
    /// when the JSON omits <c>additional</c> or the record is built without setting it.
    /// </summary>
    [JsonPropertyName("additional")]
    public Dictionary<string, string> Additional { get; init; } = new();
    /// <summary>
    /// Parse a <see cref="ElementMetadata"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the element metadata.</param>
    /// <returns>The deserialised metadata.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ElementMetadata FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ElementMetadata>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ElementMetadata from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ElementMetadata from JSON: {e.Message}", e);
        }
    }
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,121 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Classification of a semantic element.
///
/// Sorts text content into semantic units for downstream processing and covers
/// the element types commonly found in Unstructured documents.
/// JSON conversion is handled by <see cref="ElementTypeJsonConverter"/>.
/// </summary>
[JsonConverter(typeof(ElementTypeJsonConverter))]
public enum ElementType
{
    /// <summary>Document title.</summary>
    [JsonPropertyName("title")]
    Title = 0,

    /// <summary>Main narrative text body.</summary>
    [JsonPropertyName("narrative_text")]
    NarrativeText = 1,

    /// <summary>Section heading.</summary>
    [JsonPropertyName("heading")]
    Heading = 2,

    /// <summary>List item (bullet, numbered, etc.).</summary>
    [JsonPropertyName("list_item")]
    ListItem = 3,

    /// <summary>Table element.</summary>
    [JsonPropertyName("table")]
    Table = 4,

    /// <summary>Image element.</summary>
    [JsonPropertyName("image")]
    Image = 5,

    /// <summary>Page break marker.</summary>
    [JsonPropertyName("page_break")]
    PageBreak = 6,

    /// <summary>Code block.</summary>
    [JsonPropertyName("code_block")]
    CodeBlock = 7,

    /// <summary>Block quote.</summary>
    [JsonPropertyName("block_quote")]
    BlockQuote = 8,

    /// <summary>Footer text.</summary>
    [JsonPropertyName("footer")]
    Footer = 9,

    /// <summary>Header text.</summary>
    [JsonPropertyName("header")]
    Header = 10,
}
/// <summary>
/// JSON converter for <see cref="ElementType"/> that maps each variant to and from
/// its explicit snake_case wire name.
/// </summary>
internal sealed class ElementTypeJsonConverter : JsonConverter<ElementType>
{
    /// <summary>
    /// Reads the current token as a string and maps it to the matching <see cref="ElementType"/>.
    /// </summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override ElementType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "title":
                return ElementType.Title;
            case "narrative_text":
                return ElementType.NarrativeText;
            case "heading":
                return ElementType.Heading;
            case "list_item":
                return ElementType.ListItem;
            case "table":
                return ElementType.Table;
            case "image":
                return ElementType.Image;
            case "page_break":
                return ElementType.PageBreak;
            case "code_block":
                return ElementType.CodeBlock;
            case "block_quote":
                return ElementType.BlockQuote;
            case "footer":
                return ElementType.Footer;
            case "header":
                return ElementType.Header;
            default:
                throw new JsonException($"Unknown ElementType value: {wireName}");
        }
    }

    /// <summary>
    /// Writes <paramref name="value"/> as its snake_case wire name.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared variant.</exception>
    public override void Write(Utf8JsonWriter writer, ElementType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ElementType.Title:
                wireName = "title";
                break;
            case ElementType.NarrativeText:
                wireName = "narrative_text";
                break;
            case ElementType.Heading:
                wireName = "heading";
                break;
            case ElementType.ListItem:
                wireName = "list_item";
                break;
            case ElementType.Table:
                wireName = "table";
                break;
            case ElementType.Image:
                wireName = "image";
                break;
            case ElementType.PageBreak:
                wireName = "page_break";
                break;
            case ElementType.CodeBlock:
                wireName = "code_block";
                break;
            case ElementType.BlockQuote:
                wireName = "block_quote";
                break;
            case ElementType.Footer:
                wireName = "footer";
                break;
            case ElementType.Header:
                wireName = "header";
                break;
            default:
                throw new JsonException($"Unknown ElementType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,95 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// An attachment of an email message.
///
/// Carries the attachment's metadata and, optionally, its content.
/// </summary>
public sealed record EmailAttachment
{
    /// <summary>Options used by <see cref="FromJson(string)"/>; enum values travel as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serialising config/input objects sent over FFI. Null members are
    /// omitted (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised); explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Attachment name (from the Content-Disposition header).</summary>
    [JsonPropertyName("name")]
    public string? Name { get; init; }

    /// <summary>Filename of the attachment.</summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; }

    /// <summary>MIME type of the attachment.</summary>
    [JsonPropertyName("mime_type")]
    public string? MimeType { get; init; }

    /// <summary>Size in bytes.</summary>
    [JsonPropertyName("size")]
    public ulong? Size { get; init; }

    /// <summary>Whether this attachment is an image. Defaults to <c>false</c>.</summary>
    [JsonPropertyName("is_image")]
    public bool IsImage { get; init; }

    /// <summary>Attachment data, if it was extracted; otherwise <c>null</c>.</summary>
    [JsonPropertyName("data")]
    public byte[]? Data { get; init; }

    /// <summary>
    /// Builds an <see cref="EmailAttachment"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the attachment.</param>
    /// <returns>The deserialised attachment.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static EmailAttachment FromJson(string json)
    {
        try
        {
            var parsed = JsonSerializer.Deserialize<EmailAttachment>(json, JsonOptions);
            if (parsed is not null)
            {
                return parsed;
            }
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Anything other than our own exception type is wrapped so callers see one failure type.
            throw new KreuzbergException($"Failed to parse EmailAttachment from JSON: {e.Message}", e);
        }

        throw new KreuzbergException("Failed to parse EmailAttachment from JSON: deserializer returned null");
    }
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Settings that control email extraction.
/// </summary>
public sealed record EmailConfig
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so the caller's intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Windows codepage number used when an MSG file carries no codepage property.
    /// When left unset (<c>null</c>), extraction falls back to windows-1252.
    ///
    /// Supplying an unrecognized or invalid codepage number (including 0) silently
    /// falls back to windows-1252 as well — the same outcome as an MSG file that itself
    /// contains an unrecognized codepage. No error or warning is emitted, so verify the
    /// output when supplying unusual values.
    ///
    /// Common values:
    /// - 1250: Central European (Polish, Czech, Hungarian, etc.)
    /// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
    /// - 1252: Western European (default)
    /// - 1253: Greek
    /// - 1254: Turkish
    /// - 1255: Hebrew
    /// - 1256: Arabic
    /// - 932: Japanese (Shift-JIS)
    /// - 936: Simplified Chinese (GBK)
    /// </summary>
    [JsonPropertyName("msg_fallback_codepage")]
    public uint? MsgFallbackCodepage { get; init; }

    /// <summary>
    /// Deserialize an <see cref="EmailConfig"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the configuration.</param>
    /// <returns>The parsed configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON could not be deserialised, or it deserialised to <c>null</c>.
    /// </exception>
    public static EmailConfig FromJson(string json)
    {
        EmailConfig? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmailConfig>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse EmailConfig from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse EmailConfig from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Build serializer options with the given ignore condition and snake_case enum names.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,131 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Email extraction result.
///
/// Complete representation of an extracted email message (.eml or .msg)
/// including headers, body content, and attachments.
/// </summary>
public sealed record EmailExtractionResult
{
    /// <summary>
    /// Email subject line.
    /// </summary>
    [JsonPropertyName("subject")]
    public string? Subject { get; init; } = null;

    /// <summary>
    /// Sender email address.
    /// </summary>
    [JsonPropertyName("from_email")]
    public string? FromEmail { get; init; } = null;

    /// <summary>
    /// Primary recipient email addresses. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("to_emails")]
    public List<string> ToEmails { get; init; } = [];

    /// <summary>
    /// CC recipient email addresses. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("cc_emails")]
    public List<string> CcEmails { get; init; } = [];

    /// <summary>
    /// BCC recipient email addresses. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("bcc_emails")]
    public List<string> BccEmails { get; init; } = [];

    /// <summary>
    /// Email date/timestamp, kept as the raw string.
    /// </summary>
    [JsonPropertyName("date")]
    public string? Date { get; init; } = null;

    /// <summary>
    /// Message-ID header value.
    /// </summary>
    [JsonPropertyName("message_id")]
    public string? MessageId { get; init; } = null;

    /// <summary>
    /// Plain text version of the email body.
    /// </summary>
    [JsonPropertyName("plain_text")]
    public string? PlainText { get; init; } = null;

    /// <summary>
    /// HTML version of the email body.
    /// </summary>
    [JsonPropertyName("html_content")]
    public string? HtmlContent { get; init; } = null;

    /// <summary>
    /// Cleaned/processed text content. Aliased as `cleaned_text` for back-compat.
    /// </summary>
    // NOTE(review): only the "content" JSON name is mapped here; the `cleaned_text` alias is
    // presumably handled on the producing side — confirm.
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// List of email attachments. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("attachments")]
    public List<EmailAttachment> Attachments { get; init; } = [];

    /// <summary>
    /// Additional email headers and metadata. Empty when the JSON omits the field.
    /// </summary>
    // Initialised to an empty dictionary (previously `default!`, i.e. null) so this
    // non-nullable property agrees with the other collections on this record and callers
    // can enumerate it without a null check.
    [JsonPropertyName("metadata")]
    public Dictionary<string, string> Metadata { get; init; } = new();

    /// <summary>
    /// Parse an <see cref="EmailExtractionResult"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON document describing the extraction result.</param>
    /// <returns>The parsed result; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, or deserialises to <c>null</c>.
    /// </exception>
    public static EmailExtractionResult FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<EmailExtractionResult>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse EmailExtractionResult from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's own exception type: let it through unwrapped.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse EmailExtractionResult from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,100 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata extracted from .eml and .msg email files.
///
/// Covers sender and recipient information, the message ID, and the attachment list.
/// </summary>
public sealed record EmailMetadata
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so the caller's intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Email address of the sender.
    /// </summary>
    [JsonPropertyName("from_email")]
    public string? FromEmail { get; init; }

    /// <summary>
    /// Display name of the sender.
    /// </summary>
    [JsonPropertyName("from_name")]
    public string? FromName { get; init; }

    /// <summary>
    /// Primary recipients. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("to_emails")]
    public List<string> ToEmails { get; init; } = new();

    /// <summary>
    /// CC recipients. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("cc_emails")]
    public List<string> CcEmails { get; init; } = new();

    /// <summary>
    /// BCC recipients. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("bcc_emails")]
    public List<string> BccEmails { get; init; } = new();

    /// <summary>
    /// Value of the Message-ID header.
    /// </summary>
    [JsonPropertyName("message_id")]
    public string? MessageId { get; init; }

    /// <summary>
    /// File names of the attachments. Empty when the JSON omits the field.
    /// </summary>
    [JsonPropertyName("attachments")]
    public List<string> Attachments { get; init; } = new();

    /// <summary>
    /// Deserialize an <see cref="EmailMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the metadata.</param>
    /// <returns>The parsed metadata; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON could not be deserialised, or it deserialised to <c>null</c>.
    /// </exception>
    public static EmailMetadata FromJson(string json)
    {
        EmailMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmailMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse EmailMetadata from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse EmailMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Build serializer options with the given ignore condition and snake_case enum names.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,76 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Differences in embedded archive children between two results (`a` and `b`).
/// </summary>
public sealed record EmbeddedChanges
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so the caller's intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Children that exist in `b` but not in `a` (matched by `path`).
    /// </summary>
    [JsonPropertyName("added")]
    public List<ArchiveEntry> Added { get; init; } = new();

    /// <summary>
    /// Children that exist in `a` but not in `b` (matched by `path`).
    /// </summary>
    [JsonPropertyName("removed")]
    public List<ArchiveEntry> Removed { get; init; } = new();

    /// <summary>
    /// Children that exist in both results but whose content differs (matched by `path`).
    ///
    /// Each entry carries the diff of the nested `ExtractionResult`.
    /// </summary>
    [JsonPropertyName("changed")]
    public List<EmbeddedDiff> Changed { get; init; } = new();

    /// <summary>
    /// Deserialize an <see cref="EmbeddedChanges"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the changes.</param>
    /// <returns>The parsed changes; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON could not be deserialised, or it deserialised to <c>null</c>.
    /// </exception>
    public static EmbeddedChanges FromJson(string json)
    {
        EmbeddedChanges? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddedChanges>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse EmbeddedChanges from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse EmbeddedChanges from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Build serializer options with the given ignore condition and snake_case enum names.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Diff of one embedded archive entry that is present in both results.
/// </summary>
public sealed record EmbeddedDiff
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so the caller's intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// Archive-relative path that identifies the entry.
    /// </summary>
    [JsonPropertyName("path")]
    public required string Path { get; init; }

    /// <summary>
    /// Recursive diff of the entry's extraction result.
    /// </summary>
    [JsonPropertyName("diff")]
    public required ExtractionDiff Diff { get; init; }

    /// <summary>
    /// Deserialize an <see cref="EmbeddedDiff"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the entry diff.</param>
    /// <returns>The parsed diff; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON could not be deserialised, or it deserialised to <c>null</c>.
    /// </exception>
    public static EmbeddedDiff FromJson(string json)
    {
        EmbeddedDiff? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddedDiff>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse EmbeddedDiff from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse EmbeddedDiff from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Build serializer options with the given ignore condition and snake_case enum names.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Descriptor of an embedded file extracted from the PDF name tree.
/// </summary>
public sealed record EmbeddedFile
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so the caller's intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>
    /// File name exactly as stored in the PDF name tree.
    /// </summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>
    /// Raw file bytes from the embedded stream (already decompressed by lopdf).
    /// Empty when the JSON omits the field.
    /// </summary>
    [JsonConverter(typeof(ByteArrayToIntArrayConverter))]
    [JsonPropertyName("data")]
    public byte[] Data { get; init; } = Array.Empty<byte>();

    /// <summary>
    /// Byte count of the original stream while still compressed (before decompression).
    ///
    /// Lets callers compute the decompression ratio and detect zip-bomb-style attacks
    /// that embed a tiny compressed stream expanding to gigabytes of data.
    /// </summary>
    [JsonPropertyName("compressed_size")]
    public ulong CompressedSize { get; init; }

    /// <summary>
    /// MIME type when the filespec specifies one; otherwise <c>null</c>.
    /// </summary>
    [JsonPropertyName("mime_type")]
    public string? MimeType { get; init; }

    /// <summary>
    /// Deserialize an <see cref="EmbeddedFile"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the embedded file.</param>
    /// <returns>The parsed descriptor; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON could not be deserialised, or it deserialised to <c>null</c>.
    /// </exception>
    public static EmbeddedFile FromJson(string json)
    {
        EmbeddedFile? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddedFile>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse EmbeddedFile from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse EmbeddedFile from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Build serializer options with the given ignore condition and snake_case enum names.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,127 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Embedding configuration for text chunks.
///
/// Configures embedding generation using ONNX models via the vendored embedding engine.
/// Requires the `embeddings` feature to be enabled.
/// </summary>
public sealed record EmbeddingConfig
{
    /// <summary>
    /// The embedding model to use (defaults to "balanced" preset if not specified).
    /// </summary>
    [JsonPropertyName("model")]
    public EmbeddingModelType? Model { get; init; } = null;

    /// <summary>
    /// Whether to normalize embedding vectors (recommended for cosine similarity).
    /// </summary>
    [JsonPropertyName("normalize")]
    public bool Normalize { get; init; } = true;

    /// <summary>
    /// Batch size for embedding generation.
    /// </summary>
    [JsonPropertyName("batch_size")]
    public ulong BatchSize { get; init; } = 32;

    /// <summary>
    /// Show model download progress.
    /// </summary>
    [JsonPropertyName("show_download_progress")]
    public bool ShowDownloadProgress { get; init; } = false;

    /// <summary>
    /// Custom cache directory for model files.
    ///
    /// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
    /// Allows full customization of model download location.
    /// </summary>
    [JsonPropertyName("cache_dir")]
    public string? CacheDir { get; init; } = null;

    /// <summary>
    /// Hardware acceleration for the embedding ONNX model.
    ///
    /// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
    /// is used for inference. Defaults to <c>null</c> (auto-select per platform).
    /// </summary>
    [JsonPropertyName("acceleration")]
    public AccelerationConfig? Acceleration { get; init; } = null;

    /// <summary>
    /// Maximum wall-clock duration (in seconds) for a single `embed()` call when
    /// using `EmbeddingModelType.Plugin`.
    ///
    /// Applies only to the in-process plugin path — protects against hung
    /// host-language backends (e.g. a Python callback deadlocked on the GIL,
    /// a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
    /// returns `Plugin` instead of blocking forever.
    ///
    /// <c>null</c> disables the timeout. The default (60 seconds) is conservative
    /// for common in-process inference; increase for large batches on slow
    /// hardware.
    /// </summary>
    // NOTE(review): the C# initializer is null, while the text above names a 60-second
    // default — presumably that value comes from the native defaults (see Default()); confirm.
    [JsonPropertyName("max_embed_duration_secs")]
    public ulong? MaxEmbedDurationSecs { get; init; } = null;

    /// <summary>
    /// Parse an <see cref="EmbeddingConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON document describing the configuration.</param>
    /// <returns>The parsed configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, or deserialises to <c>null</c>.
    /// </exception>
    public static EmbeddingConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<EmbeddingConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse EmbeddingConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's own exception type: let it through unwrapped.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse EmbeddingConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when reading JSON in <see cref="FromJson"/>.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Create an <see cref="EmbeddingConfig"/> from the defaults reported by the native library.
    /// </summary>
    /// <remarks>
    /// Asks the native side for a default config, converts it to JSON, and parses that JSON
    /// with <see cref="FromJson"/>. The JSON string pointer and the native config value are
    /// released in <c>finally</c> blocks, so they are freed even when marshalling or parsing
    /// throws.
    /// </remarks>
    /// <returns>The default configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library returns no JSON, or the JSON cannot be deserialised.
    /// </exception>
    public static EmbeddingConfig Default()
    {
        var nativeResult = NativeMethods.EmbeddingConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.EmbeddingConfigToJson(nativeResult);
            string? json;
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                NativeMethods.FreeString(jsonPtr);
            }

            if (json is null)
            {
                // Previously "null" was substituted and the null result was returned through `!`.
                throw new KreuzbergException("Failed to obtain default EmbeddingConfig: native library returned no JSON");
            }

            return FromJson(json);
        }
        finally
        {
            NativeMethods.EmbeddingConfigFree(nativeResult);
        }
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception type for embedding failures; a specialisation of <see cref="KreuzbergErrorException"/>.
/// </summary>
public class EmbeddingException : KreuzbergErrorException
{
    /// <summary>
    /// Create the exception with an error message.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    public EmbeddingException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Create the exception with an error message and the exception that caused it.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="innerException">The underlying cause.</param>
    public EmbeddingException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,185 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Embedding model types supported by Kreuzberg.
/// </summary>
/// <remarks>
/// A closed set of variants modelled as nested sealed records. JSON conversion is handled by
/// <see cref="EmbeddingModelTypeJsonConverter"/>, which uses a <c>type</c> discriminator
/// (<c>preset</c>, <c>custom</c>, <c>llm</c>, <c>plugin</c>) alongside the variant's own fields.
/// </remarks>
[JsonConverter(typeof(EmbeddingModelTypeJsonConverter))]
public abstract record EmbeddingModelType
{
/// <summary>
/// Use a preset model configuration (recommended).
/// </summary>
/// <param name="Name">Preset name; read from and written to the <c>name</c> JSON field.</param>
public sealed record Preset(
[property: JsonPropertyName("name")] string Name
) : EmbeddingModelType;
/// <summary>
/// Use a custom ONNX model from HuggingFace.
/// </summary>
/// <param name="ModelId">Model identifier; mapped to the <c>model_id</c> JSON field.</param>
/// <param name="Dimensions">Embedding dimensions; mapped to the <c>dimensions</c> JSON field.</param>
public sealed record Custom(
[property: JsonPropertyName("model_id")] string ModelId,
[property: JsonPropertyName("dimensions")] ulong Dimensions
) : EmbeddingModelType;
/// <summary>
/// Provider-hosted embedding model via liter-llm.
///
/// Uses the model specified in the nested `LlmConfig` (e.g.,
/// `"openai/text-embedding-3-small"`).
/// </summary>
/// <param name="Value">The nested LLM configuration; mapped to the <c>llm</c> JSON field.</param>
public sealed record Llm(
[property: JsonPropertyName("llm")] LlmConfig Value
) : EmbeddingModelType;
/// <summary>
/// In-process embedding backend registered via the plugin system.
///
/// The caller registers an `EmbeddingBackend` once
/// (e.g. a wrapper around an already-loaded `llama-cpp-python`, `sentence-transformers`,
/// or tuned ONNX model), then references it by name in config. Kreuzberg calls back
/// into the registered backend during chunking and standalone embed requests —
/// no HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
///
/// When this variant is selected, only the following `EmbeddingConfig` fields
/// apply: `normalize` (post-call L2 normalization) and `max_embed_duration_secs`
/// (dispatcher timeout). Model-loading fields (`batch_size`, `cache_dir`,
/// `show_download_progress`, `acceleration`) are ignored — the host owns the
/// model lifecycle.
///
/// Semantic chunking falls back to `ChunkingConfig.max_characters` when this variant
/// is used, since there is no preset to look a chunk-size ceiling up against — size your
/// context window via `max_characters` directly.
///
/// See `register_embedding_backend`.
/// </summary>
/// <param name="Name">Name of the registered backend; mapped to the <c>name</c> JSON field.</param>
public sealed record Plugin(
[property: JsonPropertyName("name")] string Name
) : EmbeddingModelType;
}
/// <summary>
/// Custom converter for the <see cref="EmbeddingModelType"/> sealed union with flattened variant fields.
/// </summary>
/// <remarks>
/// Handles JSON objects that carry a discriminator field (<c>type</c>) and the
/// variant-specific fields at the same level, e.g. <c>{"type":"preset","name":"..."}</c>.
/// System.Text.Json's [JsonPolymorphic] cannot handle this layout, so the object is
/// split and re-assembled manually here.
/// </remarks>
public sealed class EmbeddingModelTypeJsonConverter : JsonConverter<EmbeddingModelType>
{
    /// <summary>Name of the JSON property that selects the variant.</summary>
    private const string DiscriminatorProperty = "type";

    /// <summary>
    /// Read one <see cref="EmbeddingModelType"/> variant from a tagged JSON object.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the JSON object.</param>
    /// <param name="typeToConvert">The type being converted (unused).</param>
    /// <param name="options">Serializer options, forwarded when deserializing the variant.</param>
    /// <returns>The variant selected by the <c>type</c> discriminator.</returns>
    /// <exception cref="JsonException">
    /// The token is not an object, the discriminator is missing, null or unknown,
    /// or the variant deserializes to <c>null</c>.
    /// </exception>
    public override EmbeddingModelType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(DiscriminatorProperty, out var tagElement))
        {
            throw new JsonException($"Missing discriminator field: type");
        }

        var tagValue = tagElement.GetString();
        if (tagValue == null)
        {
            throw new JsonException("Discriminator field is null");
        }

        // Every variant record maps its fields with [JsonPropertyName(...)], so the remaining
        // properties can be handed to the variant type as-is once the discriminator is removed.
        // (The former second, "Value"-wrapped copy of the payload was never read and is gone.)
        var variantJson = WriteWithoutDiscriminator(root);

        return tagValue switch
        {
            "preset" => DeserializeVariant<EmbeddingModelType.Preset>(variantJson, options),
            "custom" => DeserializeVariant<EmbeddingModelType.Custom>(variantJson, options),
            "llm" => DeserializeVariant<EmbeddingModelType.Llm>(variantJson, options),
            "plugin" => DeserializeVariant<EmbeddingModelType.Plugin>(variantJson, options),
            _ => throw new JsonException($"Unknown EmbeddingModelType discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Write a variant as a single JSON object: the <c>type</c> discriminator followed by
    /// the variant's own fields at the same level.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">The variant to write.</param>
    /// <param name="options">Serializer options, forwarded when serializing the variant's fields.</param>
    /// <exception cref="JsonException">The runtime type of <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, EmbeddingModelType value, JsonSerializerOptions options)
    {
        var tag = value switch
        {
            EmbeddingModelType.Preset => "preset",
            EmbeddingModelType.Custom => "custom",
            EmbeddingModelType.Llm => "llm",
            EmbeddingModelType.Plugin => "plugin",
            _ => throw new JsonException($"Unknown EmbeddingModelType variant: {value.GetType().Name}"),
        };

        writer.WriteStartObject();
        writer.WriteString(DiscriminatorProperty, tag);

        // Serialize using the runtime (variant) type, then copy its properties next to the
        // tag so the output is {"type":"...", <fields>} rather than a nested object.
        using var doc = JsonSerializer.SerializeToDocument(value, value.GetType(), options);
        if (doc.RootElement.ValueKind == JsonValueKind.Object)
        {
            foreach (var prop in doc.RootElement.EnumerateObject())
            {
                writer.WritePropertyName(prop.Name);
                prop.Value.WriteTo(writer);
            }
        }

        writer.WriteEndObject();
    }

    /// <summary>Copy <paramref name="root"/> to UTF-8 JSON, leaving out the discriminator property.</summary>
    private static byte[] WriteWithoutDiscriminator(JsonElement root)
    {
        using var buffer = new MemoryStream();
        using (var writer = new Utf8JsonWriter(buffer))
        {
            writer.WriteStartObject();
            foreach (var prop in root.EnumerateObject())
            {
                if (prop.Name != DiscriminatorProperty)
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
            writer.WriteEndObject();
            writer.Flush();
        }

        return buffer.ToArray();
    }

    /// <summary>Deserialize the stripped payload as variant <typeparamref name="TVariant"/>, rejecting null.</summary>
    private static TVariant DeserializeVariant<TVariant>(byte[] utf8Json, JsonSerializerOptions options)
        where TVariant : EmbeddingModelType
    {
        return JsonSerializer.Deserialize<TVariant>(utf8Json, options)
            ?? throw new JsonException("Failed to deserialize variant");
    }
}

View File

@@ -0,0 +1,95 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Preset configurations for common RAG use cases.
///
/// A preset bundles chunk size, overlap, and embedding model into one
/// configuration tuned for a specific scenario.
///
/// All string fields are owned `String` for FFI compatibility — instances
/// are safe to clone and pass across language boundaries.
/// </summary>
public sealed record EmbeddingPreset
{
    /// <summary>Options used by <see cref="FromJson"/> when reading JSON.</summary>
    private static readonly JsonSerializerOptions JsonOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingDefault);

    /// <summary>Options for serializing config/input objects to FFI. Null values are
    /// omitted (nullable C# fields default to null and would otherwise override required
    /// Rust fields with non-deserialisable nulls), while explicit false/0 values are kept
    /// so the caller's intent survives.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions =
        CreateOptions(JsonIgnoreCondition.WhenWritingNull);

    /// <summary>Value of the <c>name</c> JSON field (required).</summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>Value of the <c>chunk_size</c> JSON field; 0 when absent.</summary>
    [JsonPropertyName("chunk_size")]
    public ulong ChunkSize { get; init; }

    /// <summary>Value of the <c>overlap</c> JSON field; 0 when absent.</summary>
    [JsonPropertyName("overlap")]
    public ulong Overlap { get; init; }

    /// <summary>
    /// Name of the HuggingFace repository hosting the model.
    /// </summary>
    [JsonPropertyName("model_repo")]
    public required string ModelRepo { get; init; }

    /// <summary>
    /// Pooling strategy: "cls" or "mean".
    /// </summary>
    [JsonPropertyName("pooling")]
    public required string Pooling { get; init; }

    /// <summary>
    /// Path of the ONNX model file inside the repo.
    /// </summary>
    [JsonPropertyName("model_file")]
    public required string ModelFile { get; init; }

    /// <summary>Value of the <c>dimensions</c> JSON field; 0 when absent.</summary>
    [JsonPropertyName("dimensions")]
    public ulong Dimensions { get; init; }

    /// <summary>Value of the <c>description</c> JSON field (required).</summary>
    [JsonPropertyName("description")]
    public required string Description { get; init; }

    /// <summary>
    /// Deserialize an <see cref="EmbeddingPreset"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the preset.</param>
    /// <returns>The parsed preset; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// The JSON could not be deserialised, or it deserialised to <c>null</c>.
    /// </exception>
    public static EmbeddingPreset FromJson(string json)
    {
        EmbeddingPreset? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<EmbeddingPreset>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Any non-Kreuzberg failure is wrapped; Kreuzberg exceptions propagate untouched.
            throw new KreuzbergException($"Failed to parse EmbeddingPreset from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException($"Failed to parse EmbeddingPreset from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Build serializer options with the given ignore condition and snake_case enum names.</summary>
    private static JsonSerializerOptions CreateOptions(JsonIgnoreCondition ignoreCondition)
    {
        var options = new JsonSerializerOptions { DefaultIgnoreCondition = ignoreCondition };
        options.Converters.Add(new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower));
        return options;
    }
}

View File

@@ -0,0 +1,74 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata specific to EPUB documents (Dublin Core extensions).
/// </summary>
public sealed record EpubMetadata
{
    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable C#
    /// fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    [JsonPropertyName("coverage")]
    public string? Coverage { get; init; }

    [JsonPropertyName("dc_format")]
    public string? DcFormat { get; init; }

    [JsonPropertyName("relation")]
    public string? Relation { get; init; }

    [JsonPropertyName("source")]
    public string? Source { get; init; }

    [JsonPropertyName("dc_type")]
    public string? DcType { get; init; }

    [JsonPropertyName("cover_image")]
    public string? CoverImage { get; init; }

    /// <summary>
    /// Build an <see cref="EpubMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the metadata.</param>
    /// <returns>The deserialised metadata.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails or yields null. Any other exception raised while
    /// deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static EpubMetadata FromJson(string json)
    {
        EpubMetadata? metadata;
        try
        {
            metadata = JsonSerializer.Deserialize<EpubMetadata>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // A KreuzbergException passes through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse EpubMetadata from JSON: {cause.Message}", cause);
        }

        if (metadata is null)
        {
            throw new KreuzbergException("Failed to parse EpubMetadata from JSON: deserializer returned null");
        }

        return metadata;
    }
}

View File

@@ -0,0 +1,62 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing an error (used by batch operations).
/// </summary>
public sealed record ErrorMetadata
{
    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable C#
    /// fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    [JsonPropertyName("error_type")]
    public required string ErrorType { get; init; }

    [JsonPropertyName("message")]
    public required string Message { get; init; }

    /// <summary>
    /// Build an <see cref="ErrorMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the error.</param>
    /// <returns>The deserialised error metadata.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails or yields null. Any other exception raised while
    /// deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static ErrorMetadata FromJson(string json)
    {
        ErrorMetadata? metadata;
        try
        {
            metadata = JsonSerializer.Deserialize<ErrorMetadata>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // A KreuzbergException passes through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse ErrorMetadata from JSON: {cause.Message}", cause);
        }

        if (metadata is null)
        {
            throw new KreuzbergException("Failed to parse ErrorMetadata from JSON: deserializer returned null");
        }

        return metadata;
    }
}

View File

@@ -0,0 +1,71 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Format metadata for Excel/spreadsheet documents.
///
/// Marks the document as originating from a spreadsheet through the
/// `FormatMetadata.Excel` discriminant. The sheet count and the sheet names live in
/// this record.
/// </summary>
public sealed record ExcelMetadata
{
    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable C#
    /// fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// How many sheets the workbook contains.
    /// </summary>
    [JsonPropertyName("sheet_count")]
    public uint? SheetCount { get; init; }

    /// <summary>
    /// The names of every sheet in the workbook.
    /// </summary>
    [JsonPropertyName("sheet_names")]
    public List<string>? SheetNames { get; init; }

    /// <summary>
    /// Build an <see cref="ExcelMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the metadata.</param>
    /// <returns>The deserialised metadata.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails or yields null. Any other exception raised while
    /// deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static ExcelMetadata FromJson(string json)
    {
        ExcelMetadata? metadata;
        try
        {
            metadata = JsonSerializer.Deserialize<ExcelMetadata>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // A KreuzbergException passes through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse ExcelMetadata from JSON: {cause.Message}", cause);
        }

        if (metadata is null)
        {
            throw new KreuzbergException("Failed to parse ExcelMetadata from JSON: deserializer returned null");
        }

        return metadata;
    }
}

View File

@@ -0,0 +1,97 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A single worksheet of an Excel workbook.
///
/// Holds the sheet's content rendered as Markdown together with
/// statistics about its dimensions.
/// </summary>
public sealed record ExcelSheet
{
    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable C#
    /// fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// The sheet's name, exactly as shown in Excel.
    /// </summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>
    /// The sheet's content rendered as Markdown tables.
    /// </summary>
    [JsonPropertyName("markdown")]
    public required string Markdown { get; init; }

    /// <summary>
    /// Row count of the sheet.
    /// </summary>
    [JsonPropertyName("row_count")]
    public ulong RowCount { get; init; }

    /// <summary>
    /// Column count of the sheet.
    /// </summary>
    [JsonPropertyName("col_count")]
    public ulong ColCount { get; init; }

    /// <summary>
    /// Total count of cells that are not empty.
    /// </summary>
    [JsonPropertyName("cell_count")]
    public ulong CellCount { get; init; }

    /// <summary>
    /// Table cells extracted ahead of time, as a two-dimensional list of cell values.
    /// Filled in while the Markdown is generated so that the Markdown never has to be
    /// parsed again. null for empty sheets.
    /// </summary>
    [JsonPropertyName("table_cells")]
    public List<List<string>>? TableCells { get; init; }

    /// <summary>
    /// Build an <see cref="ExcelSheet"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the sheet.</param>
    /// <returns>The deserialised sheet.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails or yields null. Any other exception raised while
    /// deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static ExcelSheet FromJson(string json)
    {
        ExcelSheet? sheet;
        try
        {
            sheet = JsonSerializer.Deserialize<ExcelSheet>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // A KreuzbergException passes through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse ExcelSheet from JSON: {cause.Message}", cause);
        }

        if (sheet is null)
        {
            throw new KreuzbergException("Failed to parse ExcelSheet from JSON: deserializer returned null");
        }

        return sheet;
    }
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Excel workbook representation.
///
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
/// extracted content and metadata.
/// </summary>
public sealed record ExcelWorkbook
{
    /// <summary>
    /// All sheets in the workbook. Defaults to an empty list.
    /// </summary>
    [JsonPropertyName("sheets")]
    public List<ExcelSheet> Sheets { get; init; } = [];

    /// <summary>
    /// Workbook-level metadata (author, creation date, etc.).
    ///
    /// Defaults to an empty dictionary, so the property is non-null when the instance is
    /// created without setting it or when the JSON payload omits the <c>metadata</c> key.
    /// </summary>
    [JsonPropertyName("metadata")]
    public Dictionary<string, string> Metadata { get; init; } = new();

    /// <summary>
    /// Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
    ///
    /// Populated for legacy shared-workbook `.xlsx` files that contain the
    /// `xl/revisions/` directory. Each `&lt;header&gt;` element maps to one
    /// `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
    /// (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
    /// `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
    /// follow-up). `None` (null on the C# side) when
    /// `xl/revisions/revisionHeaders.xml` is absent.
    /// </summary>
    [JsonPropertyName("revisions")]
    public List<DocumentRevision>? Revisions { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="ExcelWorkbook"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the workbook.</param>
    /// <returns>The deserialised workbook.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised or the deserialiser returns null. Any other
    /// exception raised while deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static ExcelWorkbook FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ExcelWorkbook>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse ExcelWorkbook from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already the library's own exception type: let it propagate unchanged.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ExcelWorkbook from JSON: {e.Message}", e);
        }
    }

    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.
    /// Not referenced by any member of this record.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Kind of ONNX Runtime execution provider.
///
/// Selects the hardware backend that model inference runs on.
/// `Auto` (the default) picks the best provider available on the platform.
/// </summary>
[JsonConverter(typeof(ExecutionProviderTypeJsonConverter))]
public enum ExecutionProviderType
{
    /// <summary>
    /// Automatic selection: CoreML on macOS, CUDA on Linux, CPU everywhere else.
    /// </summary>
    [JsonPropertyName("auto")]
    Auto = 0,

    /// <summary>
    /// The CPU execution provider, which is always available.
    /// </summary>
    [JsonPropertyName("cpu")]
    Cpu = 1,

    /// <summary>
    /// Apple CoreML (Neural Engine + GPU on macOS/iOS).
    /// </summary>
    [JsonPropertyName("coreml")]
    CoreMl = 2,

    /// <summary>
    /// GPU acceleration through NVIDIA CUDA.
    /// </summary>
    [JsonPropertyName("cuda")]
    Cuda = 3,

    /// <summary>
    /// NVIDIA TensorRT (optimized inference on CUDA).
    /// </summary>
    [JsonPropertyName("tensorrt")]
    TensorRt = 4,
}

/// <summary>
/// JSON converter for <see cref="ExecutionProviderType"/> that reads and writes each
/// variant using its explicit lowercase wire name.
/// </summary>
internal sealed class ExecutionProviderTypeJsonConverter : JsonConverter<ExecutionProviderType>
{
    /// <summary>
    /// Read a JSON string token and map it to the matching enum member.
    /// </summary>
    /// <exception cref="JsonException">When the string is not one of the known wire names.</exception>
    public override ExecutionProviderType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "auto":
                return ExecutionProviderType.Auto;
            case "cpu":
                return ExecutionProviderType.Cpu;
            case "coreml":
                return ExecutionProviderType.CoreMl;
            case "cuda":
                return ExecutionProviderType.Cuda;
            case "tensorrt":
                return ExecutionProviderType.TensorRt;
            default:
                throw new JsonException($"Unknown ExecutionProviderType value: {wireName}");
        }
    }

    /// <summary>
    /// Write the enum member as its wire-name JSON string.
    /// </summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ExecutionProviderType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ExecutionProviderType.Auto:
                wireName = "auto";
                break;
            case ExecutionProviderType.Cpu:
                wireName = "cpu";
                break;
            case ExecutionProviderType.CoreMl:
                wireName = "coreml";
                break;
            case ExecutionProviderType.Cuda:
                wireName = "cuda";
                break;
            case ExecutionProviderType.TensorRt:
                wireName = "tensorrt";
                break;
            default:
                throw new JsonException($"Unknown ExecutionProviderType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,166 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// An image extracted from a document.
///
/// Carries the raw image data, its metadata, and optionally a nested OCR result.
/// Exposing raw bytes keeps the type usable across languages: callers can convert to
/// PIL.Image (Python), Sharp (Node.js), or any other format they need.
/// </summary>
public sealed record ExtractedImage
{
    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable C#
    /// fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Raw image bytes (PNG, JPEG, WebP, etc.).
    /// The native library uses `bytes.Bytes` here so large buffers are cheap to clone.
    /// </summary>
    [JsonConverter(typeof(ByteArrayToIntArrayConverter))]
    [JsonPropertyName("data")]
    public byte[] Data { get; init; } = [];

    /// <summary>
    /// Format of the image (e.g., "jpeg", "png", "webp").
    /// The native library uses a `Cow` string here to avoid allocating for static literals.
    /// </summary>
    [JsonPropertyName("format")]
    public required string Format { get; init; }

    /// <summary>
    /// Position of this image within the document/page, counted from zero.
    /// </summary>
    [JsonPropertyName("image_index")]
    public uint ImageIndex { get; init; }

    /// <summary>
    /// Number of the page/slide the image was found on (1-indexed).
    /// </summary>
    [JsonPropertyName("page_number")]
    public uint? PageNumber { get; init; }

    /// <summary>
    /// Width of the image, in pixels.
    /// </summary>
    [JsonPropertyName("width")]
    public uint? Width { get; init; }

    /// <summary>
    /// Height of the image, in pixels.
    /// </summary>
    [JsonPropertyName("height")]
    public uint? Height { get; init; }

    /// <summary>
    /// Colorspace of the image (e.g., "RGB", "CMYK", "Gray").
    /// </summary>
    [JsonPropertyName("colorspace")]
    public string? Colorspace { get; init; }

    /// <summary>
    /// Number of bits per color component (e.g., 8, 16).
    /// </summary>
    [JsonPropertyName("bits_per_component")]
    public uint? BitsPerComponent { get; init; }

    /// <summary>
    /// True when this image is a mask image.
    /// </summary>
    [JsonPropertyName("is_mask")]
    public bool IsMask { get; init; }

    /// <summary>
    /// Description of the image, if one is available.
    /// </summary>
    [JsonPropertyName("description")]
    public string? Description { get; init; }

    /// <summary>
    /// Nested OCR extraction result, present when the image was OCRed.
    ///
    /// The result of running OCR on this image is embedded here instead of being kept
    /// in a separate collection, which makes the relationship explicit.
    /// </summary>
    [JsonPropertyName("ocr_result")]
    public ExtractionResult? OcrResult { get; init; }

    /// <summary>
    /// Bounding box of the image on its page, in PDF coordinates
    /// (x0=left, y0=bottom, x1=right, y1=top).
    /// Set only for images extracted from PDFs, and only when the PDF extractor
    /// provides position data.
    /// </summary>
    [JsonPropertyName("bounding_box")]
    public BoundingBox? BoundingBox { get; init; }

    /// <summary>
    /// Path of the image inside the document archive it came from
    /// (e.g., "media/image1.png" in DOCX).
    /// Lets image references be rendered when the binary data is not extracted.
    /// </summary>
    [JsonPropertyName("source_path")]
    public string? SourcePath { get; init; }

    /// <summary>
    /// Heuristic classification of what the image probably depicts.
    /// `None` when classification was disabled or inconclusive.
    /// </summary>
    [JsonPropertyName("image_kind")]
    public ImageKind? ImageKind { get; init; }

    /// <summary>
    /// Confidence score of `image_kind`, between 0.0 and 1.0.
    /// </summary>
    [JsonPropertyName("kind_confidence")]
    public float? KindConfidence { get; init; }

    /// <summary>
    /// Identifier shared by all images that together make up one logical figure
    /// (e.g. every raster tile of a single technical drawing). `None` for singletons.
    /// </summary>
    [JsonPropertyName("cluster_id")]
    public uint? ClusterId { get; init; }

    /// <summary>
    /// Build an <see cref="ExtractedImage"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the image.</param>
    /// <returns>The deserialised image.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails or yields null. Any other exception raised while
    /// deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static ExtractedImage FromJson(string json)
    {
        ExtractedImage? image;
        try
        {
            image = JsonSerializer.Deserialize<ExtractedImage>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // A KreuzbergException passes through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse ExtractedImage from JSON: {cause.Message}", cause);
        }

        if (image is null)
        {
            throw new KreuzbergException("Failed to parse ExtractedImage from JSON: deserializer returned null");
        }

        return image;
    }
}

View File

@@ -0,0 +1,84 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A URI found in a document.
///
/// Covers any link, reference, or resource pointer discovered during extraction.
/// `kind` gives the semantic classification of the URI, and `label` holds
/// optional human-readable display text.
/// </summary>
public sealed record ExtractedUri
{
    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are mapped through
    /// a snake_case string converter.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Nulls are stripped (nullable C#
    /// fields default to null and would override required Rust fields with
    /// non-deserialisable nulls), while explicit false/0 values are kept so caller intent
    /// survives. Not referenced by any member of this record.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// The URL or path, as a string.
    /// </summary>
    [JsonPropertyName("url")]
    public required string Url { get; init; }

    /// <summary>
    /// Display text / label of the link, if any.
    /// </summary>
    [JsonPropertyName("label")]
    public string? Label { get; init; }

    /// <summary>
    /// Page number the URI was found on (1-indexed), if known.
    /// </summary>
    [JsonPropertyName("page")]
    public uint? Page { get; init; }

    /// <summary>
    /// The URI's semantic classification.
    /// </summary>
    [JsonPropertyName("kind")]
    public required UriKind Kind { get; init; }

    /// <summary>
    /// Build an <see cref="ExtractedUri"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text describing the URI.</param>
    /// <returns>The deserialised URI.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation fails or yields null. Any other exception raised while
    /// deserialising is wrapped and kept as the inner exception.
    /// </exception>
    public static ExtractedUri FromJson(string json)
    {
        ExtractedUri? uri;
        try
        {
            uri = JsonSerializer.Deserialize<ExtractedUri>(json, JsonOptions);
        }
        catch (Exception cause) when (cause is not KreuzbergException)
        {
            // A KreuzbergException passes through untouched; everything else is wrapped.
            throw new KreuzbergException($"Failed to parse ExtractedUri from JSON: {cause.Message}", cause);
        }

        if (uri is null)
        {
            throw new KreuzbergException("Failed to parse ExtractedUri from JSON: deserializer returned null");
        }

        return uri;
    }
}

View File

@@ -0,0 +1,405 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Main extraction configuration.
///
/// This struct contains all configuration options for the extraction process.
/// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
/// </summary>
public sealed record ExtractionConfig
{
/// <summary>
/// Enable caching of extraction results
/// </summary>
[JsonPropertyName("use_cache")]
public bool UseCache { get; init; } = true;
/// <summary>
/// Enable quality post-processing
/// </summary>
[JsonPropertyName("enable_quality_processing")]
public bool EnableQualityProcessing { get; init; } = true;
/// <summary>
/// OCR configuration (null = OCR disabled)
/// </summary>
[JsonPropertyName("ocr")]
public OcrConfig? Ocr { get; init; } = null;
/// <summary>
/// Force OCR even for searchable PDFs
/// </summary>
[JsonPropertyName("force_ocr")]
public bool ForceOcr { get; init; } = false;
/// <summary>
/// Force OCR on specific pages only (1-indexed page numbers, must be &gt;= 1).
///
/// When set, only the listed pages are OCR'd regardless of text layer quality.
/// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
/// Only applies to PDF documents. Duplicates are automatically deduplicated.
/// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
/// </summary>
[JsonPropertyName("force_ocr_pages")]
public List<uint>? ForceOcrPages { get; init; } = null;
/// <summary>
/// Disable OCR entirely, even for images.
///
/// When `true`, OCR is skipped for all document types. Images return metadata
/// only (dimensions, format, EXIF) without text extraction. PDFs use only
/// native text extraction without OCR fallback.
///
/// Cannot be `true` simultaneously with `force_ocr`.
///
/// *Added in v4.7.0.*
/// </summary>
[JsonPropertyName("disable_ocr")]
public bool DisableOcr { get; init; } = false;
/// <summary>
/// Text chunking configuration (null = chunking disabled)
/// </summary>
[JsonPropertyName("chunking")]
public ChunkingConfig? Chunking { get; init; } = null;
/// <summary>
/// Content filtering configuration (null = use extractor defaults).
///
/// Controls whether document "furniture" (headers, footers, watermarks,
/// repeating text) is included in or stripped from extraction results.
/// See `ContentFilterConfig` for per-field documentation.
/// </summary>
[JsonPropertyName("content_filter")]
public ContentFilterConfig? ContentFilter { get; init; } = null;
/// <summary>
/// Image extraction configuration (null = no image extraction)
/// </summary>
[JsonPropertyName("images")]
public ImageExtractionConfig? Images { get; init; } = null;
/// <summary>
/// PDF-specific options (null = use defaults)
/// </summary>
[JsonPropertyName("pdf_options")]
public PdfConfig? PdfOptions { get; init; } = null;
/// <summary>
/// Token reduction configuration (null = no token reduction)
/// </summary>
[JsonPropertyName("token_reduction")]
public TokenReductionOptions? TokenReduction { get; init; } = null;
/// <summary>
/// Language detection configuration (null = no language detection)
/// </summary>
[JsonPropertyName("language_detection")]
public LanguageDetectionConfig? LanguageDetection { get; init; } = null;
/// <summary>
/// Page extraction configuration (null = no page tracking)
/// </summary>
[JsonPropertyName("pages")]
public PageConfig? Pages { get; init; } = null;
/// <summary>
/// Keyword extraction configuration (null = no keyword extraction)
/// </summary>
[JsonPropertyName("keywords")]
public KeywordConfig? Keywords { get; init; } = null;
/// <summary>
/// Post-processor configuration (null = use defaults)
/// </summary>
[JsonPropertyName("postprocessor")]
public PostProcessorConfig? Postprocessor { get; init; } = null;
/// <summary>
/// HTML to Markdown conversion options (null = use defaults)
///
/// Configure how HTML documents are converted to Markdown, including heading styles,
/// list formatting, code block styles, and preprocessing options.
/// </summary>
[JsonPropertyName("html_options")]
public string? HtmlOptions { get; init; } = null;
/// <summary>
/// Styled HTML output configuration.
///
/// When set alongside `output_format = OutputFormat.Html`, the extraction
/// pipeline uses `StyledHtmlRenderer`(crate.rendering.StyledHtmlRenderer)
/// which emits stable `kb-*` CSS class hooks on every structural element
/// and optionally embeds theme CSS or user-supplied CSS in a `&lt;style&gt;` block.
///
/// When `None`, the existing plain comrak-based HTML renderer is used.
/// </summary>
[JsonPropertyName("html_output")]
public HtmlOutputConfig? HtmlOutput { get; init; } = null;
/// <summary>
/// Default per-file timeout in seconds for batch extraction.
///
/// When set, each file in a batch will be canceled after this duration
/// unless overridden by `FileExtractionConfig.timeout_secs`.
///
/// Defaults to `Some(60)` to prevent pathological files (e.g. deeply
/// nested archives, documents with millions of cells) from running
/// indefinitely and exhausting caller resources. Set to `None` to
/// disable the timeout for trusted input or long-running workloads.
/// </summary>
[JsonPropertyName("extraction_timeout_secs")]
public ulong? ExtractionTimeoutSecs { get; init; } = null;
/// <summary>
/// Maximum concurrent extractions in batch operations (null = (num_cpus × 1.5).ceil()).
///
/// Limits parallelism to prevent resource exhaustion when processing
/// large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
/// </summary>
[JsonPropertyName("max_concurrent_extractions")]
public ulong? MaxConcurrentExtractions { get; init; } = null;
/// <summary>
/// Result structure format
///
/// Controls whether results are returned in unified format (default) with all
/// content in the `content` field, or element-based format with semantic
/// elements (for Unstructured-compatible output).
/// </summary>
[JsonPropertyName("result_format")]
public ResultFormat? ResultFormat { get; init; } = null;
/// <summary>
/// Security limits for archive extraction.
///
/// Controls maximum archive size, compression ratio, file count, and other
/// security thresholds to prevent decompression bomb attacks. Also caps
/// nesting depth, iteration count, entity / token length, total
/// content size, and table cell count for every extraction path that
/// ingests user-controlled bytes.
/// When `None`, default limits are used.
/// </summary>
[JsonPropertyName("security_limits")]
public SecurityLimits? SecurityLimits { get; init; } = null;
/// <summary>
/// Maximum uncompressed size in bytes for a single embedded file before
/// recursive extraction is attempted (default: 50 MiB).
///
/// Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
/// to email attachments processed via recursive extraction. Files that
/// exceed this limit are skipped with a `ProcessingWarning` rather than
/// passed to the extraction pipeline, preventing a single oversized
/// embedded object from consuming unbounded memory or time.
///
/// Set to `None` to disable the per-embedded-file cap (falls back to
/// `security_limits.max_archive_size` as the only guard).
/// </summary>
[JsonPropertyName("max_embedded_file_bytes")]
public ulong? MaxEmbeddedFileBytes { get; init; } = null;
/// <summary>
/// Content text format (default: Plain).
///
/// Controls the format of the extracted content:
/// - `Plain`: Raw extracted text (default)
/// - `Markdown`: Markdown formatted output
/// - `Djot`: Djot markup format (requires djot feature)
/// - `Html`: HTML formatted output
///
/// When set to a structured format, extraction results will include
/// formatted output. The `formatted_content` field may be populated
/// when format conversion is applied.
/// </summary>
[JsonPropertyName("output_format")]
public OutputFormat OutputFormat { get; init; } = OutputFormat.Plain;
/// <summary>
/// Layout detection configuration (null = layout detection disabled).
///
/// When set, PDF pages and images are analyzed for document structure
/// (headings, code, formulas, tables, figures, etc.) using RT-DETR models
/// via ONNX Runtime. For PDFs, layout hints override paragraph classification
/// in the markdown pipeline. For images, per-region OCR is performed with
/// markdown formatting based on detected layout classes.
/// Requires the `layout-detection` feature to run inference; the field is
/// present whenever the `layout-types` feature is active (which includes
/// `layout-detection` as well as the no-ORT target groups).
/// </summary>
[JsonPropertyName("layout")]
public LayoutDetectionConfig? Layout { get; init; } = null;
/// <summary>
/// Run layout detection on the non-OCR PDF markdown path.
///
/// When `true` and <see cref="Layout"/> is non-null, layout regions inform
/// heading, table, list, and figure detection in the structure pipeline that
/// would otherwise rely on font-clustering heuristics alone. Significantly
/// improves SF1 (structural F1) at the cost of inference latency
/// (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
/// Requires the `layout-detection` feature.
/// </summary>
[JsonPropertyName("use_layout_for_markdown")]
public bool UseLayoutForMarkdown { get; init; } = false;
/// <summary>
/// Enable structured document tree output.
///
/// When true, populates the `document` field on `ExtractionResult` with a
/// hierarchical `DocumentStructure` containing heading-driven section nesting,
/// table grids, content layer classification, and inline annotations.
///
/// Independent of `result_format` — can be combined with Unified or ElementBased.
/// </summary>
[JsonPropertyName("include_document_structure")]
public bool IncludeDocumentStructure { get; init; } = false;
/// <summary>
/// Hardware acceleration configuration for ONNX Runtime models.
///
/// Controls execution provider selection for layout detection and embedding
/// models. When `null`, uses platform defaults (CoreML on macOS, CUDA on
/// Linux, CPU on Windows).
/// </summary>
[JsonPropertyName("acceleration")]
public AccelerationConfig? Acceleration { get; init; } = null;
/// <summary>
/// Cache namespace for tenant isolation.
///
/// When set, cache entries are stored under `{cache_dir}/{namespace}/`.
/// Must be alphanumeric, hyphens, or underscores only (max 64 chars).
/// Different namespaces have isolated cache spaces on the same filesystem.
/// </summary>
[JsonPropertyName("cache_namespace")]
public string? CacheNamespace { get; init; } = null;
/// <summary>
/// Per-request cache TTL in seconds.
///
/// Overrides the global `max_age_days` for this specific extraction.
/// When `0`, caching is completely skipped (no read or write).
/// When `null`, the global TTL applies.
/// </summary>
[JsonPropertyName("cache_ttl_secs")]
public ulong? CacheTtlSecs { get; init; } = null;
/// <summary>
/// Email extraction configuration (null = use defaults).
///
/// Currently supports configuring the fallback codepage for MSG files
/// that do not specify one. See `EmailConfig` for details.
/// </summary>
[JsonPropertyName("email")]
public EmailConfig? Email { get; init; } = null;
/// <summary>
/// Concurrency limits for constrained environments (null = use defaults).
///
/// Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
/// (when `max_concurrent_extractions` is unset) the batch concurrency
/// semaphore. See `ConcurrencyConfig` for details.
///
/// NOTE(review): the text above refers to `ConcurrencyConfig`, but this binding
/// declares the property as a string — presumably JSON text describing a
/// `ConcurrencyConfig`; confirm the expected encoding against the generator.
/// </summary>
[JsonPropertyName("concurrency")]
public string? Concurrency { get; init; } = null;
/// <summary>
/// Maximum recursion depth for archive extraction (default: 3).
/// Set to 0 to disable recursive extraction (legacy behavior).
/// </summary>
/// <remarks>
/// The initializer matches the documented default of 3. The serialization
/// options declared on this type keep explicit zeros, so a `0` initializer
/// would presumably make a plain `new ExtractionConfig()` turn recursive
/// extraction off without the caller asking for it.
/// </remarks>
[JsonPropertyName("max_archive_depth")]
public ulong MaxArchiveDepth { get; init; } = 3;
/// <summary>
/// Tree-sitter language pack configuration (null = tree-sitter disabled).
///
/// When set, enables code file extraction using tree-sitter parsers.
/// Controls grammar download behavior and code analysis options.
/// </summary>
[JsonPropertyName("tree_sitter")]
public TreeSitterConfig? TreeSitter { get; init; } = null;
/// <summary>
/// Structured extraction via LLM (null = disabled).
///
/// When set, the extracted document content is sent to an LLM with the
/// provided JSON schema. The structured response is stored in
/// `ExtractionResult.structured_output`.
/// </summary>
[JsonPropertyName("structured_extraction")]
public StructuredExtractionConfig? StructuredExtraction { get; init; } = null;
/// <summary>
/// Cancellation token for this extraction (null = no external cancellation).
///
/// Pass a `CancellationToken` clone here and call `CancellationToken.cancel`
/// from another thread / task to abort the extraction in progress. The extractor
/// checks the token at safe checkpoints (before lock acquisition, between pages,
/// between batch items) and returns `KreuzbergError.Cancelled` when set.
///
/// The field is excluded from serialization because `CancellationToken` is a
/// runtime handle, not a configuration value.
///
/// NOTE(review): the paragraphs above appear to describe the native type. In
/// this binding the property is a plain string carrying the `cancel_token`
/// JSON name, so System.Text.Json reads and writes it like any other member
/// when it is non-null — confirm whether it should be ignored instead.
/// </summary>
[JsonPropertyName("cancel_token")]
public string? CancelToken { get; init; } = null;
/// <summary>
/// Deserialises JSON text into an <see cref="ExtractionConfig"/>.
/// </summary>
/// <param name="json">JSON text describing the configuration.</param>
/// <returns>The parsed configuration.</returns>
/// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
public static ExtractionConfig FromJson(string json)
{
    ExtractionConfig? parsed;
    try
    {
        parsed = JsonSerializer.Deserialize<ExtractionConfig>(json, JsonOptions);
    }
    catch (Exception e) when (e is not KreuzbergException)
    {
        // Any non-library failure is reported as a KreuzbergException that keeps
        // the original exception as its inner exception.
        throw new KreuzbergException($"Failed to parse ExtractionConfig from JSON: {e.Message}", e);
    }

    if (parsed is null)
    {
        // The JSON literal `null` deserialises successfully but yields no object.
        throw new KreuzbergException("Failed to parse ExtractionConfig from JSON: deserializer returned null");
    }

    return parsed;
}
/// <summary>Options used by <c>FromJson</c> and <c>Default</c> when reading JSON into this type.
/// Enum values are mapped through a snake_case string converter; if these options are ever
/// used for writing, members holding their default value are omitted.</summary>
private static readonly JsonSerializerOptions JsonOptions = new()
{
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
/// <summary>Options for serializing config/input objects to FFI. Strips nulls
/// (nullable C# fields default to null and would override required Rust fields with
/// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
private static readonly JsonSerializerOptions JsonSerializationOptions = new()
{
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
/// <summary>
/// Returns the default configuration as reported by the native library.
/// </summary>
/// <remarks>
/// The native default is created, rendered to JSON, and that JSON is
/// deserialised into a managed <see cref="ExtractionConfig"/>. Both the native
/// config handle and the native JSON string are released even when a later
/// step throws.
/// </remarks>
/// <returns>The default configuration; never null.</returns>
/// <exception cref="KreuzbergException">
/// When the native layer returns a null handle or null JSON, or when the JSON
/// deserialises to null.
/// </exception>
public static ExtractionConfig Default()
{
    var nativeConfig = NativeMethods.ExtractionConfigDefault();
    if (nativeConfig == global::System.IntPtr.Zero)
    {
        throw new KreuzbergException("Failed to create default ExtractionConfig: native call returned null");
    }

    try
    {
        var jsonPtr = NativeMethods.ExtractionConfigToJson(nativeConfig);
        if (jsonPtr == global::System.IntPtr.Zero)
        {
            throw new KreuzbergException("Failed to create default ExtractionConfig: native JSON was null");
        }

        string? json;
        try
        {
            json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
        }
        finally
        {
            // The string is owned by the native side; release it once copied.
            NativeMethods.FreeString(jsonPtr);
        }

        // A JSON `null` would otherwise escape as a null reference typed as non-null.
        return JsonSerializer.Deserialize<ExtractionConfig>(json ?? "null", JsonOptions)
            ?? throw new KreuzbergException("Failed to create default ExtractionConfig: deserializer returned null");
    }
    finally
    {
        NativeMethods.ExtractionConfigFree(nativeConfig);
    }
}
}

View File

@@ -0,0 +1,102 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Everything that differs between two `ExtractionResult` values, referred to
/// in the member docs as `a` (left side) and `b` (right side).
/// </summary>
public sealed record ExtractionDiff
{
    /// <summary>
    /// Unified-diff hunks for the `content` field; empty when the content is identical.
    /// </summary>
    [JsonPropertyName("content_diff")]
    public List<DiffHunk> ContentDiff { get; init; } = new();

    /// <summary>
    /// Tables present in `b` but not in `a` (by index position, excess right-side tables).
    /// </summary>
    [JsonPropertyName("tables_added")]
    public List<Table> TablesAdded { get; init; } = new();

    /// <summary>
    /// Tables present in `a` but not in `b` (by index position, excess left-side tables).
    /// </summary>
    [JsonPropertyName("tables_removed")]
    public List<Table> TablesRemoved { get; init; } = new();

    /// <summary>
    /// Cell-level changes for table pairs that share the same index and dimensions.
    /// </summary>
    [JsonPropertyName("tables_changed")]
    public List<TableDiff> TablesChanged { get; init; } = new();

    /// <summary>
    /// Metadata difference, encoded as a JSON object with three top-level keys:
    /// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
    /// but not `b`), and `changed` (keys whose values differ — each entry is
    /// `{ "from": &lt;value-in-a&gt;, "to": &lt;value-in-b&gt; }`).
    ///
    /// This is NOT RFC 6902 JSON Patch — a flatter shape was chosen deliberately
    /// to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
    /// (with JSON Pointer paths), feed `a.metadata` and `b.metadata` to your
    /// preferred json-patch implementation directly.
    ///
    /// NOTE(review): declared as a string here; presumably it carries the JSON
    /// text of that object — confirm how the native side encodes it.
    /// </summary>
    [JsonPropertyName("metadata_changed")]
    public required string MetadataChanged { get; init; }

    /// <summary>
    /// Changes to embedded archive children.
    /// </summary>
    [JsonPropertyName("embedded_changes")]
    public required EmbeddedChanges EmbeddedChanges { get; init; }

    /// <summary>Options used by <see cref="FromJson"/>: snake_case enum strings,
    /// default-valued members omitted on write.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// dropped, while explicit false/0 values are written so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Deserialises JSON text into an <see cref="ExtractionDiff"/>.
    /// </summary>
    /// <param name="json">JSON text describing the diff.</param>
    /// <returns>The parsed diff.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExtractionDiff FromJson(string json)
    {
        ExtractionDiff? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ExtractionDiff>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse ExtractionDiff from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse ExtractionDiff from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,53 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Identifies how the extracted text was obtained: natively, through OCR, or a mix of both.
/// </summary>
[JsonConverter(typeof(ExtractionMethodJsonConverter))]
public enum ExtractionMethod
{
    [JsonPropertyName("native")]
    Native,
    [JsonPropertyName("ocr")]
    Ocr,
    [JsonPropertyName("mixed")]
    Mixed,
}

/// <summary>
/// JSON converter that maps <see cref="ExtractionMethod"/> to and from its fixed
/// lower-case wire names (`native`, `ocr`, `mixed`).
/// </summary>
internal sealed class ExtractionMethodJsonConverter : JsonConverter<ExtractionMethod>
{
    /// <summary>Reads a wire name and returns the matching enum member.</summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override ExtractionMethod Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        var wireName = reader.GetString();
        switch (wireName)
        {
            case "native":
                return ExtractionMethod.Native;
            case "ocr":
                return ExtractionMethod.Ocr;
            case "mixed":
                return ExtractionMethod.Mixed;
            default:
                throw new JsonException($"Unknown ExtractionMethod value: {wireName}");
        }
    }

    /// <summary>Writes the wire name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ExtractionMethod value, JsonSerializerOptions options)
    {
        string wireName;
        if (value == ExtractionMethod.Native)
        {
            wireName = "native";
        }
        else if (value == ExtractionMethod.Ocr)
        {
            wireName = "ocr";
        }
        else if (value == ExtractionMethod.Mixed)
        {
            wireName = "mixed";
        }
        else
        {
            // Nothing is written for undeclared numeric values.
            throw new JsonException($"Unknown ExtractionMethod value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,332 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// General extraction result used by the core extraction API.
///
/// This is the main result type returned by all extraction functions.
/// </summary>
public sealed record ExtractionResult
{
    /// <summary>
    /// Extracted document text.
    /// </summary>
    [JsonPropertyName("content")]
    public string Content { get; init; } = "";

    /// <summary>
    /// MIME type reported for the extracted document.
    /// </summary>
    [JsonPropertyName("mime_type")]
    public string MimeType { get; init; } = "";

    /// <summary>
    /// Document metadata. The initializer is `default!`, so the value is null
    /// until it is supplied by deserialisation or an object initializer.
    /// </summary>
    [JsonPropertyName("metadata")]
    public Metadata Metadata { get; init; } = default!;

    /// <summary>
    /// Extraction strategy used to produce the returned text.
    ///
    /// Populated when the extractor can reliably distinguish native text extraction,
    /// OCR-only extraction, or mixed native/OCR output.
    /// </summary>
    [JsonPropertyName("extraction_method")]
    public ExtractionMethod? ExtractionMethod { get; init; } = null;

    /// <summary>
    /// Tables extracted from the document; empty by default.
    /// </summary>
    [JsonPropertyName("tables")]
    public List<Table> Tables { get; init; } = [];

    /// <summary>
    /// Languages detected in the document; null when not populated.
    /// </summary>
    [JsonPropertyName("detected_languages")]
    public List<string>? DetectedLanguages { get; init; } = null;

    /// <summary>
    /// Text chunks when chunking is enabled.
    ///
    /// When chunking configuration is provided, the content is split into
    /// overlapping chunks for efficient processing. Each chunk contains the text,
    /// optional embeddings (if enabled), and metadata about its position.
    /// </summary>
    [JsonPropertyName("chunks")]
    public List<Chunk>? Chunks { get; init; } = null;

    /// <summary>
    /// Extracted images from the document.
    ///
    /// When image extraction is enabled via `ImageExtractionConfig`, this field
    /// contains all images found in the document with their raw data and metadata.
    /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
    /// </summary>
    [JsonPropertyName("images")]
    public List<ExtractedImage>? Images { get; init; } = null;

    /// <summary>
    /// Per-page content when page extraction is enabled.
    ///
    /// When page extraction is configured, the document is split into per-page content
    /// with tables and images mapped to their respective pages.
    /// </summary>
    [JsonPropertyName("pages")]
    public List<PageContent>? Pages { get; init; } = null;

    /// <summary>
    /// Semantic elements when element-based result format is enabled.
    ///
    /// When result_format is set to ElementBased, this field contains semantic
    /// elements with type classification, unique identifiers, and metadata for
    /// Unstructured-compatible element-based processing.
    /// </summary>
    [JsonPropertyName("elements")]
    public List<Element>? Elements { get; init; } = null;

    /// <summary>
    /// Rich Djot content structure (when extracting Djot documents).
    ///
    /// When extracting Djot documents with structured extraction enabled,
    /// this field contains the full semantic structure including:
    /// - Block-level elements with nesting
    /// - Inline formatting with attributes
    /// - Links, images, footnotes
    /// - Math expressions
    /// - Complete attribute information
    ///
    /// The `content` field still contains plain text for backward compatibility.
    ///
    /// Always `null` for non-Djot documents.
    /// </summary>
    [JsonPropertyName("djot_content")]
    public DjotContent? DjotContent { get; init; } = null;

    /// <summary>
    /// OCR elements with full spatial and confidence metadata.
    ///
    /// When OCR is performed with element extraction enabled, this field contains
    /// the structured representation of detected text including:
    /// - Bounding geometry (rectangles or quadrilaterals)
    /// - Confidence scores (detection and recognition)
    /// - Rotation information
    /// - Hierarchical relationships (Tesseract only)
    ///
    /// This field preserves all metadata that would otherwise be lost when
    /// converting to plain text or markdown output formats.
    ///
    /// Only populated when `OcrElementConfig.include_elements` is true.
    /// </summary>
    [JsonPropertyName("ocr_elements")]
    public List<OcrElement>? OcrElements { get; init; } = null;

    /// <summary>
    /// Structured document tree (when document structure extraction is enabled).
    ///
    /// When `include_document_structure` is true in `ExtractionConfig`, this field
    /// contains the full hierarchical representation of the document including:
    /// - Heading-driven section nesting
    /// - Table grids with cell-level metadata
    /// - Content layer classification (body, header, footer, footnote)
    /// - Inline text annotations (formatting, links)
    /// - Bounding boxes and page numbers
    ///
    /// Independent of `result_format` — can be combined with Unified or ElementBased.
    /// </summary>
    [JsonPropertyName("document")]
    public DocumentStructure? Document { get; init; } = null;

    /// <summary>
    /// Extracted keywords when keyword extraction is enabled.
    ///
    /// When keyword extraction (RAKE or YAKE) is configured, this field contains
    /// the extracted keywords with scores, algorithm info, and position data.
    /// Previously stored in `metadata.additional["keywords"]`.
    /// </summary>
    [JsonPropertyName("extracted_keywords")]
    public List<Keyword>? ExtractedKeywords { get; init; } = null;

    /// <summary>
    /// Document quality score from quality analysis.
    ///
    /// A value between 0.0 and 1.0 indicating the overall text quality.
    /// Previously stored in `metadata.additional["quality_score"]`.
    /// </summary>
    [JsonPropertyName("quality_score")]
    public double? QualityScore { get; init; } = null;

    /// <summary>
    /// Non-fatal warnings collected during processing pipeline stages.
    ///
    /// Captures errors from optional pipeline features (embedding, chunking,
    /// language detection, output formatting) that don't prevent extraction
    /// but may indicate degraded results.
    /// Previously stored as individual keys in `metadata.additional`.
    /// </summary>
    [JsonPropertyName("processing_warnings")]
    public List<ProcessingWarning> ProcessingWarnings { get; init; } = [];

    /// <summary>
    /// PDF annotations extracted from the document.
    ///
    /// When annotation extraction is enabled via `PdfConfig.extract_annotations`,
    /// this field contains text notes, highlights, links, stamps, and other
    /// annotations found in PDF documents.
    /// </summary>
    [JsonPropertyName("annotations")]
    public List<PdfAnnotation>? Annotations { get; init; } = null;

    /// <summary>
    /// Nested extraction results from archive contents.
    ///
    /// When extracting archives, each processable file inside produces its own
    /// full extraction result. Set to `null` for non-archive formats.
    /// Use `max_archive_depth` in config to control recursion depth.
    /// </summary>
    [JsonPropertyName("children")]
    public List<ArchiveEntry>? Children { get; init; } = null;

    /// <summary>
    /// URIs/links discovered during document extraction.
    ///
    /// Contains hyperlinks, image references, citations, email addresses, and
    /// other URI-like references found in the document. Always extracted when
    /// present in the source document.
    /// </summary>
    [JsonPropertyName("uris")]
    public List<ExtractedUri>? Uris { get; init; } = null;

    /// <summary>
    /// Tracked changes embedded in the source document.
    ///
    /// Populated by per-format extractors that understand change-tracking
    /// metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
    /// …). Every extractor defaults to `null` until its format-specific
    /// implementation is added. Extractors that do populate this field follow
    /// the "accepted-changes" convention: inserted text is present in
    /// `content`, deleted text is absent — the revision list is the separate
    /// audit trail.
    /// </summary>
    [JsonPropertyName("revisions")]
    public List<DocumentRevision>? Revisions { get; init; } = null;

    /// <summary>
    /// Structured extraction output from LLM-based JSON schema extraction.
    ///
    /// When `structured_extraction` is configured in `ExtractionConfig`, the
    /// extracted document content is sent to a VLM with the provided JSON schema.
    /// The response is parsed and stored here as a JSON value matching the schema.
    /// </summary>
    [JsonPropertyName("structured_output")]
    public string? StructuredOutput { get; init; } = null;

    /// <summary>
    /// Code intelligence results from tree-sitter analysis.
    ///
    /// Populated when extracting source code files with the `tree-sitter` feature.
    /// Contains metrics, structural analysis, imports/exports, comments,
    /// docstrings, symbols, diagnostics, and optionally chunked code segments.
    ///
    /// Stored as an opaque JSON value so that all language bindings (Go, Java,
    /// C#, …) can deserialize it as a raw JSON object rather than a typed struct.
    /// The underlying type is `tree_sitter_language_pack.ProcessResult`.
    ///
    /// NOTE(review): declared as a string in this binding; presumably it carries
    /// JSON text — confirm how an object-valued payload reaches this property.
    /// </summary>
    [JsonPropertyName("code_intelligence")]
    public string? CodeIntelligence { get; init; } = null;

    /// <summary>
    /// LLM token usage and cost data for all LLM calls made during this extraction.
    ///
    /// Contains one entry per LLM call. Multiple entries are produced when
    /// VLM OCR, structured extraction, or LLM embeddings run during
    /// the same extraction.
    ///
    /// `null` when no LLM was used.
    /// </summary>
    [JsonPropertyName("llm_usage")]
    public List<LlmUsage>? LlmUsage { get; init; } = null;

    /// <summary>
    /// Pre-rendered content in the requested output format.
    ///
    /// Populated during `derive_extraction_result` before tree derivation consumes
    /// element data. `apply_output_format` swaps this into `content` at the end
    /// of the pipeline, after post-processors have operated on plain text.
    /// </summary>
    [JsonPropertyName("formatted_content")]
    public string? FormattedContent { get; init; } = null;

    /// <summary>
    /// Structured hOCR document for the OCR+layout pipeline.
    ///
    /// When tesseract produces hOCR output, the parsed `InternalDocument` carries
    /// paragraph structure with bounding boxes and confidence scores. The layout
    /// classification step enriches these elements before final rendering.
    /// </summary>
    [JsonPropertyName("ocr_internal_document")]
    public string? OcrInternalDocument { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="ExtractionResult"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text describing the result.</param>
    /// <returns>The parsed result.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ExtractionResult FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ExtractionResult>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse ExtractionResult from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ExtractionResult from JSON: {e.Message}", e);
        }
    }

    /// <summary>Options used when reading JSON into this type: snake_case enum strings,
    /// default-valued members omitted on write.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Convert from an OCR result.
    /// </summary>
    /// <remarks>
    /// The OCR result is serialised to JSON, handed to the native library, and the
    /// native conversion result is read back as JSON. Every native handle and
    /// string acquired along the way is released, including when a later step throws.
    /// </remarks>
    /// <param name="ocr">The OCR result to convert.</param>
    /// <returns>The converted extraction result; never null.</returns>
    /// <exception cref="KreuzbergException">
    /// When a native call returns a null handle or string, or when the returned
    /// JSON deserialises to null.
    /// </exception>
    public static ExtractionResult FromOcr(OcrExtractionResult ocr)
    {
        var ocrJson = JsonSerializer.Serialize(ocr, JsonSerializationOptions);
        var ocrHandle = NativeMethods.OcrExtractionResultFromJson(ocrJson);
        if (ocrHandle == IntPtr.Zero)
        {
            throw NativeFailure("OcrExtractionResultFromJson failed");
        }

        try
        {
            var nativeResult = NativeMethods.ExtractionResultFromOcr(ocrHandle);
            if (nativeResult == IntPtr.Zero)
            {
                throw NativeFailure("ExtractionResultFromOcr failed");
            }

            try
            {
                var jsonPtr = NativeMethods.ExtractionResultToJson(nativeResult);
                if (jsonPtr == IntPtr.Zero)
                {
                    throw NativeFailure("ExtractionResultToJson failed");
                }

                string? json;
                try
                {
                    json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
                }
                finally
                {
                    // The string is owned by the native side; release it once copied.
                    NativeMethods.FreeString(jsonPtr);
                }

                // A JSON `null` would otherwise escape as a null reference typed as non-null.
                return JsonSerializer.Deserialize<ExtractionResult>(json ?? "null", JsonOptions)
                    ?? throw new KreuzbergException("Failed to convert OCR result: deserializer returned null");
            }
            finally
            {
                NativeMethods.ExtractionResultFree(nativeResult);
            }
        }
        finally
        {
            NativeMethods.OcrExtractionResultFree(ocrHandle);
        }
    }

    /// <summary>
    /// Builds an exception from the native layer's last error code and context,
    /// using <paramref name="fallbackMessage"/> when no context string is available.
    /// </summary>
    private static KreuzbergException NativeFailure(string fallbackMessage)
    {
        var errorCode = NativeMethods.LastErrorCode();
        var contextPtr = NativeMethods.LastErrorContext();
        var message = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(contextPtr) ?? fallbackMessage;
        return new KreuzbergException(errorCode, message);
    }
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata extracted from a FictionBook (FB2) document.
/// </summary>
public sealed record FictionBookMetadata
{
    /// <summary>Genre entries; empty by default.</summary>
    [JsonPropertyName("genres")]
    public List<string> Genres { get; init; } = new();

    /// <summary>Sequence entries; empty by default.</summary>
    [JsonPropertyName("sequences")]
    public List<string> Sequences { get; init; } = new();

    /// <summary>Annotation text; null when absent.</summary>
    [JsonPropertyName("annotation")]
    public string? Annotation { get; init; }

    /// <summary>Options used by <see cref="FromJson"/>: snake_case enum strings,
    /// default-valued members omitted on write.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// dropped, while explicit false/0 values are written so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Deserialises JSON text into a <see cref="FictionBookMetadata"/>.
    /// </summary>
    /// <param name="json">JSON text describing the metadata.</param>
    /// <returns>The parsed metadata.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static FictionBookMetadata FromJson(string json)
    {
        FictionBookMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<FictionBookMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse FictionBookMetadata from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse FictionBookMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,210 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Per-file extraction configuration overrides for batch processing.
///
/// Every member is optional: `null` means "use the batch-level default."
/// This type is used with `batch_extract_files` and `batch_extract_bytes`
/// to allow heterogeneous extraction settings within a single batch.
///
/// The following `ExtractionConfig` fields are batch-level only and cannot be
/// overridden per file:
/// - `max_concurrent_extractions` — controls batch parallelism
/// - `use_cache` — global caching policy
/// - `acceleration` — shared ONNX execution provider
/// - `security_limits` — global archive security policy
/// </summary>
public sealed record FileExtractionConfig
{
    /// <summary>Quality post-processing override for this file.</summary>
    [JsonPropertyName("enable_quality_processing")]
    public bool? EnableQualityProcessing { get; init; }

    /// <summary>OCR configuration override for this file (null = use batch default).</summary>
    [JsonPropertyName("ocr")]
    public OcrConfig? Ocr { get; init; }

    /// <summary>Force-OCR override for this file.</summary>
    [JsonPropertyName("force_ocr")]
    public bool? ForceOcr { get; init; }

    /// <summary>Force-OCR page list override for this file (1-indexed page numbers).</summary>
    [JsonPropertyName("force_ocr_pages")]
    public List<uint>? ForceOcrPages { get; init; }

    /// <summary>Disable-OCR override for this file.</summary>
    [JsonPropertyName("disable_ocr")]
    public bool? DisableOcr { get; init; }

    /// <summary>Chunking configuration override for this file.</summary>
    [JsonPropertyName("chunking")]
    public ChunkingConfig? Chunking { get; init; }

    /// <summary>Content filtering configuration override for this file.</summary>
    [JsonPropertyName("content_filter")]
    public ContentFilterConfig? ContentFilter { get; init; }

    /// <summary>Image extraction configuration override for this file.</summary>
    [JsonPropertyName("images")]
    public ImageExtractionConfig? Images { get; init; }

    /// <summary>PDF options override for this file.</summary>
    [JsonPropertyName("pdf_options")]
    public PdfConfig? PdfOptions { get; init; }

    /// <summary>Token reduction override for this file.</summary>
    [JsonPropertyName("token_reduction")]
    public TokenReductionOptions? TokenReduction { get; init; }

    /// <summary>Language detection override for this file.</summary>
    [JsonPropertyName("language_detection")]
    public LanguageDetectionConfig? LanguageDetection { get; init; }

    /// <summary>Page extraction override for this file.</summary>
    [JsonPropertyName("pages")]
    public PageConfig? Pages { get; init; }

    /// <summary>Keyword extraction override for this file.</summary>
    [JsonPropertyName("keywords")]
    public KeywordConfig? Keywords { get; init; }

    /// <summary>Post-processor override for this file.</summary>
    [JsonPropertyName("postprocessor")]
    public PostProcessorConfig? Postprocessor { get; init; }

    /// <summary>HTML conversion options override for this file.</summary>
    [JsonPropertyName("html_options")]
    public string? HtmlOptions { get; init; }

    /// <summary>Result format override for this file.</summary>
    [JsonPropertyName("result_format")]
    public ResultFormat? ResultFormat { get; init; }

    /// <summary>Output content format override for this file.</summary>
    [JsonPropertyName("output_format")]
    public OutputFormat? OutputFormat { get; init; }

    /// <summary>Document structure output override for this file.</summary>
    [JsonPropertyName("include_document_structure")]
    public bool? IncludeDocumentStructure { get; init; }

    /// <summary>Layout detection override for this file.</summary>
    [JsonPropertyName("layout")]
    public LayoutDetectionConfig? Layout { get; init; }

    /// <summary>
    /// Per-file extraction timeout override, in seconds.
    ///
    /// When set, the extraction for this file will be canceled after the
    /// specified duration. A timed-out file produces an error result without
    /// affecting other files in the batch.
    /// </summary>
    [JsonPropertyName("timeout_secs")]
    public ulong? TimeoutSecs { get; init; }

    /// <summary>Tree-sitter configuration override for this file.</summary>
    [JsonPropertyName("tree_sitter")]
    public TreeSitterConfig? TreeSitter { get; init; }

    /// <summary>
    /// Structured extraction configuration override for this file.
    ///
    /// When set, enables LLM-based structured extraction with a JSON schema
    /// for this specific file. The extracted content is sent to a VLM/LLM
    /// and the response is parsed according to the provided schema.
    /// </summary>
    [JsonPropertyName("structured_extraction")]
    public StructuredExtractionConfig? StructuredExtraction { get; init; }

    /// <summary>Options used by <see cref="FromJson"/>: snake_case enum strings,
    /// default-valued members omitted on write.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Null members are
    /// dropped, while explicit false/0 values are written so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Deserialises JSON text into a <see cref="FileExtractionConfig"/>.
    /// </summary>
    /// <param name="json">JSON text describing the per-file overrides.</param>
    /// <returns>The parsed overrides.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static FileExtractionConfig FromJson(string json)
    {
        FileExtractionConfig? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<FileExtractionConfig>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse FileExtractionConfig from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse FileExtractionConfig from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A Djot footnote: a label together with the blocks that form its body.
/// </summary>
public sealed record Footnote
{
    /// <summary>
    /// Label identifying the footnote.
    /// </summary>
    [JsonPropertyName("label")]
    public required string Label { get; init; }

    /// <summary>
    /// Blocks making up the footnote body; an empty list unless set.
    /// </summary>
    [JsonPropertyName("content")]
    public List<FormattedBlock> Content { get; init; } = [];

    /// <summary>
    /// Deserialises a <see cref="Footnote"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the footnote.</param>
    /// <returns>The parsed footnote; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static Footnote FromJson(string json)
    {
        Footnote? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<Footnote>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse Footnote from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse Footnote from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,294 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Format-specific metadata (discriminated union).
///
/// An extraction result carries exactly one variant, so consumers get type-safe
/// metadata without a set of nested optionals. Each variant is a nested record that
/// wraps its payload in a single <c>Value</c> member; JSON conversion is handled by
/// <see cref="FormatMetadataJsonConverter"/>.
/// </summary>
[JsonConverter(typeof(FormatMetadataJsonConverter))]
public abstract record FormatMetadata
{
    /// <summary>Variant carrying <see cref="PdfMetadata"/>.</summary>
    public sealed record Pdf(PdfMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="DocxMetadata"/>.</summary>
    public sealed record Docx(DocxMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="ExcelMetadata"/>.</summary>
    public sealed record Excel(ExcelMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="EmailMetadata"/>.</summary>
    public sealed record Email(EmailMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="PptxMetadata"/>.</summary>
    public sealed record Pptx(PptxMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="ArchiveMetadata"/>.</summary>
    public sealed record Archive(ArchiveMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="ImageMetadata"/>.</summary>
    public sealed record Image(ImageMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="XmlMetadata"/>.</summary>
    public sealed record Xml(XmlMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="TextMetadata"/>.</summary>
    public sealed record Text(TextMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="HtmlMetadata"/>.</summary>
    public sealed record Html(HtmlMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="OcrMetadata"/>.</summary>
    public sealed record Ocr(OcrMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="CsvMetadata"/>.</summary>
    public sealed record Csv(CsvMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="BibtexMetadata"/>.</summary>
    public sealed record Bibtex(BibtexMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="CitationMetadata"/>.</summary>
    public sealed record Citation(CitationMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="FictionBookMetadata"/>.</summary>
    public sealed record FictionBook(FictionBookMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="DbfMetadata"/>.</summary>
    public sealed record Dbf(DbfMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="JatsMetadata"/>.</summary>
    public sealed record Jats(JatsMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="EpubMetadata"/>.</summary>
    public sealed record Epub(EpubMetadata Value) : FormatMetadata;

    /// <summary>Variant carrying <see cref="PstMetadata"/>.</summary>
    public sealed record Pst(PstMetadata Value) : FormatMetadata;

    /// <summary>Variant for code metadata; the payload is untyped (<see cref="object"/>).</summary>
    public sealed record Code(object Value) : FormatMetadata;

    /// <summary>The Pdf payload when this is a <see cref="Pdf"/> variant; otherwise null.</summary>
    public PdfMetadata? AsPdf => (this as Pdf)?.Value;

    /// <summary>The Docx payload when this is a <see cref="Docx"/> variant; otherwise null.</summary>
    public DocxMetadata? AsDocx => (this as Docx)?.Value;

    /// <summary>The Excel payload when this is an <see cref="Excel"/> variant; otherwise null.</summary>
    public ExcelMetadata? AsExcel => (this as Excel)?.Value;

    /// <summary>The Email payload when this is an <see cref="Email"/> variant; otherwise null.</summary>
    public EmailMetadata? AsEmail => (this as Email)?.Value;

    /// <summary>The Pptx payload when this is a <see cref="Pptx"/> variant; otherwise null.</summary>
    public PptxMetadata? AsPptx => (this as Pptx)?.Value;

    /// <summary>The Archive payload when this is an <see cref="Archive"/> variant; otherwise null.</summary>
    public ArchiveMetadata? AsArchive => (this as Archive)?.Value;

    /// <summary>The Image payload when this is an <see cref="Image"/> variant; otherwise null.</summary>
    public ImageMetadata? AsImage => (this as Image)?.Value;

    /// <summary>The Xml payload when this is an <see cref="Xml"/> variant; otherwise null.</summary>
    public XmlMetadata? AsXml => (this as Xml)?.Value;

    /// <summary>The Text payload when this is a <see cref="Text"/> variant; otherwise null.</summary>
    public TextMetadata? AsText => (this as Text)?.Value;

    /// <summary>The Html payload when this is an <see cref="Html"/> variant; otherwise null.</summary>
    public HtmlMetadata? AsHtml => (this as Html)?.Value;

    /// <summary>The Ocr payload when this is an <see cref="Ocr"/> variant; otherwise null.</summary>
    public OcrMetadata? AsOcr => (this as Ocr)?.Value;

    /// <summary>The Csv payload when this is a <see cref="Csv"/> variant; otherwise null.</summary>
    public CsvMetadata? AsCsv => (this as Csv)?.Value;

    /// <summary>The Bibtex payload when this is a <see cref="Bibtex"/> variant; otherwise null.</summary>
    public BibtexMetadata? AsBibtex => (this as Bibtex)?.Value;

    /// <summary>The Citation payload when this is a <see cref="Citation"/> variant; otherwise null.</summary>
    public CitationMetadata? AsCitation => (this as Citation)?.Value;

    /// <summary>The FictionBook payload when this is a <see cref="FictionBook"/> variant; otherwise null.</summary>
    public FictionBookMetadata? AsFictionBook => (this as FictionBook)?.Value;

    /// <summary>The Dbf payload when this is a <see cref="Dbf"/> variant; otherwise null.</summary>
    public DbfMetadata? AsDbf => (this as Dbf)?.Value;

    /// <summary>The Jats payload when this is a <see cref="Jats"/> variant; otherwise null.</summary>
    public JatsMetadata? AsJats => (this as Jats)?.Value;

    /// <summary>The Epub payload when this is an <see cref="Epub"/> variant; otherwise null.</summary>
    public EpubMetadata? AsEpub => (this as Epub)?.Value;

    /// <summary>The Pst payload when this is a <see cref="Pst"/> variant; otherwise null.</summary>
    public PstMetadata? AsPst => (this as Pst)?.Value;

    /// <summary>The Code payload when this is a <see cref="Code"/> variant; otherwise null.</summary>
    public object? AsCode => (this as Code)?.Value;
}
/// <summary>
/// Converts <see cref="FormatMetadata"/> to and from its flattened, tagged JSON form.
/// </summary>
/// <remarks>
/// On the wire a value is one JSON object holding a <c>format_type</c> discriminator
/// next to the variant's own fields. The variant records wrap their payload in a single
/// <c>Value</c> member, so the flattened layout is mapped by hand: <see cref="Read"/>
/// nests the non-discriminator fields under <c>"Value"</c> before delegating to the
/// serializer, and <see cref="Write"/> flattens the payload back out.
/// </remarks>
public sealed class FormatMetadataJsonConverter : JsonConverter<FormatMetadata>
{
    /// <summary>Name of the JSON property that selects the variant.</summary>
    private const string Discriminator = "format_type";

    /// <summary>
    /// Reads a tagged JSON object and materialises the matching <see cref="FormatMetadata"/> variant.
    /// </summary>
    /// <param name="reader">Reader positioned on the start of the JSON object.</param>
    /// <param name="typeToConvert">Requested type; not consulted.</param>
    /// <param name="options">Serializer options forwarded when deserialising the variant.</param>
    /// <returns>The variant selected by the <c>format_type</c> field.</returns>
    /// <exception cref="JsonException">
    /// When the token is not an object, the discriminator is missing, null, not a string
    /// or unknown, or the variant deserialises to null.
    /// </exception>
    public override FormatMetadata Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        if (reader.TokenType != JsonTokenType.StartObject)
        {
            throw new JsonException($"Expected JSON object, got {reader.TokenType}");
        }

        using var doc = JsonDocument.ParseValue(ref reader);
        var root = doc.RootElement;
        if (!root.TryGetProperty(Discriminator, out var tagElement))
        {
            throw new JsonException($"Missing discriminator field: {Discriminator}");
        }

        // JsonElement.GetString() throws InvalidOperationException for anything that is
        // neither a string nor null; report that as a JSON error like every other
        // malformed-input case in this converter.
        if (tagElement.ValueKind is not (JsonValueKind.String or JsonValueKind.Null))
        {
            throw new JsonException($"Discriminator field {Discriminator} must be a string, got {tagElement.ValueKind}");
        }

        var tagValue = tagElement.GetString() ?? throw new JsonException("Discriminator field is null");
        var wrappedJson = WrapPayload(root);

        return tagValue switch
        {
            "pdf" => ReadVariant<FormatMetadata.Pdf>(wrappedJson, options),
            "docx" => ReadVariant<FormatMetadata.Docx>(wrappedJson, options),
            "excel" => ReadVariant<FormatMetadata.Excel>(wrappedJson, options),
            "email" => ReadVariant<FormatMetadata.Email>(wrappedJson, options),
            "pptx" => ReadVariant<FormatMetadata.Pptx>(wrappedJson, options),
            "archive" => ReadVariant<FormatMetadata.Archive>(wrappedJson, options),
            "image" => ReadVariant<FormatMetadata.Image>(wrappedJson, options),
            "xml" => ReadVariant<FormatMetadata.Xml>(wrappedJson, options),
            "text" => ReadVariant<FormatMetadata.Text>(wrappedJson, options),
            "html" => ReadVariant<FormatMetadata.Html>(wrappedJson, options),
            "ocr" => ReadVariant<FormatMetadata.Ocr>(wrappedJson, options),
            "csv" => ReadVariant<FormatMetadata.Csv>(wrappedJson, options),
            "bibtex" => ReadVariant<FormatMetadata.Bibtex>(wrappedJson, options),
            "citation" => ReadVariant<FormatMetadata.Citation>(wrappedJson, options),
            "fiction_book" => ReadVariant<FormatMetadata.FictionBook>(wrappedJson, options),
            "dbf" => ReadVariant<FormatMetadata.Dbf>(wrappedJson, options),
            "jats" => ReadVariant<FormatMetadata.Jats>(wrappedJson, options),
            "epub" => ReadVariant<FormatMetadata.Epub>(wrappedJson, options),
            "pst" => ReadVariant<FormatMetadata.Pst>(wrappedJson, options),
            "code" => ReadVariant<FormatMetadata.Code>(wrappedJson, options),
            _ => throw new JsonException($"Unknown FormatMetadata discriminator: {tagValue}"),
        };
    }

    /// <summary>
    /// Writes the variant as one JSON object: the <c>format_type</c> tag followed by the
    /// payload's own properties at the same level.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    /// <param name="value">Variant to serialise.</param>
    /// <param name="options">Serializer options used for the payload.</param>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a known variant.</exception>
    public override void Write(Utf8JsonWriter writer, FormatMetadata value, JsonSerializerOptions options)
    {
        // The tag and the payload fields must sit side by side, e.g.
        // {"format_type":"pdf", ...payload fields...}, not {"Value":{...}}: the reader
        // above (and, per the generator's notes, the Rust side) looks for format_type
        // at the top level of the object.
        string tag;
        object? inner;
        switch (value)
        {
            case FormatMetadata.Pdf pdf:
                tag = "pdf"; inner = pdf.Value; break;
            case FormatMetadata.Docx docx:
                tag = "docx"; inner = docx.Value; break;
            case FormatMetadata.Excel excel:
                tag = "excel"; inner = excel.Value; break;
            case FormatMetadata.Email email:
                tag = "email"; inner = email.Value; break;
            case FormatMetadata.Pptx pptx:
                tag = "pptx"; inner = pptx.Value; break;
            case FormatMetadata.Archive archive:
                tag = "archive"; inner = archive.Value; break;
            case FormatMetadata.Image image:
                tag = "image"; inner = image.Value; break;
            case FormatMetadata.Xml xml:
                tag = "xml"; inner = xml.Value; break;
            case FormatMetadata.Text text:
                tag = "text"; inner = text.Value; break;
            case FormatMetadata.Html html:
                tag = "html"; inner = html.Value; break;
            case FormatMetadata.Ocr ocr:
                tag = "ocr"; inner = ocr.Value; break;
            case FormatMetadata.Csv csv:
                tag = "csv"; inner = csv.Value; break;
            case FormatMetadata.Bibtex bibtex:
                tag = "bibtex"; inner = bibtex.Value; break;
            case FormatMetadata.Citation citation:
                tag = "citation"; inner = citation.Value; break;
            case FormatMetadata.FictionBook fictionBook:
                tag = "fiction_book"; inner = fictionBook.Value; break;
            case FormatMetadata.Dbf dbf:
                tag = "dbf"; inner = dbf.Value; break;
            case FormatMetadata.Jats jats:
                tag = "jats"; inner = jats.Value; break;
            case FormatMetadata.Epub epub:
                tag = "epub"; inner = epub.Value; break;
            case FormatMetadata.Pst pst:
                tag = "pst"; inner = pst.Value; break;
            case FormatMetadata.Code code:
                tag = "code"; inner = code.Value; break;
            default:
                throw new JsonException($"Unknown FormatMetadata variant: {value.GetType().Name}");
        }

        writer.WriteStartObject();
        writer.WriteString(Discriminator, tag);
        if (inner != null)
        {
            using var doc = JsonSerializer.SerializeToDocument(inner, inner.GetType(), options);

            // Only object payloads can be flattened; a payload that serialises to any
            // other JSON kind contributes no fields and only the tag is written.
            if (doc.RootElement.ValueKind == JsonValueKind.Object)
            {
                foreach (var prop in doc.RootElement.EnumerateObject())
                {
                    writer.WritePropertyName(prop.Name);
                    prop.Value.WriteTo(writer);
                }
            }
        }
        writer.WriteEndObject();
    }

    /// <summary>
    /// Re-encodes <paramref name="root"/> as <c>{"Value":{...}}</c>, dropping the
    /// discriminator, so it matches the single-member shape of the variant records.
    /// </summary>
    private static byte[] WrapPayload(JsonElement root)
    {
        using var buffer = new MemoryStream();
        using var writer = new Utf8JsonWriter(buffer);
        writer.WriteStartObject();
        writer.WritePropertyName("Value");
        writer.WriteStartObject();
        foreach (var prop in root.EnumerateObject())
        {
            if (prop.Name != Discriminator)
            {
                writer.WritePropertyName(prop.Name);
                prop.Value.WriteTo(writer);
            }
        }
        writer.WriteEndObject();
        writer.WriteEndObject();
        writer.Flush();
        return buffer.ToArray();
    }

    /// <summary>Deserialises the wrapped payload as <typeparamref name="TVariant"/>, rejecting a null result.</summary>
    private static FormatMetadata ReadVariant<TVariant>(byte[] wrappedJson, JsonSerializerOptions options)
        where TVariant : FormatMetadata
    {
        return JsonSerializer.Deserialize<TVariant>(wrappedJson, options)
            ?? throw new JsonException("Failed to deserialize variant");
    }
}

View File

@@ -0,0 +1,100 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A block-level element of a Djot document.
///
/// Covers structural elements such as headings, paragraphs, lists and code blocks.
/// </summary>
public sealed record FormattedBlock
{
    /// <summary>
    /// Kind of block this element represents.
    /// </summary>
    [JsonPropertyName("block_type")]
    public required BlockType BlockType { get; init; }

    /// <summary>
    /// Heading level (1-6) for headings, or the nesting level for lists.
    /// </summary>
    [JsonPropertyName("level")]
    public ulong? Level { get; init; }

    /// <summary>
    /// Inline content contained in the block; an empty list unless set.
    /// </summary>
    [JsonPropertyName("inline_content")]
    public List<InlineElement> InlineContent { get; init; } = [];

    // NOTE(review): typed as a plain string although the description mentions classes,
    // IDs and key-value pairs — confirm the producer emits a JSON string here.
    /// <summary>
    /// Element attributes (classes, IDs, key-value pairs).
    /// </summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Language identifier, for code blocks.
    /// </summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; }

    /// <summary>
    /// Raw code text, for code blocks.
    /// </summary>
    [JsonPropertyName("code")]
    public string? Code { get; init; }

    /// <summary>
    /// Nested blocks for containers (blockquotes, list items, divs); an empty list unless set.
    /// </summary>
    [JsonPropertyName("children")]
    public List<FormattedBlock> Children { get; init; } = [];

    /// <summary>
    /// Deserialises a <see cref="FormattedBlock"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the block.</param>
    /// <returns>The parsed block; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static FormattedBlock FromJson(string json)
    {
        FormattedBlock? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<FormattedBlock>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse FormattedBlock from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse FormattedBlock from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,98 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One cell of a grid, with its position and span information.
/// </summary>
public sealed record GridCell
{
    /// <summary>
    /// Text content of the cell.
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// Row position, counted from zero.
    /// </summary>
    [JsonPropertyName("row")]
    public uint Row { get; init; }

    /// <summary>
    /// Column position, counted from zero.
    /// </summary>
    [JsonPropertyName("col")]
    public uint Col { get; init; }

    /// <summary>
    /// How many rows the cell spans.
    /// </summary>
    [JsonPropertyName("row_span")]
    public uint RowSpan { get; init; }

    /// <summary>
    /// How many columns the cell spans.
    /// </summary>
    [JsonPropertyName("col_span")]
    public uint ColSpan { get; init; }

    /// <summary>
    /// Whether the cell is a header cell.
    /// </summary>
    [JsonPropertyName("is_header")]
    public bool IsHeader { get; init; }

    /// <summary>
    /// Bounding box of the cell, when available.
    /// </summary>
    [JsonPropertyName("bbox")]
    public BoundingBox? Bbox { get; init; }

    /// <summary>
    /// Deserialises a <see cref="GridCell"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the cell.</param>
    /// <returns>The parsed cell; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static GridCell FromJson(string json)
    {
        GridCell? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<GridCell>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse GridCell from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse GridCell from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,86 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata describing a header/heading element.
/// </summary>
public sealed record HeaderMetadata
{
    /// <summary>
    /// Header level, from 1 (h1) to 6 (h6).
    /// </summary>
    [JsonPropertyName("level")]
    public byte Level { get; init; }

    /// <summary>
    /// Normalized text of the header.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// HTML id attribute, when present.
    /// </summary>
    [JsonPropertyName("id")]
    public string? Id { get; init; }

    /// <summary>
    /// Depth of the header element in the document tree.
    /// </summary>
    [JsonPropertyName("depth")]
    public uint Depth { get; init; }

    /// <summary>
    /// Byte offset of the header in the original HTML document.
    /// </summary>
    [JsonPropertyName("html_offset")]
    public uint HtmlOffset { get; init; }

    /// <summary>
    /// Deserialises a <see cref="HeaderMetadata"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the header.</param>
    /// <returns>The parsed metadata; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static HeaderMetadata FromJson(string json)
    {
        HeaderMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HeaderMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse HeaderMetadata from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HeaderMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,65 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Heading context of a chunk inside a Markdown document.
///
/// Holds the chain of headings leading from the document root to the chunk's section.
/// </summary>
public sealed record HeadingContext
{
    /// <summary>
    /// Heading chain from the document root to this chunk's section.
    /// Index 0 is the outermost heading (h1); the last element is the most specific.
    /// An empty list unless set.
    /// </summary>
    [JsonPropertyName("headings")]
    public List<HeadingLevel> Headings { get; init; } = [];

    /// <summary>
    /// Deserialises a <see cref="HeadingContext"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the heading context.</param>
    /// <returns>The parsed context; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static HeadingContext FromJson(string json)
    {
        HeadingContext? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HeadingContext>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse HeadingContext from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HeadingContext from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,68 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// One heading within a heading hierarchy.
/// </summary>
public sealed record HeadingLevel
{
    /// <summary>
    /// Depth of the heading (1 = h1, 2 = h2, and so on).
    /// </summary>
    [JsonPropertyName("level")]
    public byte Level { get; init; }

    /// <summary>
    /// Text of the heading.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// Deserialises a <see cref="HeadingLevel"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the heading.</param>
    /// <returns>The parsed heading; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static HeadingLevel FromJson(string json)
    {
        HeadingLevel? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HeadingLevel>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse HeadingLevel from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HeadingLevel from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,94 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// A block of text together with its assigned hierarchy level.
///
/// The heading information is derived from font size clustering and
/// hierarchical analysis.
/// </summary>
public sealed record HierarchicalBlock
{
    /// <summary>
    /// Text content of the block.
    /// </summary>
    [JsonPropertyName("text")]
    public required string Text { get; init; }

    /// <summary>
    /// Font size of the block's text.
    /// </summary>
    [JsonPropertyName("font_size")]
    public float FontSize { get; init; }

    /// <summary>
    /// Hierarchy level of the block (H1-H6 or Body).
    ///
    /// The values mirror HTML heading tags:
    /// - "h1": Top-level heading
    /// - "h2": Secondary heading
    /// - "h3": Tertiary heading
    /// - "h4": Quaternary heading
    /// - "h5": Quinary heading
    /// - "h6": Senary heading
    /// - "body": Body text (no heading level)
    /// </summary>
    [JsonPropertyName("level")]
    public required string Level { get; init; }

    /// <summary>
    /// Bounding box of the block, when present.
    ///
    /// Coordinates are given as (left, top, right, bottom) in PDF units.
    /// </summary>
    [JsonPropertyName("bbox")]
    public List<float>? Bbox { get; init; }

    /// <summary>
    /// Deserialises a <see cref="HierarchicalBlock"/> from JSON text.
    /// </summary>
    /// <param name="json">JSON document describing the block.</param>
    /// <returns>The parsed block; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When deserialisation throws (the original exception is attached as the inner
    /// exception) or when the deserialiser yields <c>null</c>.
    /// </exception>
    public static HierarchicalBlock FromJson(string json)
    {
        HierarchicalBlock? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HierarchicalBlock>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap every foreign failure so callers see a single exception type.
            throw new KreuzbergException($"Failed to parse HierarchicalBlock from JSON: {e.Message}", e);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HierarchicalBlock from JSON: deserializer returned null");
        }

        return parsed;
    }

    /// <summary>Options used by <see cref="FromJson"/>; enum values are read as snake_case strings.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options intended for serialising config/input objects to FFI: null members are
    /// omitted (a null would otherwise be sent for fields the Rust side requires), while
    /// explicit false/0 values are still written so caller intent is kept.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}

View File

@@ -0,0 +1,101 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Hierarchy extraction configuration for PDF text structure analysis.
///
/// Enables extraction of document hierarchy levels (H1-H6) based on font size
/// clustering and semantic analysis. When enabled, hierarchical blocks are
/// included in page content.
/// </summary>
public sealed record HierarchyConfig
{
    /// <summary>
    /// Enable hierarchy extraction.
    /// </summary>
    [JsonPropertyName("enabled")]
    public bool Enabled { get; init; } = true;

    /// <summary>
    /// Number of font size clusters to use for hierarchy levels (1-7).
    ///
    /// Default: 6, which provides H1-H6 heading levels with body text.
    /// Larger values create more fine-grained hierarchy levels.
    /// </summary>
    [JsonPropertyName("k_clusters")]
    public ulong KClusters { get; init; } = 6;

    /// <summary>
    /// Include bounding box information in hierarchy blocks.
    /// </summary>
    [JsonPropertyName("include_bbox")]
    public bool IncludeBbox { get; init; } = true;

    /// <summary>
    /// OCR coverage threshold for smart OCR triggering (0.0-1.0).
    ///
    /// Determines when OCR should be triggered based on text block coverage.
    /// OCR is triggered when text blocks cover less than this fraction of the page.
    /// Default: 0.5 (trigger OCR if less than 50% of page has text).
    /// The property itself starts out as <c>null</c> (unset); the 0.5 default is
    /// presumably applied by the native library in that case (TODO confirm).
    /// </summary>
    [JsonPropertyName("ocr_coverage_threshold")]
    public float? OcrCoverageThreshold { get; init; } = null;

    /// <summary>
    /// Parse a <see cref="HierarchyConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised configuration.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static HierarchyConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<HierarchyConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse HierarchyConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already in the library's exception type; do not wrap it a second time.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse HierarchyConfig from JSON: {e.Message}", e);
        }
    }

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build the default configuration by asking the native library for its default
    /// value, rendering that value to JSON and deserialising the JSON.
    /// </summary>
    /// <returns>The default configuration as reported by the native library.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static HierarchyConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.HierarchyConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.HierarchyConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config value on every path.
            NativeMethods.HierarchyConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default HierarchyConfig: native library returned no JSON");
        }

        return FromJson(json);
    }
}

View File

@@ -0,0 +1,153 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// HTML metadata extracted from HTML documents.
///
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
/// and extracted structural elements (headers, links, images, structured data).
/// </summary>
public sealed record HtmlMetadata
{
    /// <summary>Reader options shared by every <see cref="FromJson"/> call.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI: null-valued
    /// members are omitted, while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Title of the document, taken from the `&lt;title&gt;` tag.
    /// </summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>
    /// Description of the document, taken from the `&lt;meta name="description"&gt;` tag.
    /// </summary>
    [JsonPropertyName("description")]
    public string? Description { get; init; }

    /// <summary>
    /// Keywords from the `&lt;meta name="keywords"&gt;` tag, split on commas.
    /// </summary>
    [JsonPropertyName("keywords")]
    public List<string> Keywords { get; init; } = new();

    /// <summary>
    /// Author of the document, taken from the `&lt;meta name="author"&gt;` tag.
    /// </summary>
    [JsonPropertyName("author")]
    public string? Author { get; init; }

    /// <summary>
    /// Canonical URL, taken from the `&lt;link rel="canonical"&gt;` tag.
    /// </summary>
    [JsonPropertyName("canonical_url")]
    public string? CanonicalUrl { get; init; }

    /// <summary>
    /// Base URL from the `&lt;base href=""&gt;` tag, used to resolve relative URLs.
    /// </summary>
    [JsonPropertyName("base_href")]
    public string? BaseHref { get; init; }

    /// <summary>
    /// Language of the document, taken from the `lang` attribute.
    /// </summary>
    [JsonPropertyName("language")]
    public string? Language { get; init; }

    /// <summary>
    /// Text direction of the document, taken from the `dir` attribute.
    /// </summary>
    [JsonPropertyName("text_direction")]
    [JsonConverter(typeof(TextDirectionJsonConverter))]
    public TextDirection? TextDirection { get; init; }

    /// <summary>
    /// Open Graph metadata (og:* properties) for social media.
    /// Keys are names such as "title", "description", "image", "url".
    /// </summary>
    [JsonPropertyName("open_graph")]
    public Dictionary<string, string> OpenGraph { get; init; } = new();

    /// <summary>
    /// Twitter Card metadata (twitter:* properties).
    /// Keys are names such as "card", "site", "creator", "title", "description", "image".
    /// </summary>
    [JsonPropertyName("twitter_card")]
    public Dictionary<string, string> TwitterCard { get; init; } = new();

    /// <summary>
    /// Meta tags that no dedicated field covers.
    /// Keys are the meta name/property attributes; values are their content.
    /// </summary>
    [JsonPropertyName("meta_tags")]
    public Dictionary<string, string> MetaTags { get; init; } = new();

    /// <summary>
    /// Header elements found in the document, with hierarchy.
    /// </summary>
    [JsonPropertyName("headers")]
    public List<HeaderMetadata> Headers { get; init; } = new();

    /// <summary>
    /// Hyperlinks found in the document, with type classification.
    /// </summary>
    [JsonPropertyName("links")]
    public List<LinkMetadata> Links { get; init; } = new();

    /// <summary>
    /// Images found in the document, with source and dimensions.
    /// </summary>
    [JsonPropertyName("images")]
    public List<ImageMetadataType> Images { get; init; } = new();

    /// <summary>
    /// Structured data blocks found in the document.
    /// </summary>
    [JsonPropertyName("structured_data")]
    public List<StructuredData> StructuredData { get; init; } = new();

    /// <summary>
    /// Deserialise a <see cref="HtmlMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised metadata.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static HtmlMetadata FromJson(string json)
    {
        HtmlMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<HtmlMetadata>(json, JsonOptions);
        }
        catch (Exception failure) when (failure is not KreuzbergException)
        {
            // Library exceptions pass through untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse HtmlMetadata from JSON: {failure.Message}", failure);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse HtmlMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,110 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Configuration for styled HTML output.
///
/// When set on `ExtractionConfig.html_output` alongside
/// `output_format = OutputFormat.Html`, the pipeline builds a
/// `StyledHtmlRenderer`(crate.rendering.StyledHtmlRenderer) instead of
/// the plain comrak-based renderer.
/// </summary>
public sealed record HtmlOutputConfig
{
    /// <summary>
    /// Inline CSS string injected into the output after the theme stylesheet.
    /// Concatenated after `css_file` content when both are set.
    /// </summary>
    [JsonPropertyName("css")]
    public string? Css { get; init; } = null;

    /// <summary>
    /// Path to a CSS file loaded once at renderer construction time.
    /// Concatenated before `css` when both are set.
    /// </summary>
    [JsonPropertyName("css_file")]
    public string? CssFile { get; init; } = null;

    /// <summary>
    /// Built-in colour/typography theme. Default: `HtmlTheme.Unstyled`.
    /// </summary>
    [JsonPropertyName("theme")]
    public HtmlTheme Theme { get; init; } = HtmlTheme.Unstyled;

    /// <summary>
    /// CSS class prefix applied to every emitted class name.
    ///
    /// Default: `"kb-"`. Change this if your host application already uses
    /// classes that start with `kb-`.
    /// </summary>
    [JsonPropertyName("class_prefix")]
    public string ClassPrefix { get; init; } = "kb-";

    /// <summary>
    /// When `true` (default), write the resolved CSS into a `&lt;style&gt;` block
    /// immediately after the opening `&lt;div class="{prefix}doc"&gt;`.
    ///
    /// Set to `false` to emit only the structural markup and wire up your
    /// own stylesheet targeting the `kb-*` class names.
    /// </summary>
    [JsonPropertyName("embed_css")]
    public bool EmbedCss { get; init; } = true;

    /// <summary>
    /// Parse a <see cref="HtmlOutputConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised configuration.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static HtmlOutputConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<HtmlOutputConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse HtmlOutputConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already in the library's exception type; do not wrap it a second time.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse HtmlOutputConfig from JSON: {e.Message}", e);
        }
    }

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build the default configuration by asking the native library for its default
    /// value, rendering that value to JSON and deserialising the JSON.
    /// </summary>
    /// <returns>The default configuration as reported by the native library.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static HtmlOutputConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.HtmlOutputConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.HtmlOutputConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config value on every path.
            NativeMethods.HtmlOutputConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default HtmlOutputConfig: native library returned no JSON");
        }

        return FromJson(json);
    }
}

View File

@@ -0,0 +1,79 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Built-in HTML theme selection.
/// </summary>
[JsonConverter(typeof(HtmlThemeJsonConverter))]
public enum HtmlTheme
{
    /// <summary>
    /// Sensible defaults: system font stack, neutral colours, readable line
    /// measure. Every CSS custom property (`--kb-*`) is defined, so user CSS
    /// can override individual values.
    /// </summary>
    [JsonPropertyName("default")]
    Default = 0,

    /// <summary>
    /// Palette and spacing inspired by GitHub Markdown.
    /// </summary>
    [JsonPropertyName("github")]
    GitHub = 1,

    /// <summary>
    /// Light text on a dark background.
    /// </summary>
    [JsonPropertyName("dark")]
    Dark = 2,

    /// <summary>
    /// Minimal light theme with generous whitespace.
    /// </summary>
    [JsonPropertyName("light")]
    Light = 3,

    /// <summary>
    /// No built-in stylesheet is emitted. CSS custom properties are still defined
    /// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
    /// </summary>
    [JsonPropertyName("unstyled")]
    Unstyled = 4,
}

/// <summary>
/// JSON converter for <see cref="HtmlTheme"/> that maps each member to its explicit
/// lowercase wire name and rejects anything else with a <see cref="JsonException"/>.
/// </summary>
internal sealed class HtmlThemeJsonConverter : JsonConverter<HtmlTheme>
{
    /// <summary>Read a JSON string token and translate it to the matching theme.</summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override HtmlTheme Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "default":
                return HtmlTheme.Default;
            case "github":
                return HtmlTheme.GitHub;
            case "dark":
                return HtmlTheme.Dark;
            case "light":
                return HtmlTheme.Light;
            case "unstyled":
                return HtmlTheme.Unstyled;
            default:
                throw new JsonException($"Unknown HtmlTheme value: {wireName}");
        }
    }

    /// <summary>Write the theme as its wire-name string.</summary>
    /// <exception cref="JsonException">When the value is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, HtmlTheme value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case HtmlTheme.Default:
                wireName = "default";
                break;
            case HtmlTheme.GitHub:
                wireName = "github";
                break;
            case HtmlTheme.Dark:
                wireName = "dark";
                break;
            case HtmlTheme.Light:
                wireName = "light";
                break;
            case HtmlTheme.Unstyled:
                wireName = "unstyled";
                break;
            default:
                // Nothing has been written yet when an undeclared value is rejected.
                throw new JsonException($"Unknown HtmlTheme value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,172 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Image extraction configuration.
/// </summary>
public sealed record ImageExtractionConfig
{
    /// <summary>
    /// Extract images from documents.
    /// </summary>
    [JsonPropertyName("extract_images")]
    public bool ExtractImages { get; init; } = true;

    /// <summary>
    /// Target DPI for image normalization.
    /// </summary>
    [JsonPropertyName("target_dpi")]
    public int TargetDpi { get; init; } = 300;

    /// <summary>
    /// Maximum dimension for images (width or height).
    /// </summary>
    [JsonPropertyName("max_image_dimension")]
    public int MaxImageDimension { get; init; } = 4096;

    /// <summary>
    /// Whether to inject image reference placeholders into markdown output.
    /// When `true` (default), image references like `![Image 1](embedded:p1_i0)`
    /// are appended to the markdown. Set to `false` to extract images as data
    /// without polluting the markdown output.
    /// </summary>
    [JsonPropertyName("inject_placeholders")]
    public bool InjectPlaceholders { get; init; } = true;

    /// <summary>
    /// Automatically adjust DPI based on image content.
    /// </summary>
    [JsonPropertyName("auto_adjust_dpi")]
    public bool AutoAdjustDpi { get; init; } = true;

    /// <summary>
    /// Minimum DPI threshold.
    /// </summary>
    [JsonPropertyName("min_dpi")]
    public int MinDpi { get; init; } = 72;

    /// <summary>
    /// Maximum DPI threshold.
    /// </summary>
    [JsonPropertyName("max_dpi")]
    public int MaxDpi { get; init; } = 600;

    /// <summary>
    /// Maximum number of image objects to extract per PDF page.
    ///
    /// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
    /// can trigger extremely long or indefinite extraction times when every image
    /// object on a dense page is decoded individually via the PDF extractor. Setting this
    /// limit causes kreuzberg to stop collecting individual images once the count
    /// per page reaches the cap and emit a warning instead.
    ///
    /// `null` (default) means no limit — all images are extracted.
    /// </summary>
    [JsonPropertyName("max_images_per_page")]
    public uint? MaxImagesPerPage { get; init; } = null;

    /// <summary>
    /// When `true` (default), extracted images are classified by kind and grouped
    /// into clusters where they appear to belong to one figure.
    /// </summary>
    [JsonPropertyName("classify")]
    public bool Classify { get; init; } = true;

    /// <summary>
    /// When `true`, full-page renders produced during OCR preprocessing are captured
    /// and returned as `ImageKind.PageRaster` entries in `ExtractionResult.images`.
    ///
    /// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
    /// document-level OCR bypass is active (whole-document backend). When OCR is
    /// enabled and this flag is set but the active backend skips per-page rendering,
    /// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
    ///
    /// Defaults to `false`. Enable when downstream consumers need page thumbnails
    /// (e.g. citation previews, visual grounding).
    /// </summary>
    [JsonPropertyName("include_page_rasters")]
    public bool IncludePageRasters { get; init; } = false;

    /// <summary>
    /// Run OCR on extracted images and include the recognized text in the document content.
    ///
    /// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
    /// are processed with the configured OCR backend. Set to `false` to extract images
    /// without OCR processing, even when OCR is enabled.
    /// </summary>
    [JsonPropertyName("run_ocr_on_images")]
    public bool RunOcrOnImages { get; init; } = true;

    /// <summary>
    /// When `true`, image OCR results are rendered as plain text without the
    /// `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
    /// is also `true`.
    /// </summary>
    [JsonPropertyName("ocr_text_only")]
    public bool OcrTextOnly { get; init; } = false;

    /// <summary>
    /// When `true` and `ocr_text_only` is `false`, append the OCR text after
    /// the image placeholder in the rendered output.
    /// </summary>
    [JsonPropertyName("append_ocr_text")]
    public bool AppendOcrText { get; init; } = false;

    /// <summary>
    /// Parse a <see cref="ImageExtractionConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised configuration.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImageExtractionConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ImageExtractionConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ImageExtractionConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already in the library's exception type; do not wrap it a second time.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ImageExtractionConfig from JSON: {e.Message}", e);
        }
    }

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build the default configuration by asking the native library for its default
    /// value, rendering that value to JSON and deserialising the JSON.
    /// </summary>
    /// <returns>The default configuration as reported by the native library.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static ImageExtractionConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.ImageExtractionConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ImageExtractionConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config value on every path.
            NativeMethods.ImageExtractionConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default ImageExtractionConfig: native library returned no JSON");
        }

        return FromJson(json);
    }
}

125
packages/csharp/src/Kreuzberg/ImageKind.cs generated Normal file
View File

@@ -0,0 +1,125 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Heuristic classification of what an image likely depicts.
/// </summary>
[JsonConverter(typeof(ImageKindJsonConverter))]
public enum ImageKind
{
    /// <summary>
    /// A photograph or natural scene.
    /// </summary>
    [JsonPropertyName("photograph")]
    Photograph = 0,

    /// <summary>
    /// A technical or schematic diagram.
    /// </summary>
    [JsonPropertyName("diagram")]
    Diagram = 1,

    /// <summary>
    /// A chart, graph, or plot.
    /// </summary>
    [JsonPropertyName("chart")]
    Chart = 2,

    /// <summary>
    /// A freehand or technical drawing.
    /// </summary>
    [JsonPropertyName("drawing")]
    Drawing = 3,

    /// <summary>
    /// A text-heavy image (scanned text, document).
    /// </summary>
    [JsonPropertyName("text_block")]
    TextBlock = 4,

    /// <summary>
    /// A decorative element or border.
    /// </summary>
    [JsonPropertyName("decoration")]
    Decoration = 5,

    /// <summary>
    /// A logo or brand mark.
    /// </summary>
    [JsonPropertyName("logo")]
    Logo = 6,

    /// <summary>
    /// A small icon.
    /// </summary>
    [JsonPropertyName("icon")]
    Icon = 7,

    /// <summary>
    /// A fragment of a larger tiled image (tile of a technical drawing).
    /// </summary>
    [JsonPropertyName("tile_fragment")]
    TileFragment = 8,

    /// <summary>
    /// A mask or transparency map.
    /// </summary>
    [JsonPropertyName("mask")]
    Mask = 9,

    /// <summary>
    /// A full-page render produced during OCR preprocessing; used as a citation thumbnail.
    /// </summary>
    [JsonPropertyName("page_raster")]
    PageRaster = 10,

    /// <summary>
    /// The image could not be classified with reasonable confidence.
    /// </summary>
    [JsonPropertyName("unknown")]
    Unknown = 11,
}

/// <summary>
/// JSON converter for <see cref="ImageKind"/> that maps each member to its explicit
/// snake_case wire name and rejects anything else with a <see cref="JsonException"/>.
/// </summary>
internal sealed class ImageKindJsonConverter : JsonConverter<ImageKind>
{
    /// <summary>Read a JSON string token and translate it to the matching kind.</summary>
    /// <exception cref="JsonException">When the string is not a known wire name.</exception>
    public override ImageKind Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? wireName = reader.GetString();
        switch (wireName)
        {
            case "photograph":
                return ImageKind.Photograph;
            case "diagram":
                return ImageKind.Diagram;
            case "chart":
                return ImageKind.Chart;
            case "drawing":
                return ImageKind.Drawing;
            case "text_block":
                return ImageKind.TextBlock;
            case "decoration":
                return ImageKind.Decoration;
            case "logo":
                return ImageKind.Logo;
            case "icon":
                return ImageKind.Icon;
            case "tile_fragment":
                return ImageKind.TileFragment;
            case "mask":
                return ImageKind.Mask;
            case "page_raster":
                return ImageKind.PageRaster;
            case "unknown":
                return ImageKind.Unknown;
            default:
                throw new JsonException($"Unknown ImageKind value: {wireName}");
        }
    }

    /// <summary>Write the kind as its wire-name string.</summary>
    /// <exception cref="JsonException">When the value is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ImageKind value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ImageKind.Photograph:
                wireName = "photograph";
                break;
            case ImageKind.Diagram:
                wireName = "diagram";
                break;
            case ImageKind.Chart:
                wireName = "chart";
                break;
            case ImageKind.Drawing:
                wireName = "drawing";
                break;
            case ImageKind.TextBlock:
                wireName = "text_block";
                break;
            case ImageKind.Decoration:
                wireName = "decoration";
                break;
            case ImageKind.Logo:
                wireName = "logo";
                break;
            case ImageKind.Icon:
                wireName = "icon";
                break;
            case ImageKind.TileFragment:
                wireName = "tile_fragment";
                break;
            case ImageKind.Mask:
                wireName = "mask";
                break;
            case ImageKind.PageRaster:
                wireName = "page_raster";
                break;
            case ImageKind.Unknown:
                wireName = "unknown";
                break;
            default:
                // Nothing has been written yet when an undeclared value is rejected.
                throw new JsonException($"Unknown ImageKind value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,82 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Image metadata extracted from image files.
///
/// Includes dimensions, format, and EXIF data.
/// </summary>
public sealed record ImageMetadata
{
    /// <summary>Reader options shared by every <see cref="FromJson"/> call.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI: null-valued
    /// members are omitted, while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Width of the image, in pixels.
    /// </summary>
    [JsonPropertyName("width")]
    public uint Width { get; init; }

    /// <summary>
    /// Height of the image, in pixels.
    /// </summary>
    [JsonPropertyName("height")]
    public uint Height { get; init; }

    /// <summary>
    /// Format of the image (e.g., "PNG", "JPEG", "TIFF").
    /// </summary>
    [JsonPropertyName("format")]
    public string Format { get; init; } = "";

    /// <summary>
    /// EXIF metadata tags.
    /// </summary>
    [JsonPropertyName("exif")]
    public Dictionary<string, string> Exif { get; init; } = new();

    /// <summary>
    /// Deserialise a <see cref="ImageMetadata"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised metadata.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImageMetadata FromJson(string json)
    {
        ImageMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ImageMetadata>(json, JsonOptions);
        }
        catch (Exception failure) when (failure is not KreuzbergException)
        {
            // Library exceptions pass through untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse ImageMetadata from JSON: {failure.Message}", failure);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse ImageMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,93 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Image element metadata.
/// </summary>
public sealed record ImageMetadataType
{
    /// <summary>Reader options shared by every <see cref="FromJson"/> call.</summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>Options for serializing config/input objects to FFI: null-valued
    /// members are omitted, while explicit false/0 values are still written.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Source of the image (URL, data URI, or SVG content). Must be supplied.
    /// </summary>
    [JsonPropertyName("src")]
    public required string Src { get; init; }

    /// <summary>
    /// Alternative text, taken from the alt attribute.
    /// </summary>
    [JsonPropertyName("alt")]
    public string? Alt { get; init; }

    /// <summary>
    /// Value of the title attribute.
    /// </summary>
    [JsonPropertyName("title")]
    public string? Title { get; init; }

    /// <summary>
    /// Dimensions of the image as (width, height), if available.
    /// </summary>
    [JsonPropertyName("dimensions")]
    public List<uint>? Dimensions { get; init; }

    /// <summary>
    /// Classification of the image type. Must be supplied.
    /// </summary>
    [JsonPropertyName("image_type")]
    [JsonConverter(typeof(ImageTypeJsonConverter))]
    public required ImageType ImageType { get; init; }

    /// <summary>
    /// Additional attributes as key-value pairs.
    /// </summary>
    [JsonPropertyName("attributes")]
    public List<List<string>> Attributes { get; init; } = new();

    /// <summary>
    /// Deserialise a <see cref="ImageMetadataType"/> from its JSON representation.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised image element metadata.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImageMetadataType FromJson(string json)
    {
        ImageMetadataType? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ImageMetadataType>(json, JsonOptions);
        }
        catch (Exception failure) when (failure is not KreuzbergException)
        {
            // Library exceptions pass through untouched; anything else is wrapped.
            throw new KreuzbergException($"Failed to parse ImageMetadataType from JSON: {failure.Message}", failure);
        }

        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse ImageMetadataType from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,112 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Image preprocessing configuration for OCR.
///
/// These settings control how images are preprocessed before OCR to improve
/// text recognition quality. Different preprocessing strategies work better
/// for different document types.
/// </summary>
public sealed record ImagePreprocessingConfig
{
    /// <summary>
    /// Target DPI for the image (300 is standard, 600 for small text).
    /// </summary>
    [JsonPropertyName("target_dpi")]
    public int TargetDpi { get; init; } = 300;

    /// <summary>
    /// Auto-detect and correct image rotation.
    /// </summary>
    [JsonPropertyName("auto_rotate")]
    public bool AutoRotate { get; init; } = true;

    /// <summary>
    /// Correct skew (tilted images).
    /// </summary>
    [JsonPropertyName("deskew")]
    public bool Deskew { get; init; } = true;

    /// <summary>
    /// Remove noise from the image.
    /// </summary>
    [JsonPropertyName("denoise")]
    public bool Denoise { get; init; } = false;

    /// <summary>
    /// Enhance contrast for better text visibility.
    /// </summary>
    [JsonPropertyName("contrast_enhance")]
    public bool ContrastEnhance { get; init; } = false;

    /// <summary>
    /// Binarization method: "otsu", "sauvola", "adaptive".
    /// </summary>
    [JsonPropertyName("binarization_method")]
    public string BinarizationMethod { get; init; } = "otsu";

    /// <summary>
    /// Invert colors (white text on black → black on white).
    /// </summary>
    [JsonPropertyName("invert_colors")]
    public bool InvertColors { get; init; } = false;

    /// <summary>
    /// Parse a <see cref="ImagePreprocessingConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text using the snake_case property names declared on this record.</param>
    /// <returns>The deserialised configuration.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImagePreprocessingConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<ImagePreprocessingConfig>(json, JsonOptions)
                ?? throw new KreuzbergException($"Failed to parse ImagePreprocessingConfig from JSON: deserializer returned null");
        }
        catch (KreuzbergException)
        {
            // Already in the library's exception type; do not wrap it a second time.
            throw;
        }
        catch (Exception e)
        {
            throw new KreuzbergException($"Failed to parse ImagePreprocessingConfig from JSON: {e.Message}", e);
        }
    }

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.</summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Build the default configuration by asking the native library for its default
    /// value, rendering that value to JSON and deserialising the JSON.
    /// </summary>
    /// <returns>The default configuration as reported by the native library.</returns>
    /// <exception cref="KreuzbergException">
    /// When the native library yields no JSON string or the JSON cannot be deserialised.
    /// </exception>
    public static ImagePreprocessingConfig Default()
    {
        string? json;
        var nativeResult = NativeMethods.ImagePreprocessingConfigDefault();
        try
        {
            var jsonPtr = NativeMethods.ImagePreprocessingConfigToJson(nativeResult);
            try
            {
                json = global::System.Runtime.InteropServices.Marshal.PtrToStringUTF8(jsonPtr);
            }
            finally
            {
                // Release the native string even if marshalling throws.
                NativeMethods.FreeString(jsonPtr);
            }
        }
        finally
        {
            // Release the native config value on every path.
            NativeMethods.ImagePreprocessingConfigFree(nativeResult);
        }

        if (json is null)
        {
            // Previously this path returned null from a non-nullable method.
            throw new KreuzbergException("Failed to obtain default ImagePreprocessingConfig: native library returned no JSON");
        }

        return FromJson(json);
    }
}

View File

@@ -0,0 +1,131 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Metadata about image preprocessing.
///
/// Captures which transformations an image went through during OCR preprocessing,
/// including DPI normalization, resizing, and resampling.
/// </summary>
public sealed record ImagePreprocessingMetadata
{
    /// <summary>
    /// Serializer settings used by <see cref="FromJson"/>: enum values are handled as snake_case
    /// strings, and default-valued members are skipped whenever these options are used for writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects that are handed to FFI. Null members are
    /// left out (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised), while explicit false/0 values are still
    /// written so the caller's intent is preserved. No member of this record reads this field.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Dimensions of the original image (width, height), in pixels.
    /// </summary>
    [JsonPropertyName("original_dimensions")]
    public List<ulong> OriginalDimensions { get; init; } = new();

    /// <summary>
    /// DPI of the original image (horizontal, vertical).
    /// </summary>
    [JsonPropertyName("original_dpi")]
    public List<double> OriginalDpi { get; init; } = new();

    /// <summary>
    /// Target DPI taken from the configuration.
    /// </summary>
    [JsonPropertyName("target_dpi")]
    public int TargetDpi { get; init; }

    /// <summary>
    /// Scaling factor that was applied to the image.
    /// </summary>
    [JsonPropertyName("scale_factor")]
    public double ScaleFactor { get; init; }

    /// <summary>
    /// Whether the DPI was auto-adjusted based on content.
    /// </summary>
    [JsonPropertyName("auto_adjusted")]
    public bool AutoAdjusted { get; init; }

    /// <summary>
    /// DPI after processing finished.
    /// </summary>
    [JsonPropertyName("final_dpi")]
    public int FinalDpi { get; init; }

    /// <summary>
    /// Dimensions after resizing; null unless the image was resized.
    /// </summary>
    [JsonPropertyName("new_dimensions")]
    public List<ulong>? NewDimensions { get; init; }

    /// <summary>
    /// Name of the resampling algorithm that was used ("LANCZOS3", "CATMULLROM", etc.).
    /// </summary>
    [JsonPropertyName("resample_method")]
    public required string ResampleMethod { get; init; }

    /// <summary>
    /// Whether the dimensions were clamped to max_image_dimension.
    /// </summary>
    [JsonPropertyName("dimension_clamped")]
    public bool DimensionClamped { get; init; }

    /// <summary>
    /// Calculated optimal DPI (only when auto_adjust_dpi is enabled).
    /// </summary>
    [JsonPropertyName("calculated_dpi")]
    public int? CalculatedDpi { get; init; }

    /// <summary>
    /// Whether resizing was skipped because the dimensions were already optimal.
    /// </summary>
    [JsonPropertyName("skipped_resize")]
    public bool SkippedResize { get; init; }

    /// <summary>
    /// Error message when resizing failed; null otherwise.
    /// </summary>
    [JsonPropertyName("resize_error")]
    public string? ResizeError { get; init; }

    /// <summary>
    /// Deserialise an <see cref="ImagePreprocessingMetadata"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static ImagePreprocessingMetadata FromJson(string json)
    {
        ImagePreprocessingMetadata? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<ImagePreprocessingMetadata>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap foreign failures, keeping the cause; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse ImagePreprocessingMetadata from JSON: {e.Message}", e);
        }

        // A JSON "null" literal deserialises to null; treat it as a parse failure.
        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse ImagePreprocessingMetadata from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception derived from <see cref="KreuzbergErrorException"/>. Judging by its name it signals
/// image-processing failures; the sites that throw it are not part of this file.
/// </summary>
public class ImageProcessingException : KreuzbergErrorException
{
    /// <summary>Creates the exception with a message.</summary>
    /// <param name="message">Text forwarded to the base constructor.</param>
    public ImageProcessingException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    /// <param name="message">Text forwarded to the base constructor.</param>
    /// <param name="innerException">Underlying cause, forwarded to the base constructor.</param>
    public ImageProcessingException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

View File

@@ -0,0 +1,69 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// Classification of an image by type.
/// </summary>
[JsonConverter(typeof(ImageTypeJsonConverter))]
public enum ImageType
{
    /// <summary>
    /// Image given as a data URI (JSON name <c>data-uri</c>).
    /// </summary>
    [JsonPropertyName("data-uri")]
    DataUri,

    /// <summary>
    /// SVG embedded inline (JSON name <c>inline-svg</c>).
    /// </summary>
    [JsonPropertyName("inline-svg")]
    InlineSvg,

    /// <summary>
    /// Image referenced by an external URL (JSON name <c>external</c>).
    /// </summary>
    [JsonPropertyName("external")]
    External,

    /// <summary>
    /// Image referenced by a relative path (JSON name <c>relative</c>).
    /// </summary>
    [JsonPropertyName("relative")]
    Relative,
}

/// <summary>
/// JSON converter for <see cref="ImageType"/> that reads and writes the explicit variant names
/// (<c>data-uri</c>, <c>inline-svg</c>, <c>external</c>, <c>relative</c>) as JSON strings.
/// </summary>
/// <remarks>
/// NOTE(review): System.Text.Json prefers converters registered in
/// <c>JsonSerializerOptions.Converters</c> over a type-level <c>[JsonConverter]</c> attribute, so
/// options that register a <c>JsonStringEnumConverter</c> would presumably bypass this class and
/// its hyphenated names. Confirm against the callers that serialise <see cref="ImageType"/>.
/// </remarks>
internal sealed class ImageTypeJsonConverter : JsonConverter<ImageType>
{
    /// <summary>Maps the JSON string at the reader's position to an <see cref="ImageType"/>.</summary>
    /// <exception cref="JsonException">When the string is not one of the known variant names.</exception>
    public override ImageType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? text = reader.GetString();
        switch (text)
        {
            case "data-uri":
                return ImageType.DataUri;
            case "inline-svg":
                return ImageType.InlineSvg;
            case "external":
                return ImageType.External;
            case "relative":
                return ImageType.Relative;
            default:
                throw new JsonException($"Unknown ImageType value: {text}");
        }
    }

    /// <summary>Writes the variant name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, ImageType value, JsonSerializerOptions options)
    {
        string wireName;
        switch (value)
        {
            case ImageType.DataUri:
                wireName = "data-uri";
                break;
            case ImageType.InlineSvg:
                wireName = "inline-svg";
                break;
            case ImageType.External:
                wireName = "external";
                break;
            case ImageType.Relative:
                wireName = "relative";
                break;
            default:
                throw new JsonException($"Unknown ImageType value: {value}");
        }

        writer.WriteStringValue(wireName);
    }
}

View File

@@ -0,0 +1,82 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// An inline element inside a block.
///
/// Models text together with formatting, links, images, etc.
/// </summary>
public sealed record InlineElement
{
    /// <summary>
    /// Serializer settings used by <see cref="FromJson"/>: a snake_case string enum converter is
    /// registered, and default-valued members are skipped whenever these options are used for writing.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    };

    /// <summary>
    /// Settings for serializing config/input objects that are handed to FFI. Null members are
    /// left out (nullable C# fields default to null and would otherwise override required Rust
    /// fields with nulls that cannot be deserialised), while explicit false/0 values are still
    /// written so the caller's intent is preserved. No member of this record reads this field.
    /// </summary>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new JsonSerializerOptions
    {
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>
    /// Which kind of inline element this is.
    /// </summary>
    [JsonPropertyName("element_type")]
    public required InlineType ElementType { get; init; }

    /// <summary>
    /// The element's text content.
    /// </summary>
    [JsonPropertyName("content")]
    public required string Content { get; init; }

    /// <summary>
    /// Attributes of the element, if any.
    /// </summary>
    [JsonPropertyName("attributes")]
    public string? Attributes { get; init; }

    /// <summary>
    /// Extra metadata (e.g., href for links, src/alt for images), if any.
    /// </summary>
    [JsonPropertyName("metadata")]
    public Dictionary<string, string>? Metadata { get; init; }

    /// <summary>
    /// Deserialise an <see cref="InlineElement"/> from its JSON text.
    /// </summary>
    /// <param name="json">JSON document to parse.</param>
    /// <returns>The parsed instance; never null.</returns>
    /// <exception cref="KreuzbergException">When the JSON cannot be deserialised.</exception>
    public static InlineElement FromJson(string json)
    {
        InlineElement? parsed;
        try
        {
            parsed = JsonSerializer.Deserialize<InlineElement>(json, JsonOptions);
        }
        catch (Exception e) when (e is not KreuzbergException)
        {
            // Wrap foreign failures, keeping the cause; a KreuzbergException passes through untouched.
            throw new KreuzbergException($"Failed to parse InlineElement from JSON: {e.Message}", e);
        }

        // A JSON "null" literal deserialises to null; treat it as a parse failure.
        if (parsed is null)
        {
            throw new KreuzbergException("Failed to parse InlineElement from JSON: deserializer returned null");
        }

        return parsed;
    }
}

View File

@@ -0,0 +1,105 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace Kreuzberg;
/// <summary>
/// The kinds of inline element found in Djot.
/// </summary>
[JsonConverter(typeof(InlineTypeJsonConverter))]
public enum InlineType
{
    /// <summary>JSON name <c>text</c>.</summary>
    [JsonPropertyName("text")]
    Text,

    /// <summary>JSON name <c>strong</c>.</summary>
    [JsonPropertyName("strong")]
    Strong,

    /// <summary>JSON name <c>emphasis</c>.</summary>
    [JsonPropertyName("emphasis")]
    Emphasis,

    /// <summary>JSON name <c>highlight</c>.</summary>
    [JsonPropertyName("highlight")]
    Highlight,

    /// <summary>JSON name <c>subscript</c>.</summary>
    [JsonPropertyName("subscript")]
    Subscript,

    /// <summary>JSON name <c>superscript</c>.</summary>
    [JsonPropertyName("superscript")]
    Superscript,

    /// <summary>JSON name <c>insert</c>.</summary>
    [JsonPropertyName("insert")]
    Insert,

    /// <summary>JSON name <c>delete</c>.</summary>
    [JsonPropertyName("delete")]
    Delete,

    /// <summary>JSON name <c>code</c>.</summary>
    [JsonPropertyName("code")]
    Code,

    /// <summary>JSON name <c>link</c>.</summary>
    [JsonPropertyName("link")]
    Link,

    /// <summary>JSON name <c>image</c>.</summary>
    [JsonPropertyName("image")]
    Image,

    /// <summary>JSON name <c>span</c>.</summary>
    [JsonPropertyName("span")]
    Span,

    /// <summary>JSON name <c>math</c>.</summary>
    [JsonPropertyName("math")]
    Math,

    /// <summary>JSON name <c>raw_inline</c>.</summary>
    [JsonPropertyName("raw_inline")]
    RawInline,

    /// <summary>JSON name <c>footnote_ref</c>.</summary>
    [JsonPropertyName("footnote_ref")]
    FootnoteRef,

    /// <summary>JSON name <c>symbol</c>.</summary>
    [JsonPropertyName("symbol")]
    Symbol,
}

/// <summary>
/// JSON converter for <see cref="InlineType"/> that reads and writes the explicit variant names
/// as JSON strings.
/// </summary>
internal sealed class InlineTypeJsonConverter : JsonConverter<InlineType>
{
    /// <summary>Maps the JSON string at the reader's position to an <see cref="InlineType"/>.</summary>
    /// <exception cref="JsonException">When the string is not one of the known variant names.</exception>
    public override InlineType Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        string? text = reader.GetString();
        switch (text)
        {
            case "text":
                return InlineType.Text;
            case "strong":
                return InlineType.Strong;
            case "emphasis":
                return InlineType.Emphasis;
            case "highlight":
                return InlineType.Highlight;
            case "subscript":
                return InlineType.Subscript;
            case "superscript":
                return InlineType.Superscript;
            case "insert":
                return InlineType.Insert;
            case "delete":
                return InlineType.Delete;
            case "code":
                return InlineType.Code;
            case "link":
                return InlineType.Link;
            case "image":
                return InlineType.Image;
            case "span":
                return InlineType.Span;
            case "math":
                return InlineType.Math;
            case "raw_inline":
                return InlineType.RawInline;
            case "footnote_ref":
                return InlineType.FootnoteRef;
            case "symbol":
                return InlineType.Symbol;
            default:
                throw new JsonException($"Unknown InlineType value: {text}");
        }
    }

    /// <summary>Writes the variant name of <paramref name="value"/> as a JSON string.</summary>
    /// <exception cref="JsonException">When <paramref name="value"/> is not a declared member.</exception>
    public override void Write(Utf8JsonWriter writer, InlineType value, JsonSerializerOptions options)
        => writer.WriteStringValue(ToWireName(value));

    /// <summary>Returns the JSON name for a declared member, or throws for any other value.</summary>
    private static string ToWireName(InlineType value) => value switch
    {
        InlineType.Text => "text",
        InlineType.Strong => "strong",
        InlineType.Emphasis => "emphasis",
        InlineType.Highlight => "highlight",
        InlineType.Subscript => "subscript",
        InlineType.Superscript => "superscript",
        InlineType.Insert => "insert",
        InlineType.Delete => "delete",
        InlineType.Code => "code",
        InlineType.Link => "link",
        InlineType.Image => "image",
        InlineType.Span => "span",
        InlineType.Math => "math",
        InlineType.RawInline => "raw_inline",
        InlineType.FootnoteRef => "footnote_ref",
        InlineType.Symbol => "symbol",
        _ => throw new JsonException($"Unknown InlineType value: {value}"),
    };
}

View File

@@ -0,0 +1,14 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
#nullable enable
using System;
namespace Kreuzberg;
/// <summary>
/// Exception derived from <see cref="KreuzbergErrorException"/>. Judging by its name it signals
/// I/O failures; the sites that throw it are not part of this file.
/// </summary>
public class IoException : KreuzbergErrorException
{
    /// <summary>Creates the exception with a message.</summary>
    /// <param name="message">Text forwarded to the base constructor.</param>
    public IoException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
    /// <param name="message">Text forwarded to the base constructor.</param>
    /// <param name="innerException">Underlying cause, forwarded to the base constructor.</param>
    public IoException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}

Some files were not shown because too many files have changed in this diff Show More