This commit is contained in:
304
.ai-rulez/.generated-manifest.json
Normal file
304
.ai-rulez/.generated-manifest.json
Normal file
@@ -0,0 +1,304 @@
|
||||
{
|
||||
"version": "1",
|
||||
"files": [
|
||||
".agents/agents/c-ffi-specialist.md",
|
||||
".agents/agents/code-reviewer.md",
|
||||
".agents/agents/csharp-specialist.md",
|
||||
".agents/agents/dart-specialist.md",
|
||||
".agents/agents/devops-engineer.md",
|
||||
".agents/agents/docs-writer.md",
|
||||
".agents/agents/e2e-generator-engineer.md",
|
||||
".agents/agents/elixir-specialist.md",
|
||||
".agents/agents/extraction-engineer.md",
|
||||
".agents/agents/ffi-engineer.md",
|
||||
".agents/agents/go-specialist.md",
|
||||
".agents/agents/java-specialist.md",
|
||||
".agents/agents/jni-specialist.md",
|
||||
".agents/agents/kotlin-android-specialist.md",
|
||||
".agents/agents/kreuzberg-developer.md",
|
||||
".agents/agents/ocr-engineer.md",
|
||||
".agents/agents/performance-engineer.md",
|
||||
".agents/agents/php-specialist.md",
|
||||
".agents/agents/plugin-engineer.md",
|
||||
".agents/agents/polyglot-architect.md",
|
||||
".agents/agents/python-specialist.md",
|
||||
".agents/agents/r-specialist.md",
|
||||
".agents/agents/release-engineer.md",
|
||||
".agents/agents/ruby-specialist.md",
|
||||
".agents/agents/rust-core-engineer.md",
|
||||
".agents/agents/security-auditor.md",
|
||||
".agents/agents/swift-specialist.md",
|
||||
".agents/agents/test-writer.md",
|
||||
".agents/agents/typescript-specialist.md",
|
||||
".agents/agents/wasm-specialist.md",
|
||||
".agents/agents/zig-specialist.md",
|
||||
".agents/settings.json",
|
||||
".agents/skills/add-language-generator/SKILL.md",
|
||||
".agents/skills/alef/SKILL.md",
|
||||
".agents/skills/alef/references/adapters.md",
|
||||
".agents/skills/alef/references/backends.md",
|
||||
".agents/skills/alef/references/cli-reference.md",
|
||||
".agents/skills/alef/references/configuration.md",
|
||||
".agents/skills/alef/references/designing-alef-toml.md",
|
||||
".agents/skills/alef/references/e2e-testing.md",
|
||||
".agents/skills/alef/references/troubleshooting.md",
|
||||
".agents/skills/api-server-mcp/SKILL.md",
|
||||
".agents/skills/chunking-embeddings/SKILL.md",
|
||||
".agents/skills/common-task-commands/SKILL.md",
|
||||
".agents/skills/create-e2e-fixture/SKILL.md",
|
||||
".agents/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".agents/skills/format-specific-extraction/SKILL.md",
|
||||
".agents/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".agents/skills/quick-start/SKILL.md",
|
||||
".claude/agents/c-ffi-specialist.md",
|
||||
".claude/agents/code-reviewer.md",
|
||||
".claude/agents/csharp-specialist.md",
|
||||
".claude/agents/dart-specialist.md",
|
||||
".claude/agents/devops-engineer.md",
|
||||
".claude/agents/docs-writer.md",
|
||||
".claude/agents/e2e-generator-engineer.md",
|
||||
".claude/agents/elixir-specialist.md",
|
||||
".claude/agents/extraction-engineer.md",
|
||||
".claude/agents/ffi-engineer.md",
|
||||
".claude/agents/go-specialist.md",
|
||||
".claude/agents/java-specialist.md",
|
||||
".claude/agents/jni-specialist.md",
|
||||
".claude/agents/kotlin-android-specialist.md",
|
||||
".claude/agents/kreuzberg-developer.md",
|
||||
".claude/agents/ocr-engineer.md",
|
||||
".claude/agents/performance-engineer.md",
|
||||
".claude/agents/php-specialist.md",
|
||||
".claude/agents/plugin-engineer.md",
|
||||
".claude/agents/polyglot-architect.md",
|
||||
".claude/agents/python-specialist.md",
|
||||
".claude/agents/r-specialist.md",
|
||||
".claude/agents/release-engineer.md",
|
||||
".claude/agents/ruby-specialist.md",
|
||||
".claude/agents/rust-core-engineer.md",
|
||||
".claude/agents/security-auditor.md",
|
||||
".claude/agents/swift-specialist.md",
|
||||
".claude/agents/test-writer.md",
|
||||
".claude/agents/typescript-specialist.md",
|
||||
".claude/agents/wasm-specialist.md",
|
||||
".claude/agents/zig-specialist.md",
|
||||
".claude/settings.json",
|
||||
".claude/skills/add-language-generator/SKILL.md",
|
||||
".claude/skills/alef/SKILL.md",
|
||||
".claude/skills/alef/references/adapters.md",
|
||||
".claude/skills/alef/references/backends.md",
|
||||
".claude/skills/alef/references/cli-reference.md",
|
||||
".claude/skills/alef/references/configuration.md",
|
||||
".claude/skills/alef/references/designing-alef-toml.md",
|
||||
".claude/skills/alef/references/e2e-testing.md",
|
||||
".claude/skills/alef/references/troubleshooting.md",
|
||||
".claude/skills/api-server-mcp/SKILL.md",
|
||||
".claude/skills/chunking-embeddings/SKILL.md",
|
||||
".claude/skills/common-task-commands/SKILL.md",
|
||||
".claude/skills/create-e2e-fixture/SKILL.md",
|
||||
".claude/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".claude/skills/format-specific-extraction/SKILL.md",
|
||||
".claude/skills/iterate/SKILL.md",
|
||||
".claude/skills/parallelize/SKILL.md",
|
||||
".claude/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".claude/skills/quick-start/SKILL.md",
|
||||
".codex/agents/c-ffi-specialist.toml",
|
||||
".codex/agents/code-reviewer.toml",
|
||||
".codex/agents/csharp-specialist.toml",
|
||||
".codex/agents/dart-specialist.toml",
|
||||
".codex/agents/devops-engineer.toml",
|
||||
".codex/agents/docs-writer.toml",
|
||||
".codex/agents/e2e-generator-engineer.toml",
|
||||
".codex/agents/elixir-specialist.toml",
|
||||
".codex/agents/extraction-engineer.toml",
|
||||
".codex/agents/ffi-engineer.toml",
|
||||
".codex/agents/go-specialist.toml",
|
||||
".codex/agents/java-specialist.toml",
|
||||
".codex/agents/jni-specialist.toml",
|
||||
".codex/agents/kotlin-android-specialist.toml",
|
||||
".codex/agents/kreuzberg-developer.toml",
|
||||
".codex/agents/ocr-engineer.toml",
|
||||
".codex/agents/performance-engineer.toml",
|
||||
".codex/agents/php-specialist.toml",
|
||||
".codex/agents/plugin-engineer.toml",
|
||||
".codex/agents/polyglot-architect.toml",
|
||||
".codex/agents/python-specialist.toml",
|
||||
".codex/agents/r-specialist.toml",
|
||||
".codex/agents/release-engineer.toml",
|
||||
".codex/agents/ruby-specialist.toml",
|
||||
".codex/agents/rust-core-engineer.toml",
|
||||
".codex/agents/security-auditor.toml",
|
||||
".codex/agents/swift-specialist.toml",
|
||||
".codex/agents/test-writer.toml",
|
||||
".codex/agents/typescript-specialist.toml",
|
||||
".codex/agents/wasm-specialist.toml",
|
||||
".codex/agents/zig-specialist.toml",
|
||||
".codex/commands/iterate.md",
|
||||
".codex/commands/parallelize.md",
|
||||
".codex/skills/add-language-generator/SKILL.md",
|
||||
".codex/skills/alef/SKILL.md",
|
||||
".codex/skills/alef/references/adapters.md",
|
||||
".codex/skills/alef/references/backends.md",
|
||||
".codex/skills/alef/references/cli-reference.md",
|
||||
".codex/skills/alef/references/configuration.md",
|
||||
".codex/skills/alef/references/designing-alef-toml.md",
|
||||
".codex/skills/alef/references/e2e-testing.md",
|
||||
".codex/skills/alef/references/troubleshooting.md",
|
||||
".codex/skills/api-server-mcp/SKILL.md",
|
||||
".codex/skills/chunking-embeddings/SKILL.md",
|
||||
".codex/skills/common-task-commands/SKILL.md",
|
||||
".codex/skills/create-e2e-fixture/SKILL.md",
|
||||
".codex/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".codex/skills/format-specific-extraction/SKILL.md",
|
||||
".codex/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".codex/skills/quick-start/SKILL.md",
|
||||
".cursor/commands/iterate.md",
|
||||
".cursor/commands/parallelize.md",
|
||||
".cursor/rules/agent-workflow.mdc",
|
||||
".cursor/rules/alef-generated-bindings.mdc",
|
||||
".cursor/rules/alef-workflow.mdc",
|
||||
".cursor/rules/anti-patterns.mdc",
|
||||
".cursor/rules/api-compatibility.mdc",
|
||||
".cursor/rules/async-and-concurrency.mdc",
|
||||
".cursor/rules/atomic-commits.mdc",
|
||||
".cursor/rules/avoid-duplication.mdc",
|
||||
".cursor/rules/batch-operations.mdc",
|
||||
".cursor/rules/bindings.mdc",
|
||||
".cursor/rules/branch-hygiene.mdc",
|
||||
".cursor/rules/cache-and-performance.mdc",
|
||||
".cursor/rules/cgo-bindings.mdc",
|
||||
".cursor/rules/cicd-pipeline-standards.mdc",
|
||||
".cursor/rules/commit-messages.mdc",
|
||||
".cursor/rules/communication-style.mdc",
|
||||
".cursor/rules/complexity-limits.mdc",
|
||||
".cursor/rules/containerization-docker.mdc",
|
||||
".cursor/rules/context-config-loading-precedence.mdc",
|
||||
".cursor/rules/context-crate-structure.mdc",
|
||||
".cursor/rules/context-kreuzberg-brand-and-docs.mdc",
|
||||
".cursor/rules/context-mime-detection-routing.mdc",
|
||||
".cursor/rules/context-owasp-quick-reference.mdc",
|
||||
".cursor/rules/context-polyrepo-structure.mdc",
|
||||
".cursor/rules/context-pre-commit-tooling.mdc",
|
||||
".cursor/rules/context-prek.mdc",
|
||||
".cursor/rules/context-preservation.mdc",
|
||||
".cursor/rules/context-taskfile-structure.mdc",
|
||||
".cursor/rules/context-wasm-constraints.mdc",
|
||||
".cursor/rules/csharp-conventions.mdc",
|
||||
".cursor/rules/dead-code.mdc",
|
||||
".cursor/rules/dependency-awareness.mdc",
|
||||
".cursor/rules/e2e-generator-conventions.mdc",
|
||||
".cursor/rules/elixir-conventions.mdc",
|
||||
".cursor/rules/error-handling.mdc",
|
||||
".cursor/rules/explain-reasoning.mdc",
|
||||
".cursor/rules/ext-php-rs-bindings.mdc",
|
||||
".cursor/rules/extendr-bindings.mdc",
|
||||
".cursor/rules/extraction-quality.mdc",
|
||||
".cursor/rules/extraction-safety.mdc",
|
||||
".cursor/rules/feature-flag-policy.mdc",
|
||||
".cursor/rules/ffi-and-language-interop.mdc",
|
||||
".cursor/rules/fixture-schema-design.mdc",
|
||||
".cursor/rules/gcloud-conventions.mdc",
|
||||
".cursor/rules/generated-code-policy.mdc",
|
||||
".cursor/rules/gh-workflows.mdc",
|
||||
".cursor/rules/go-conventions.mdc",
|
||||
".cursor/rules/incremental-approach.mdc",
|
||||
".cursor/rules/input-validation.mdc",
|
||||
".cursor/rules/java-conventions.mdc",
|
||||
".cursor/rules/least-privilege.mdc",
|
||||
".cursor/rules/magnus-bindings.mdc",
|
||||
".cursor/rules/meaningful-assertions.mdc",
|
||||
".cursor/rules/minimal-changes.mdc",
|
||||
".cursor/rules/monitoring-observability.mdc",
|
||||
".cursor/rules/napi-rs-bindings.mdc",
|
||||
".cursor/rules/no-ai-signatures.mdc",
|
||||
".cursor/rules/ocr-backend-standards.mdc",
|
||||
".cursor/rules/ocr-language-and-config.mdc",
|
||||
".cursor/rules/ocr-performance.mdc",
|
||||
".cursor/rules/ocr-quality.mdc",
|
||||
".cursor/rules/ocr-table-and-hocr.mdc",
|
||||
".cursor/rules/output-awareness.mdc",
|
||||
".cursor/rules/php-conventions.mdc",
|
||||
".cursor/rules/plugin-extensibility.mdc",
|
||||
".cursor/rules/plugin-interface-contract.mdc",
|
||||
".cursor/rules/plugin-registry-and-selection.mdc",
|
||||
".cursor/rules/plugin-testing.mdc",
|
||||
".cursor/rules/pyo3-bindings.mdc",
|
||||
".cursor/rules/python-conventions.mdc",
|
||||
".cursor/rules/python-ffi-plugins.mdc",
|
||||
".cursor/rules/r-conventions.mdc",
|
||||
".cursor/rules/read-before-write.mdc",
|
||||
".cursor/rules/readability-first.mdc",
|
||||
".cursor/rules/ruby-conventions.mdc",
|
||||
".cursor/rules/rust-conventions.mdc",
|
||||
".cursor/rules/rust-polyglot-conventions.mdc",
|
||||
".cursor/rules/rustler-bindings.mdc",
|
||||
".cursor/rules/safe-git-operations.mdc",
|
||||
".cursor/rules/secrets-handling.mdc",
|
||||
".cursor/rules/systematic-debugging.mdc",
|
||||
".cursor/rules/task-automation-build.mdc",
|
||||
".cursor/rules/task-runner.mdc",
|
||||
".cursor/rules/tdd-workflow.mdc",
|
||||
".cursor/rules/test-alongside-code.mdc",
|
||||
".cursor/rules/test-independence.mdc",
|
||||
".cursor/rules/test-naming.mdc",
|
||||
".cursor/rules/testing-anti-patterns.mdc",
|
||||
".cursor/rules/typescript-conventions.mdc",
|
||||
".cursor/rules/verification-before-completion.mdc",
|
||||
".cursor/rules/verify-before-acting.mdc",
|
||||
".cursor/rules/wasm-bindings.mdc",
|
||||
".github/agents/c-ffi-specialist.agent.md",
|
||||
".github/agents/code-reviewer.agent.md",
|
||||
".github/agents/csharp-specialist.agent.md",
|
||||
".github/agents/dart-specialist.agent.md",
|
||||
".github/agents/devops-engineer.agent.md",
|
||||
".github/agents/docs-writer.agent.md",
|
||||
".github/agents/e2e-generator-engineer.agent.md",
|
||||
".github/agents/elixir-specialist.agent.md",
|
||||
".github/agents/extraction-engineer.agent.md",
|
||||
".github/agents/ffi-engineer.agent.md",
|
||||
".github/agents/go-specialist.agent.md",
|
||||
".github/agents/java-specialist.agent.md",
|
||||
".github/agents/jni-specialist.agent.md",
|
||||
".github/agents/kotlin-android-specialist.agent.md",
|
||||
".github/agents/kreuzberg-developer.agent.md",
|
||||
".github/agents/ocr-engineer.agent.md",
|
||||
".github/agents/performance-engineer.agent.md",
|
||||
".github/agents/php-specialist.agent.md",
|
||||
".github/agents/plugin-engineer.agent.md",
|
||||
".github/agents/polyglot-architect.agent.md",
|
||||
".github/agents/python-specialist.agent.md",
|
||||
".github/agents/r-specialist.agent.md",
|
||||
".github/agents/release-engineer.agent.md",
|
||||
".github/agents/ruby-specialist.agent.md",
|
||||
".github/agents/rust-core-engineer.agent.md",
|
||||
".github/agents/security-auditor.agent.md",
|
||||
".github/agents/swift-specialist.agent.md",
|
||||
".github/agents/test-writer.agent.md",
|
||||
".github/agents/typescript-specialist.agent.md",
|
||||
".github/agents/wasm-specialist.agent.md",
|
||||
".github/agents/zig-specialist.agent.md",
|
||||
".github/commands/iterate.md",
|
||||
".github/commands/parallelize.md",
|
||||
".github/copilot-instructions.md",
|
||||
".github/skills/add-language-generator/SKILL.md",
|
||||
".github/skills/alef/SKILL.md",
|
||||
".github/skills/alef/references/adapters.md",
|
||||
".github/skills/alef/references/backends.md",
|
||||
".github/skills/alef/references/cli-reference.md",
|
||||
".github/skills/alef/references/configuration.md",
|
||||
".github/skills/alef/references/designing-alef-toml.md",
|
||||
".github/skills/alef/references/e2e-testing.md",
|
||||
".github/skills/alef/references/troubleshooting.md",
|
||||
".github/skills/api-server-mcp/SKILL.md",
|
||||
".github/skills/chunking-embeddings/SKILL.md",
|
||||
".github/skills/common-task-commands/SKILL.md",
|
||||
".github/skills/create-e2e-fixture/SKILL.md",
|
||||
".github/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".github/skills/format-specific-extraction/SKILL.md",
|
||||
".github/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".github/skills/quick-start/SKILL.md",
|
||||
".mcp.json",
|
||||
"AGENTS.md",
|
||||
"CLAUDE.md",
|
||||
"GEMINI.md"
|
||||
]
|
||||
}
|
||||
17
.ai-rulez/agents/kreuzberg-developer.md
Normal file
17
.ai-rulez/agents/kreuzberg-developer.md
Normal file
@@ -0,0 +1,17 @@
|
||||
---
|
||||
name: kreuzberg-developer
|
||||
description: General kreuzberg development guidance and cross-cutting concerns
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on kreuzberg:
|
||||
|
||||
1. Rust core is the single source of truth — all business logic in crates/kreuzberg/src/
|
||||
2. Bindings (Python, TypeScript, Ruby, PHP, etc.) are thin wrappers — never duplicate core logic
|
||||
3. Use `task` commands for all operations: `task build` is core-only; use `task build:bindings` or `task build:all` explicitly when bindings are needed
|
||||
4. Build FFI layer first if needed: `task build:bindings`
|
||||
5. For ONNX features: ensure ORT_LIB_LOCATION is set or use download-binaries feature
|
||||
6. All unsafe blocks require SAFETY comments. No .unwrap() in production code.
|
||||
7. Coverage targets: 95% for Rust core, 80% for bindings
|
||||
8. WASM builds are sync-only — implement SyncExtractor for WASM-compatible extractors
|
||||
9. Version in root Cargo.toml is the single source of truth for all binding packages
|
||||
71
.ai-rulez/config.toml
Normal file
71
.ai-rulez/config.toml
Normal file
@@ -0,0 +1,71 @@
|
||||
# AI-Rulez Configuration (migrated to V4 TOML format)
|
||||
# Documentation: https://github.com/Goldziher/ai-rulez
|
||||
|
||||
version = '4.0'
|
||||
name = 'Kreuzberg'
|
||||
description = 'Rust document intelligence library with active Python, TypeScript/Node, Ruby, PHP, Go, Java, C#, Elixir, R, WebAssembly, Dart, Kotlin Android, Swift, Zig, and C FFI bindings'
|
||||
gitignore = true
|
||||
presets = ['claude', 'copilot', 'cursor', 'antigravity', 'codex']
|
||||
builtins = [
|
||||
'rust',
|
||||
'python',
|
||||
'go',
|
||||
'java',
|
||||
'ruby',
|
||||
'php',
|
||||
'csharp',
|
||||
'elixir',
|
||||
'r',
|
||||
'wasm',
|
||||
'pyo3',
|
||||
'napi-rs',
|
||||
'magnus',
|
||||
'ext-php-rs',
|
||||
'rustler',
|
||||
'cgo',
|
||||
'extendr',
|
||||
'default-commands',
|
||||
]
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-core'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/core'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-languages'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/languages'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-cicd'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/cicd'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-infrastructure'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/infrastructure'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-e2e-generator'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/e2e-generator'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[installed_skills]]
|
||||
name = 'alef'
|
||||
source = 'https://github.com/kreuzberg-dev/alef.git'
|
||||
|
||||
[[mcp_servers]]
|
||||
name = 'playwright'
|
||||
description = 'Playwright browser automation for E2E testing and docs verification'
|
||||
command = 'npx'
|
||||
args = ['-y', '@playwright/mcp@latest']
|
||||
|
||||
[defaults]
|
||||
effort = 'medium'
|
||||
65
.ai-rulez/context/config-loading-precedence.md
Normal file
65
.ai-rulez/context/config-loading-precedence.md
Normal file
@@ -0,0 +1,65 @@
|
||||
---
|
||||
summary: Configuration loading precedence for CLI and server modes
|
||||
---
|
||||
|
||||
# Configuration Loading & Precedence
|
||||
|
||||
## CLI Mode Precedence (highest to lowest)
|
||||
|
||||
1. Individual CLI flags (`--ocr`, `--output-format`, `--chunk`)
|
||||
2. Inline JSON config (`--config-json` or `--config-json-base64`)
|
||||
3. Config file (`--config path.toml`)
|
||||
4. Auto-discovered config (`kreuzberg.{toml,yaml,json}` in cwd/parents)
|
||||
5. Default values
|
||||
|
||||
## Server/MCP Mode Precedence (highest to lowest)
|
||||
|
||||
1. CLI arguments (`--host`, `--port`)
|
||||
2. Environment variables (`KREUZBERG_HOST`, `KREUZBERG_PORT`)
|
||||
3. Config file `[server]` section
|
||||
4. Defaults (`127.0.0.1:8000`)
|
||||
|
||||
## Config File Discovery
|
||||
|
||||
Searches current directory and parents for `kreuzberg.toml`, `kreuzberg.yaml`, or `kreuzberg.json`. Stops at first match.
|
||||
|
||||
## Inline JSON Config
|
||||
|
||||
Field-level merge (not whole-object replacement):
|
||||
|
||||
```rust
|
||||
fn merge_json_into_config(base: &ExtractionConfig, json: Value) -> Result<ExtractionConfig> {
|
||||
let mut config_json = serde_json::to_value(base)?;
|
||||
// Merge fields from json into config_json
|
||||
Ok(serde_json::from_value(config_json)?)
|
||||
}
|
||||
```
|
||||
|
||||
Use `--config-json-base64` to pass the JSON base64-encoded and avoid shell-escaping problems.
|
||||
|
||||
## Config File Formats
|
||||
|
||||
**TOML** (`kreuzberg.toml`):
|
||||
|
||||
```toml
|
||||
use_cache = true
|
||||
[ocr]
|
||||
backend = "tesseract"
|
||||
languages = ["eng", "deu"]
|
||||
[security_limits]
|
||||
max_archive_size = 524288000
|
||||
```
|
||||
|
||||
**YAML** and **JSON** follow equivalent structure.
|
||||
|
||||
## CLI Flag Overrides
|
||||
|
||||
In `commands.rs`: `apply_extraction_overrides()` applies individual flags on top of merged config.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. CLI flags always win over config file
|
||||
2. JSON merge is field-level, not whole-object
|
||||
3. Auto-discovery stops at first config file found
|
||||
4. Use `--config-json-base64` for shell-safe JSON passing
|
||||
5. Server config uses `[server]` section + extraction config
|
||||
36
.ai-rulez/context/crate-structure.md
Normal file
36
.ai-rulez/context/crate-structure.md
Normal file
@@ -0,0 +1,36 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
# Crate Structure
|
||||
|
||||
Version source of truth: root `Cargo.toml` `[workspace.package] version`.
|
||||
|
||||
## Workspace crates (`crates/`)
|
||||
|
||||
- `kreuzberg` — core library: extraction engine, MIME detection, plugin system, OCR, chunking, embeddings, API/MCP server
|
||||
- `kreuzberg-cli` — CLI binary; thin wrapper over core with `cli` feature set
|
||||
- `kreuzberg-ffi` — C FFI layer (`#[no_mangle] extern "C"`); opaque handles, cbindgen headers; used by Go, Java, C# bindings
|
||||
- `kreuzberg-node` — NAPI-RS Node.js/TypeScript bindings
|
||||
- `kreuzberg-py` — PyO3 Python bindings
|
||||
- `kreuzberg-php` — ext-php-rs PHP bindings
|
||||
- `kreuzberg-wasm` — wasm-bindgen WASM bindings; uses `wasm-target` feature set
|
||||
- `kreuzberg-paddle-ocr` — PaddleOCR via ONNX Runtime; not available on WASM or Windows
|
||||
- `kreuzberg-tesseract` — Rust bindings for Tesseract OCR
|
||||
|
||||
## Out-of-workspace bindings (`packages/`)
|
||||
|
||||
- `packages/python/` — PyPI (maturin + PyO3)
|
||||
- `packages/typescript/` — npm type declarations
|
||||
- `packages/ruby/` — RubyGems (Magnus); native ext compiled by `rake`
|
||||
- `packages/php/` — Composer (ext-php-rs)
|
||||
- `packages/go/v5/` — Go module; cgo over kreuzberg-ffi
|
||||
- `packages/java/` — Maven; Foreign Function & Memory API over kreuzberg-ffi
|
||||
- `packages/csharp/` — NuGet; P/Invoke over kreuzberg-ffi
|
||||
- `packages/elixir/` — Hex; Rustler NIF (workspace member at `packages/elixir/native/kreuzberg_rustler`)
|
||||
- `packages/r/` — CRAN; extendr (excluded from workspace)
|
||||
|
||||
## Tools (`tools/`)
|
||||
|
||||
- `tools/e2e-generator` — reads JSON fixtures, generates runnable test suites per language into `e2e/`
|
||||
- `tools/benchmark-harness` — criterion-based benchmark runner
|
||||
56
.ai-rulez/context/mime-detection-routing.md
Normal file
56
.ai-rulez/context/mime-detection-routing.md
Normal file
@@ -0,0 +1,56 @@
|
||||
---
|
||||
summary: MIME type detection and extractor routing logic
|
||||
---
|
||||
|
||||
# MIME Detection & Routing
|
||||
|
||||
## Detection Flow
|
||||
|
||||
```text
|
||||
Extension -> EXT_TO_MIME map -> validate -> Registry lookup -> Extractor
|
||||
```
|
||||
|
||||
## Key Functions
|
||||
|
||||
| Function | Location | Purpose |
|
||||
| ------------------------------------ | -------------- | --------------------------------------- |
|
||||
| `detect_mime_type(path, inspect)` | `core/mime.rs` | Extension + optional content inspection |
|
||||
| `detect_mime_type_from_bytes(bytes)` | `core/mime.rs` | Magic number detection (infer crate) |
|
||||
| `validate_mime_type(mime)` | `core/mime.rs` | Check if any extractor supports it |
|
||||
|
||||
## Extension Mapping
|
||||
|
||||
118+ extensions mapped in `EXT_TO_MIME` (`core/mime.rs`). Case-insensitive.
|
||||
|
||||
Key mappings: `.pdf` -> `application/pdf`, `.docx` -> `application/vnd.openxmlformats-officedocument.wordprocessingml.document`, `.xlsx` -> spreadsheet variant, `.png`/`.jpg` -> `image/*`
|
||||
|
||||
## Registry Selection
|
||||
|
||||
```rust
|
||||
// In core/extractor/bytes.rs
|
||||
fn select_extractor_for_mime(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
||||
let registry = get_document_extractor_registry();
|
||||
let registry_guard = registry.read()?;
|
||||
registry_guard.get_for_mime_type(mime_type)
|
||||
.ok_or_else(|| KreuzbergError::UnsupportedFormat(mime_type.into()))
|
||||
}
|
||||
```
|
||||
|
||||
Selects highest-priority extractor registered for that MIME type.
|
||||
|
||||
## Adding New MIME Types
|
||||
|
||||
1. Add extension mapping: `m.insert("ext", "application/x-new");` in `core/mime.rs`
|
||||
2. Implement `DocumentExtractor` with `supported_mime_types()` returning the new MIME type
|
||||
3. Register in `register_default_extractors()`
|
||||
|
||||
## Wildcard Support
|
||||
|
||||
Extractors can register for MIME type families: `"image/*"` matches `image/png`, `image/jpeg`, etc.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. Always `validate_mime_type()` before extraction
|
||||
2. Extension mapping is case-insensitive
|
||||
3. Content inspection (infer crate) is the fallback for extension-less files
|
||||
4. Registry validation is final authority on supported types
|
||||
78
.ai-rulez/context/wasm-constraints.md
Normal file
78
.ai-rulez/context/wasm-constraints.md
Normal file
@@ -0,0 +1,78 @@
|
||||
---
|
||||
summary: WASM build constraints and patterns for kreuzberg-wasm crate
|
||||
---
|
||||
|
||||
# WASM Build Constraints
|
||||
|
||||
## Overview
|
||||
|
||||
WASM target in `crates/kreuzberg-wasm/`. Uses wasm-bindgen with sync-only internal APIs.
|
||||
|
||||
## Feature Flags
|
||||
|
||||
```toml
|
||||
[features]
|
||||
wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
|
||||
wasm-threads = ["dep:wasm-bindgen-rayon"] # Optional
|
||||
```
|
||||
|
||||
## Critical Constraints
|
||||
|
||||
### 1. No Tokio Runtime
|
||||
|
||||
All operations must be synchronous internally. Use `#[cfg(not(feature = "tokio-runtime"))]` paths.
|
||||
|
||||
### 2. SyncExtractor Required
|
||||
|
||||
Every WASM-compatible extractor MUST implement `SyncExtractor`:
|
||||
|
||||
```rust
|
||||
impl SyncExtractor for MyExtractor {
|
||||
fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
||||
-> Result<ExtractionResult> { /* sync implementation */ }
|
||||
}
|
||||
|
||||
impl DocumentExtractor for MyExtractor {
|
||||
fn as_sync_extractor(&self) -> Option<&dyn SyncExtractor> {
|
||||
Some(self) // MUST return Some for WASM
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. HTML Size Limit
|
||||
|
||||
```rust
|
||||
const MAX_HTML_SIZE: usize = 2 * 1024 * 1024; // 2MB - stack constraint
|
||||
```
|
||||
|
||||
## Build Config
|
||||
|
||||
```toml
|
||||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
[profile.release.package.kreuzberg-wasm]
|
||||
opt-level = "z" # Size optimization
|
||||
codegen-units = 1
|
||||
```
|
||||
|
||||
## API Pattern
|
||||
|
||||
```rust
|
||||
#[wasm_bindgen]
|
||||
pub async fn extract_from_bytes(content: Vec<u8>, config: JsValue) -> Result<JsValue, JsValue> {
|
||||
let config: ExtractionConfig = serde_wasm_bindgen::from_value(config)?;
|
||||
let result = extract_bytes_sync(&content, mime_type, &config)?;
|
||||
Ok(serde_wasm_bindgen::to_value(&result)?)
|
||||
}
|
||||
```
|
||||
|
||||
Functions can be `async` for JS compatibility, but internal extraction is sync.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **No tokio** -- all operations synchronous
|
||||
2. **Implement SyncExtractor** for all WASM-compatible extractors
|
||||
3. **HTML limited to 2MB** due to stack constraints
|
||||
4. **Size optimization** via `opt-level = "z"`
|
||||
5. **Feature gate** with `#[cfg(target_arch = "wasm32")]`
|
||||
12
.ai-rulez/domains/document-extraction/DOMAIN.md
Normal file
12
.ai-rulez/domains/document-extraction/DOMAIN.md
Normal file
@@ -0,0 +1,12 @@
|
||||
---
|
||||
description: Document extraction pipeline architecture
|
||||
---
|
||||
|
||||
- Pipeline: file input → MIME detection (magic bytes + extension) → extractor routing → extraction → post-processing → ExtractionResult
|
||||
- Extractors are plugins implementing the `DocumentExtractor` trait: extract(&self, source: &ExtractionSource) → ExtractionResult
|
||||
- Fallback chains: if primary extractor fails, try next in priority order (e.g., native PDF → Tesseract OCR → error)
|
||||
- Cache-first: check extraction cache before running extractors, cache results keyed by content hash (file bytes + extraction config)
|
||||
- ExtractionResult contains: text content, metadata (page count, language, confidence), optional structured data (tables, images)
|
||||
- Async-first: all native extraction paths are async; use spawn_blocking for CPU-bound work (OCR, image processing). WASM builds are the exception: they are sync-only and go through SyncExtractor
|
||||
- Memory limits: streaming for large files, configurable max file size, depth limits for nested archives
|
||||
- Format coverage: 91+ formats — PDF, DOCX, XLSX, PPTX, HTML, images, email (EML/MSG), archives, plain text
|
||||
@@ -0,0 +1,16 @@
|
||||
---
|
||||
name: extraction-engineer
|
||||
description: Document extraction pipeline development and maintenance
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on document extraction code:
|
||||
|
||||
1. Key source paths: crates/kreuzberg/src/core/ (extractor.rs, mime.rs, config.rs), crates/kreuzberg/src/extraction/
|
||||
2. The extraction pipeline: Input -> Cache Check -> MIME Detection -> Format Conversion -> Extractor Selection (priority-based) -> Extraction -> Fallback Chain -> Post-Processing -> Caching -> Output
|
||||
3. For MIME detection: use EXT_TO_MIME map + magic bytes fallback via infer crate. Always validate_mime_type() before extraction.
|
||||
4. For caching: keys based on content hash, invalidate on config changes
|
||||
5. For errors: implement fallback chains (try next-priority extractor), preserve partial results, return structured error info
|
||||
6. For new formats: add to EXT_TO_MIME, implement DocumentExtractor trait, register in register_default_extractors()
|
||||
7. Always use SecurityLimits validators for user content (ZipBombValidator, DepthValidator, StringGrowthValidator)
|
||||
8. Run `task test` after changes. Target 95% coverage on core extraction code.
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Follow semantic versioning — breaking changes require major version bump
|
||||
- Document all public API changes in CHANGELOG.md
|
||||
- Maintain backward compatibility for at least one minor version before removing deprecated APIs
|
||||
- All public types must be FFI-friendly or have FFI-compatible equivalents
|
||||
- Version in the root Cargo.toml (`[workspace.package] version`) is the single source of truth for all binding packages
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- All extraction paths must be fully async using tokio (exception: WASM builds have no tokio runtime and are sync-only via `SyncExtractor`)
|
||||
- Never block the async runtime — use spawn_blocking for CPU-intensive work
|
||||
- All public types must be Send + Sync
|
||||
- Use tokio::select! for timeout handling on extraction operations
|
||||
- Cross-platform: test on Linux (amd64, arm64) and macOS at minimum
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Cache keys: content-hash based (hash of file bytes + config), not path-based
|
||||
- Invalidate cache when extraction config changes (output format, OCR settings, etc.)
|
||||
- Check cache before any extraction — cache hits should skip all processing
|
||||
- Concurrent batch processing: use configurable worker pool, default to CPU count
|
||||
- Stream large files instead of loading into memory — use AsyncRead where possible
|
||||
- Monitor cache hit rates — target >80% for repeated extractions
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- 95% test coverage on core extraction code, 80% on bindings
|
||||
- Test all format categories: text, office, PDF, images, archives, markup
|
||||
- Test corrupted/malformed documents — extraction must fail gracefully, never panic
|
||||
- Benchmark extraction speeds per format — track regressions in CI
|
||||
- Test both success and error paths for every extractor
|
||||
- Use property-based testing for parsers with wide input ranges
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- Always use `SecurityLimits` to cap archive size, compression ratio, file count, and nesting depth for user content. Use `ZipBombValidator` for archive extraction.
|
||||
- Validate MIME type before extraction — never trust file extensions alone
|
||||
- Implement fallback chains: if primary extractor fails, try next-priority extractor
|
||||
- Preserve partial results on failure — return what was extracted with error context
|
||||
- All errors must include: operation name, input description, root cause, and suggestion
|
||||
- Never expose internal file paths or system details in error messages returned to users
|
||||
13
.ai-rulez/domains/ocr-integration/DOMAIN.md
Normal file
13
.ai-rulez/domains/ocr-integration/DOMAIN.md
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
description: OCR backend integration and image processing
|
||||
---
|
||||
|
||||
- Multiple backends: Tesseract (C FFI via leptonica/tesseract-sys), PaddleOCR (ONNX Runtime), Python backends (EasyOCR, Surya) via FFI
|
||||
- Backend selection: priority-based with fallback — Tesseract default, PaddleOCR for CJK, Python backends as fallback
|
||||
- Image preprocessing: deskew, binarization, noise removal, contrast enhancement — applied before OCR
|
||||
- PSM modes: configurable page segmentation (single block, single line, sparse text) per use case
|
||||
- Table detection: identify table regions → cell extraction → row/column reconstruction → Markdown table output
|
||||
- hOCR: parse Tesseract hOCR output for word-level bounding boxes, confidence scores, reading order
|
||||
- Language management: auto-detect document language, load appropriate Tesseract traineddata, support multi-language documents
|
||||
- Caching: cache OCR results by image hash + backend + language + PSM mode
|
||||
- Confidence tracking: per-word and per-page confidence scores, flag low-confidence regions for review
|
||||
16
.ai-rulez/domains/ocr-integration/agents/ocr-engineer.md
Normal file
16
.ai-rulez/domains/ocr-integration/agents/ocr-engineer.md
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
name: ocr-engineer
|
||||
description: OCR pipeline development, backend integration, and table reconstruction
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on OCR code:
|
||||
|
||||
1. Key source paths: crates/kreuzberg/src/ocr/ (processor.rs, tesseract_backend.rs, hocr.rs, cache.rs, language_registry.rs, table/)
|
||||
2. The OCR pipeline: Image Detection -> Preprocessing (denoise, deskew, binarize) -> Backend Selection -> OCR Execution -> hOCR Parsing -> Table Reconstruction -> Caching -> Return
|
||||
3. Backends: Tesseract (default, native C FFI via leptess), PaddleOCR (ONNX via ort), EasyOCR (Python via PyO3)
|
||||
4. For Python backends: use tokio::task::spawn_blocking, minimize GIL hold time with py.allow_threads(), cache Python data in Rust fields
|
||||
5. For table detection: detect via line/cell boundary detection, validate grid structure, OCR each cell, output as markdown
|
||||
6. For language management: validate against LanguageRegistry, check tessdata availability
|
||||
7. Cache OCR results with key = hash(image_bytes + language + config)
|
||||
8. hOCR parsing: use the hocr module to extract word-level bounding boxes and confidence scores
|
||||
@@ -0,0 +1,11 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- Pluggable backend architecture: all backends implement the OcrBackend trait
|
||||
- Backend independence: switching backends must not require API changes
|
||||
- Tesseract is the default backend (native C FFI via leptess)
|
||||
- Python backends (EasyOCR, Surya): use tokio::task::spawn_blocking, release GIL for Rust work
|
||||
- Graceful degradation: if preferred backend unavailable, fall back to next available
|
||||
- All backends must return structured results with confidence scores
|
||||
- Document installation requirements and troubleshooting for each backend
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
priority: medium
|
||||
---
|
||||
|
||||
- Validate language packs exist before OCR execution — fail fast with helpful message
|
||||
- Support ISO 639 language codes, map to backend-specific formats
|
||||
- Configuration cascade: CLI args > environment > config file > defaults
|
||||
- Provide troubleshooting guides for common issues (missing tessdata, backend not found)
|
||||
- Language pack installation: document per-platform instructions
|
||||
10
.ai-rulez/domains/ocr-integration/rules/ocr-performance.md
Normal file
10
.ai-rulez/domains/ocr-integration/rules/ocr-performance.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Cache OCR results: key = hash(image_bytes + language + config)
|
||||
- Invalidate cache when OCR config changes (backend, language, PSM mode)
|
||||
- Batch processing: process multiple images concurrently with configurable parallelism
|
||||
- Resource management: limit concurrent OCR operations to avoid memory exhaustion
|
||||
- Performance targets: <2s for single page, <10s for 10-page document
|
||||
- Monitor and log OCR processing times for regression detection
|
||||
10
.ai-rulez/domains/ocr-integration/rules/ocr-quality.md
Normal file
10
.ai-rulez/domains/ocr-integration/rules/ocr-quality.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Track confidence scores on all OCR results — expose in API
|
||||
- Image preprocessing (denoise, deskew, binarize) should improve accuracy by 10-30%
|
||||
- PSM mode selection: auto-detect layout, allow user override (single block, single line, sparse text, etc.)
|
||||
- Language detection: validate requested languages are available, provide install hints if not
|
||||
- Multi-language support: allow multiple languages per OCR request
|
||||
- Test OCR accuracy against ground-truth documents in CI
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- hOCR parsing: extract word-level bounding boxes, confidence scores, and text content
|
||||
- Preserve spatial relationships from hOCR output for layout reconstruction
|
||||
- Table detection: use cell boundary detection (line detection + intersection analysis)
|
||||
- Validate grid structure before treating detected regions as tables
|
||||
- OCR each cell individually for better accuracy
|
||||
- Convert tables to markdown format with proper column alignment
|
||||
13
.ai-rulez/domains/plugin-system/DOMAIN.md
Normal file
13
.ai-rulez/domains/plugin-system/DOMAIN.md
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
description: Plugin trait system and Python FFI integration
|
||||
---
|
||||
|
||||
- Core traits: DocumentExtractor, OcrBackend, PostProcessor, Validator — all extend the base Plugin trait, each with async extract/process methods returning Result
|
||||
- Discovery: static registration (Rust plugins compiled in) + dynamic discovery (Python plugins via PyO3 FFI)
|
||||
- Priority selection: plugins declare priority per MIME type, registry selects highest-priority match, fallback to next
|
||||
- Registry: PluginRegistry holds all discovered plugins, provides lookup by MIME type, supports hot-reload for Python plugins
|
||||
- Python FFI: Python plugins implement a Python class matching the trait interface, called via PyO3 with GIL management
|
||||
- GIL management: acquire GIL only for Python calls, release immediately after, use py.allow_threads() for Rust-side work
|
||||
- Plugin lifecycle: init → register → validate → ready. Plugins validate their dependencies (e.g., Tesseract binary, Python packages) at startup
|
||||
- Error handling: plugin errors are wrapped in PluginError with source plugin name, converted to ExtractionError at boundary
|
||||
- Testing: test plugins with real files (not mocks), test fallback chains, test Python plugin loading/unloading
|
||||
16
.ai-rulez/domains/plugin-system/agents/plugin-engineer.md
Normal file
16
.ai-rulez/domains/plugin-system/agents/plugin-engineer.md
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
name: plugin-engineer
|
||||
description: Plugin system architecture, registry management, and Python FFI
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on the plugin system:
|
||||
|
||||
1. Key source paths: crates/kreuzberg/src/plugins/ (mod.rs, extractor.rs, ocr.rs, postprocessor.rs, validator.rs, registry.rs), crates/kreuzberg-py/src/plugins.rs
|
||||
2. Plugin types: DocumentExtractor, OcrBackend, PostProcessor, Validator — all extend base Plugin trait (Send + Sync required)
|
||||
3. Priority system: 0-255, default 50, custom override > 50, fallback < 50. Registry selects highest priority for MIME type.
|
||||
4. Registries use Arc<RwLock<>> with MIME type indexing for O(log n) lookup
|
||||
5. Python plugins: validate protocol compliance, use py.allow_threads() for expensive Rust ops, tokio::task::spawn_blocking for async calls
|
||||
6. For new plugin types: define trait extending Plugin, create typed registry, add registration functions, implement priority-based selection
|
||||
7. GIL optimization: cache frequently-accessed Python data in Rust fields, measure GIL overhead
|
||||
8. All plugins must handle errors gracefully — return Result, never panic
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: medium
|
||||
---
|
||||
|
||||
- API stability: plugin interfaces are versioned, breaking changes require major version bump
|
||||
- Plugin discovery: support both static (compile-time) and dynamic (runtime) registration
|
||||
- Plugin validation: check capabilities, supported formats, and version compatibility before registration
|
||||
- Plugin chaining: post-processors can be composed in sequence
|
||||
- Configuration: plugins accept typed configuration, validated at registration time
|
||||
- Documentation: every plugin type must have a development guide with examples
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- All plugins must implement the base Plugin trait: Send + Sync + 'static required
|
||||
- Plugin types: DocumentExtractor, OcrBackend, PostProcessor, Validator
|
||||
- Async execution: use async trait methods for non-blocking operations
|
||||
- Lifecycle: init() -> process() -> cleanup(). Init must validate all requirements.
|
||||
- Never panic in plugin code — all errors must be returned as Result
|
||||
- Consistent result format: all extractors return ExtractionResult with text, metadata, and confidence
|
||||
@@ -0,0 +1,12 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- Separate typed registry per plugin type (ExtractorRegistry, OcrRegistry, etc.)
|
||||
- Thread safety: Arc<RwLock<>> for all registries
|
||||
- Priority system: 0-255, default 50, custom > 50, fallback < 50
|
||||
- Selection: highest priority plugin matching the MIME type wins
|
||||
- MIME type indexing for O(log n) lookup
|
||||
- Conflict resolution: if equal priority, prefer Rust-native over FFI plugins
|
||||
- Dynamic registration: plugins can be added/removed at runtime
|
||||
- Validate plugin before registration (check trait compliance, supported formats)
|
||||
10
.ai-rulez/domains/plugin-system/rules/plugin-testing.md
Normal file
10
.ai-rulez/domains/plugin-system/rules/plugin-testing.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Mock plugin testing: create test doubles for unit tests
|
||||
- Real plugin testing: integration tests with actual backends
|
||||
- Thread safety tests: run concurrent plugin operations to detect race conditions
|
||||
- Performance baselines: measure and track plugin overhead vs direct calls
|
||||
- Test all error paths: invalid input, backend failure, timeout, resource exhaustion
|
||||
- Test plugin lifecycle: register, use, unregister, verify cleanup
|
||||
11
.ai-rulez/domains/plugin-system/rules/python-ffi-plugins.md
Normal file
11
.ai-rulez/domains/plugin-system/rules/python-ffi-plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- GIL management: use py.allow_threads() for expensive Rust operations
|
||||
- Cache frequently-accessed Python data in Rust fields to minimize GIL acquisitions
|
||||
- Use tokio::task::spawn_blocking for async calls to Python backends
|
||||
- Python exception translation: convert Python exceptions to Rust errors with full context
|
||||
- Data type mapping: Python str <-> Rust String, Python bytes <-> Rust Vec<u8>, Python dict <-> Rust HashMap
|
||||
- Validate Python plugin protocol compliance on registration
|
||||
- Target GIL overhead: 5-55us per acquisition
|
||||
45
.ai-rulez/ground-truth-generation.md
Normal file
45
.ai-rulez/ground-truth-generation.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Ground Truth Generation
|
||||
|
||||
## Pandoc Commands
|
||||
|
||||
```bash
|
||||
pandoc <source_file> -t gfm --wrap=none -o <gt_file.md>
|
||||
pandoc <source_file> -t plain --wrap=none -o <gt_file.txt>
|
||||
```
|
||||
|
||||
## Artifact Removal
|
||||
|
||||
```bash
|
||||
sed -i '' 's/ {#[^}]*}//g' "$file"         # Remove {#id} attributes (BSD/macOS sed syntax; with GNU sed use `sed -i` without the '' argument)
|
||||
sed -i '' 's/ {[^}]*}//g' "$file" # Remove {.class} attributes
|
||||
sed -i '' '/^:::/d' "$file" # Remove fenced div markers
|
||||
sed -i '' 's/\\\$/$/g' "$file" # Unescape dollar signs
|
||||
sed -i '' "s/\\\\'/'/g" "$file" # Unescape quotes
|
||||
```
|
||||
|
||||
## Cleanup Rules
|
||||
|
||||
1. Convert ALL HTML to markdown equivalents where possible
|
||||
2. For colspan/rowspan, put content in first cell, leave others empty
|
||||
3. Remove `<!-- -->` comments
|
||||
4. Strip `<u>`, `<sup>`, `<sub>` tags (keep text content)
|
||||
5. Convert `<img>` to `![alt](src)`
|
||||
6. Collapse 3+ consecutive blank lines to 2
|
||||
7. Never use our own extractor output as GT
|
||||
|
||||
## Fixture JSON Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"document": "relative/path/to/source.ext",
|
||||
"file_type": "docx",
|
||||
"file_size": 12345,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": { "description": "...", "source": "pandoc-generated" },
|
||||
"ground_truth": {
|
||||
"text_file": "relative/path/to/gt.txt",
|
||||
"markdown_file": "relative/path/to/gt.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
```
|
||||
34
.ai-rulez/rules/alef-generated-bindings.md
Normal file
34
.ai-rulez/rules/alef-generated-bindings.md
Normal file
@@ -0,0 +1,34 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Alef-Generated Bindings
|
||||
|
||||
Files in `packages/*/` and binding crates are generated by Alef — DO NOT manually edit.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Check `alef.toml` before editing anything in `packages/*/` or binding crates
|
||||
2. Modify Rust source files or `alef.toml` itself
|
||||
3. Run `task alef:generate` to regenerate all Alef-managed output without formatting (`alef all --clean --format=false`)
|
||||
4. Run `task alef:format` explicitly if Alef post-generation formatting is needed
|
||||
5. Run `task e2e:test` or `task e2e:all` to verify
|
||||
6. Commit Rust source + `alef.toml` + regenerated bindings atomically
|
||||
|
||||
## Rename Mappings (from `alef.toml`)
|
||||
|
||||
- **Go**: `DocumentExtractor` → `Extractor`
|
||||
- All other languages: no renames
|
||||
|
||||
## Freshness Check
|
||||
|
||||
`task alef:generate && git diff --exit-code packages/ crates/kreuzberg-node/ crates/kreuzberg-wasm/ crates/kreuzberg-ffi/`
|
||||
|
||||
## Key `alef.toml` Sections
|
||||
|
||||
- `[crate]` — Rust source files parsed for type/function extraction
|
||||
- `[languages.*]` — per-language output path, module name, rename mappings
|
||||
- `[e2e]` — e2e test generation: fixtures dir, output dir, per-language call overrides
|
||||
- `[readme]` — README template generation per language package
|
||||
|
||||
Canonical e2e tasks are `task e2e:generate`, `task e2e:build`, `task e2e:test`, and `task e2e:all`. Do not add legacy aliases.
|
||||
65
.ai-rulez/rules/feature-flag-policy.md
Normal file
65
.ai-rulez/rules/feature-flag-policy.md
Normal file
@@ -0,0 +1,65 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
# Feature Flag Policy
|
||||
|
||||
All features are defined in `crates/kreuzberg/Cargo.toml`.
|
||||
|
||||
## ORT-Incompatible Targets (WASM, Android x86_64 emulator)
|
||||
|
||||
Only ORT-dependent paths are incompatible. The same paths block both WASM (no native ORT linkage at all) and the `x86_64-linux-android` emulator triple (no pyke prebuilt; `aarch64-linux-android` does ship a prebuilt and gets full ORT):
|
||||
|
||||
- `paddle-ocr` — ONNX Runtime + native C++ deps: not WASM-safe; no Android x86_64 prebuilt
|
||||
- `layout-detection` — depends on ONNX Runtime layout models: not WASM-safe; no Android x86_64 prebuilt
|
||||
- `embeddings` — depends on ONNX Runtime sentence-transformer models: not WASM-safe; no Android x86_64 prebuilt
|
||||
- `auto-rotate` — depends on ONNX Runtime orientation classifier: not WASM-safe; no Android x86_64 prebuilt
|
||||
|
||||
Pure-Rust **type-only** companion features expose the public config/result types for the above without pulling in ORT:
|
||||
|
||||
- `layout-types` — `LayoutDetectionConfig`, `TableModel`, `BBox`, `DetectionResult`, `LayoutClass`, `LayoutDetection`, `RecognizedTable`. `layout-detection` implies `layout-types`.
|
||||
- `auto-rotate-types` — `OrientationResult`. `auto-rotate` implies `auto-rotate-types`.
|
||||
- `embedding-presets` — `EmbeddingPreset` (already existed; pure-Rust preset metadata).
|
||||
|
||||
WASM/Android-safe variants:
|
||||
|
||||
- `ocr` (native) → `ocr-wasm` (uses `tesseract-wasm` + safe image deps) — Android keeps native `ocr`
|
||||
- `excel` (native) → `excel-wasm` (drops `tokio-runtime`) — Android keeps native `excel`
|
||||
- `tree-sitter` (native dlopen) → `tree-sitter-wasm` (statically-linked grammar pack) — Android keeps native `tree-sitter`
|
||||
- `liter-llm` — works on WASM via the upstream `wasm-http` feature; included in `no-ort-target`
|
||||
- `stopwords` — pure-Rust, included in `no-ort-target`
|
||||
- `keywords` — pure-Rust YAKE/RAKE, included in `no-ort-target`
|
||||
|
||||
The `no-ort-target` aggregate is the shared no-ORT base used by both `wasm-target` and `android-target`. `wasm-target = no-ort-target + excel-wasm + tree-sitter-wasm + ocr-wasm`. `android-target = no-ort-target + excel + tree-sitter + ocr + api + mcp`.
|
||||
|
||||
## Experimental (NOT in `full`)
|
||||
|
||||
- `pdf-oxide` — pure-Rust PDF text extraction; opt-in only, excluded from both `full` and `formats`
|
||||
|
||||
## ORT Variants (Mutually Exclusive)
|
||||
|
||||
- `ort-bundled` — downloads official Microsoft ORT binaries; default when OCR/ML features active
|
||||
- `ort-dynamic` — load ORT from system; only when system ORT is guaranteed present
|
||||
|
||||
## Platform-Conditional
|
||||
|
||||
- `kreuzberg-paddle-ocr`, `hf-hub`, `pprof` — excluded on `wasm32`
|
||||
- `ureq`: `rustls` on non-Windows; `native-tls` on Windows
|
||||
- `kreuzberg-ffi` and `kreuzberg-dart` cargo dependencies are target-conditional: `cfg(all(target_os = "android", target_arch = "x86_64"))` selects `android-target`; all other targets (including arm64 Android phones) get the full ORT-enabled feature set.
|
||||
|
||||
## Aggregate Sets
|
||||
|
||||
| Feature | Description |
|
||||
| ---------------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `formats` | All document formats + api/mcp/otel/chunking; no OCR, no ML |
|
||||
| `full` | `formats` + ocr + paddle-ocr + layout + embeddings + tree-sitter + liter-llm; excludes `pdf-oxide` |
|
||||
| `no-ort-target` | Pure-Rust base: every capability that does not depend on ONNX Runtime |
|
||||
| `wasm-target` | `no-ort-target` + excel-wasm + tree-sitter-wasm + ocr-wasm |
|
||||
| `android-target` | `no-ort-target` + excel + tree-sitter + ocr + api + mcp (for x86_64-linux-android emulator) |
|
||||
|
||||
## Build Profiles
|
||||
|
||||
- `release` — LTO thin, codegen-units=1, strip
|
||||
- `profiling` — inherits release, retains debug info
|
||||
- `kreuzberg-wasm` override: `opt-level="z"` (size-optimized)
|
||||
- `sevenz-rust2`, `zip` override: `opt-level=2` (prevents SIGBUS on macOS ARM64)
|
||||
15
.ai-rulez/rules/typescript-conventions.md
Normal file
15
.ai-rulez/rules/typescript-conventions.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- `strict: true` + `noUncheckedIndexedAccess` in tsconfig, never `any` — use `unknown` with type guards.
|
||||
- ESM imports only, `const` over `let`, `as const` for literals, `interface` over `type` for objects.
|
||||
- `import type` for type-only imports to avoid runtime overhead. Discriminated unions for type-safe state.
|
||||
- Formatting/linting: `oxfmt` + `oxlint`. Type checking: `tsc --noEmit` in CI.
|
||||
- Testing: `vitest` (80%+ coverage). Runtime validation at system boundaries with `zod`.
|
||||
- Error handling: discriminated unions for expected errors, throw only for unexpected.
|
||||
- Package manager: `pnpm` with `pnpm-lock.yaml` committed, build: `tsup` or `esbuild`.
|
||||
- Monorepo: workspace protocol (`workspace:*`), shared tsconfig base, `pnpm-workspace.yaml`.
|
||||
- Node.js: `node:` prefix for core modules, `fetch` over `axios`.
|
||||
- Security: `pnpm audit` for dependency CVE scanning. Zero tolerance for critical/high vulnerabilities.
|
||||
- Anti-patterns: non-null assertions (`!`), type assertions (`as`), `enum` (use unions), `@ts-ignore`.
|
||||
212
.ai-rulez/skills/api-server-mcp/SKILL.md
Normal file
212
.ai-rulez/skills/api-server-mcp/SKILL.md
Normal file
@@ -0,0 +1,212 @@
|
||||
---
|
||||
description: "REST API server and MCP protocol integration"
|
||||
name: api-server-mcp
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# API Server & MCP Protocol
|
||||
|
||||
**Axum server design for document extraction endpoints, middleware, async processing, and Model Context Protocol integration for AI agents**
|
||||
|
||||
## Kreuzberg API Architecture
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/`, `crates/kreuzberg-cli/`
|
||||
|
||||
Kreuzberg provides a dual REST API + MCP server built with Axum + Tokio.
|
||||
|
||||
```text
|
||||
Request Flow:
|
||||
HTTP Client / AI Agent (Claude)
|
||||
|
|
||||
[Transport Layer]
|
||||
├── REST API (Axum HTTP)
|
||||
└── MCP Protocol (HTTP or Stdio)
|
||||
|
|
||||
[Middleware Layer]
|
||||
├── CORS, Request Logging (TraceLayer)
|
||||
├── Request/Response size limits
|
||||
└── Rate limiting (optional)
|
||||
|
|
||||
[Router]
|
||||
├── REST Endpoints
|
||||
│ ├── POST /extract - File upload extraction
|
||||
│ ├── POST /extract-url - URL-based extraction
|
||||
│ ├── GET /formats - List supported formats
|
||||
│ ├── GET /health - Server health check
|
||||
│ ├── POST /batch - Batch document processing
|
||||
│ ├── GET /cache/stats - Cache statistics
|
||||
│ └── DELETE /cache - Clear extraction cache
|
||||
├── MCP Endpoints
|
||||
│ ├── POST /mcp/tools - List available tools
|
||||
│ ├── POST /mcp/tools/call - Call a tool
|
||||
│ ├── GET /mcp/resources - List resources
|
||||
│ ├── GET /mcp/resources/:uri - Read resource
|
||||
│ ├── GET /mcp/prompts - List prompts
|
||||
│ └── GET /mcp/prompts/:name - Get prompt
|
||||
|
|
||||
[Handler / Tool Layer]
|
||||
├── extract_handler / extract_file tool
|
||||
├── batch_handler / batch_extract tool
|
||||
├── health_handler / get_capabilities tool
|
||||
└── format_handler
|
||||
|
|
||||
[Extraction Core]
|
||||
├── Format detection
|
||||
├── Extraction pipeline
|
||||
├── Post-processing (chunking, embeddings)
|
||||
└── Result formatting
|
||||
|
|
||||
JSON Response / MCP ToolResult
|
||||
```
|
||||
|
||||
## Server Setup & Configuration
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/server.rs`
|
||||
|
||||
Server initialization pattern: Create `ApiState` (holds `ExtractionConfig` + `ExtractionCache`), build Axum `Router` with all REST + MCP routes, apply middleware layers (body limits, CORS, tracing), serve via `tokio::net::TcpListener`.
|
||||
|
||||
Key middleware layers applied in order:
|
||||
|
||||
- `DefaultBodyLimit::max(100MB)` + `RequestBodyLimitLayer` -- configurable via env vars
|
||||
- `CorsLayer::permissive()` -- restrict in production via `CORS_ALLOWED_ORIGINS`
|
||||
- `TraceLayer::new_for_http()` -- request/response logging
|
||||
|
||||
## Core REST Handlers
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/handlers.rs`
|
||||
|
||||
| Handler | Method | Description |
|
||||
| --------------------- | ----------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `extract_handler` | POST /extract | Multipart upload: parse file + optional config JSON, check cache, call `extract_bytes()`, cache result |
|
||||
| `extract_url_handler` | POST /extract-url | Fetch URL via reqwest, extract bytes |
|
||||
| `batch_handler` | POST /batch | Parallel extraction with `Semaphore`-limited concurrency (default: CPU count) |
|
||||
| `health_handler` | GET /health | Report status, version, uptime, feature availability (OCR, embeddings), cache stats |
|
||||
| `formats_handler` | GET /formats | Return supported format categories (office, pdf, images, web, email, archives, academic) |
|
||||
| `cache_stats_handler` | GET /cache/stats | Hit/miss counts and hit rate |
|
||||
| `cache_clear_handler` | DELETE /cache | Clear LRU cache |
|
||||
|
||||
## Caching Strategy
|
||||
|
||||
**Location**: `crates/kreuzberg/src/cache/mod.rs`
|
||||
|
||||
LRU cache keyed by `SHA256(file_content + config)` so that a change in extraction config never returns a stale result; stores `Arc<ExtractionResult>`. Default 1000 entries. Thread-safe via `RwLock`. Tracks hit/miss counters with `AtomicU64` for stats endpoint.
|
||||
|
||||
## Error Handling
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/error.rs`
|
||||
|
||||
`ApiError` enum maps to HTTP status codes:
|
||||
|
||||
- `MissingFile` -> 400, `FileNotFound` -> 404
|
||||
- `OnnxRuntimeMissing` / `TesseractMissing` -> 503 (with remediation message)
|
||||
- `PayloadTooLarge` -> 413
|
||||
- `ExtractionFailed` / `InvalidConfig` / `UnsupportedFormat` -> 500
|
||||
|
||||
## MCP Server Implementation
|
||||
|
||||
**Location**: `crates/kreuzberg/src/mcp/server.rs`
|
||||
|
||||
The MCP server allows Claude and other AI agents to call Kreuzberg extraction functions through the Model Context Protocol.
|
||||
|
||||
### MCP Tools (Callable Functions)
|
||||
|
||||
Three tools are registered:
|
||||
|
||||
| Tool | Purpose | Required Params |
|
||||
| ------------------ | --------------------------------------------------------- | --------------- |
|
||||
| `extract_file` | Extract text/tables/metadata from documents (75+ formats) | `file_path` |
|
||||
| `batch_extract` | Extract from multiple documents in parallel | `file_paths[]` |
|
||||
| `get_capabilities` | List supported formats, features, backends | (none) |
|
||||
|
||||
**Tool registration pattern** (example: `extract_file`):
|
||||
|
||||
```rust
|
||||
// Define Tool with name, description, JSON Schema inputSchema
|
||||
// Register with server.register_tool(tool, handler_fn)
|
||||
// Handler: parse params -> build ExtractionConfig -> call extract_file() -> return ToolResult as JSON
|
||||
```
|
||||
|
||||
`extract_file` optional params: `format`, `extract_tables`, `extract_images`, `ocr_enabled`, `extract_metadata`, `chunking_preset`, `generate_embeddings`.
|
||||
|
||||
### MCP Resources (Static Knowledge)
|
||||
|
||||
Three resources provide static information to agents:
|
||||
|
||||
- `kreuzberg://formats` -- Supported format list as JSON
|
||||
- `kreuzberg://features` -- Cross-binding feature matrix (from `FEATURE_MATRIX.md`)
|
||||
- `kreuzberg://api-reference` -- Generated API documentation
|
||||
|
||||
### MCP Prompts (Agent Templates)
|
||||
|
||||
Two prompts guide agent extraction workflows:
|
||||
|
||||
- `extract_for_rag` -- Document type-specific RAG extraction guidance (research paper, contract, report). Recommends chunking preset and embedding config.
|
||||
- `batch_document_processing` -- Optimal concurrency, grouping, and error handling for batch workflows.
|
||||
|
||||
### MCP Transport Protocols
|
||||
|
||||
- **HTTP/REST**: MCP routes mounted alongside REST API on separate `/mcp/` prefix
|
||||
- **Stdio**: JSON-RPC 2.0 over stdin/stdout for local CLI integration (e.g., Claude Desktop)
|
||||
|
||||
### Integration with Claude Desktop
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"kreuzberg": {
|
||||
"command": "kreuzberg-mcp",
|
||||
"env": {
|
||||
"KREUZBERG_API_BASE": "http://localhost:8000",
|
||||
"KREUZBERG_MCP_TRANSPORT": "stdio"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### MCP Error Handling
|
||||
|
||||
`ToolError` variants: `FileNotFound`, `UnsupportedFormat`, `ExtractionFailed`, `OnnxRuntimeMissing`, `TesseractMissing`, `Timeout`. Each maps to an MCP `ToolResultError` with descriptive code and message.
|
||||
|
||||
## Environment Configuration
|
||||
|
||||
See `.env.example` for all configurable variables. Key categories:
|
||||
|
||||
- **Server**: `KREUZBERG_HOST`, `KREUZBERG_PORT`
|
||||
- **Size limits**: `KREUZBERG_MAX_REQUEST_BODY_BYTES` (default 100MB), `KREUZBERG_MAX_MULTIPART_FIELD_BYTES`
|
||||
- **Features**: `KREUZBERG_ENABLE_OCR`, `KREUZBERG_ENABLE_EMBEDDINGS`, `KREUZBERG_ENABLE_KEYWORDS`
|
||||
- **Cache**: `KREUZBERG_CACHE_ENABLED`, `KREUZBERG_CACHE_SIZE`
|
||||
- **CORS**: `CORS_ALLOWED_ORIGINS` (comma-separated)
|
||||
- **MCP**: `KREUZBERG_MCP_HOST`, `KREUZBERG_MCP_PORT`, `KREUZBERG_MCP_TRANSPORT` (stdio/http)
|
||||
- **Logging**: `RUST_LOG=kreuzberg=info,tower_http=debug`
|
||||
|
||||
## Critical Rules
|
||||
|
||||
### REST API Rules
|
||||
|
||||
1. **Always validate multipart file uploads** - Check MIME type, size, magic bytes
|
||||
2. **Timeout long-running extractions** - Set per-handler timeout (5 min default)
|
||||
3. **Stream large files** - Never buffer entire multi-GB file in memory
|
||||
4. **Cache aggressively** - Identical files should return from cache in <1ms
|
||||
5. **Parallel extraction is CPU-bound** - Limit workers to CPU count + 1
|
||||
6. **Error responses must be actionable** - Include error code and remediation suggestion
|
||||
7. **Health checks must verify features** - Report missing dependencies (ONNX, Tesseract)
|
||||
8. **Size limits are configurable** - Allow override via env var for large deployments
|
||||
9. **CORS is permissive by default** - Restrict in production via env var
|
||||
10. **Log all requests** - Track extraction metrics for observability
|
||||
|
||||
### MCP Rules
|
||||
|
||||
1. **All tools must have timeout** - Prevent hanging on large files (default 5 min)
|
||||
2. **Error responses must be detailed** - Include suggestions for missing dependencies
|
||||
3. **Feature gates must be checked** - Return helpful message if feature unavailable (embeddings, OCR)
|
||||
4. **Resources should be static** - Don't query external services in resource handlers
|
||||
5. **Prompts guide agents** - Provide clear examples and best practices
|
||||
6. **Batch tools must support cancellation** - Allow agent to stop long-running batch operations
|
||||
7. **Log all tool calls** - Track usage for analytics and debugging
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **extraction-pipeline-patterns** - Core extraction called by handlers and MCP tools
|
||||
- **chunking-embeddings** - Optional chunking/embedding parameters in extraction
|
||||
- **ocr-backend-management** - OCR engine selection and image preprocessing
|
||||
120
.ai-rulez/skills/chunking-embeddings/SKILL.md
Normal file
120
.ai-rulez/skills/chunking-embeddings/SKILL.md
Normal file
@@ -0,0 +1,120 @@
|
||||
---
|
||||
description: "Chunking, embeddings, and RAG pipeline integration"
|
||||
name: chunking-embeddings
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Chunking & Embeddings
|
||||
|
||||
**Text splitting strategies, embedding generation with FastEmbed, RAG pipeline integration**
|
||||
|
||||
## Chunking Architecture Overview
|
||||
|
||||
**Location**: `crates/kreuzberg/src/chunking/`, `crates/kreuzberg/src/embeddings.rs`
|
||||
|
||||
```text
|
||||
Extracted Text
|
||||
|
|
||||
[1. Normalization] -> Clean whitespace, remove control chars
|
||||
|
|
||||
[2. Chunk Strategy Selection] -> Fixed-size, semantic, syntax-aware, recursive
|
||||
|
|
||||
[3. Overlap Management] -> Control context window overlap
|
||||
|
|
||||
[4. Optional Embedding] -> Generate vectors with FastEmbed
|
||||
|
|
||||
Output: Vec<Chunk> with text, vectors, metadata
|
||||
```
|
||||
|
||||
## Chunking Strategies
|
||||
|
||||
**Location**: `crates/kreuzberg/src/chunking/mod.rs`
|
||||
|
||||
| Strategy | Pattern | Best For |
|
||||
| --------------------------------- | ------------------------------------------------------- | ------------------------------------------------------------------ |
|
||||
| **Fixed-Size** | Sliding window with configurable overlap | Uniform chunks for embedding models with fixed token limits |
|
||||
| **Semantic** | Split by sentences, merge/split by similarity threshold | Smart context preservation for LLM consumption and semantic search |
|
||||
| **Syntax-Aware** | Split by paragraph/section/heading/code-block structure | Preserving document structure (sections, code blocks) in RAG |
|
||||
| **Recursive** (LangChain pattern) | Try separators in order: `\n\n`, `\n`, `,` | Best general-purpose chunking; auto-finds optimal split points |
|
||||
|
||||
Key config fields per strategy (see struct definitions in `chunking/mod.rs`):
|
||||
|
||||
- Fixed-Size: `chunk_size`, `overlap`, `trim_whitespace`
|
||||
- Semantic: `target_chunk_size`, `min/max_chunk_size`, `semantic_threshold`, `use_sentence_boundaries`
|
||||
- Syntax-Aware: `chunk_by` (Paragraph/Section/Heading/Sentence/CodeBlock), `max_chunk_size`, `respect_code_blocks`
|
||||
- Recursive: `separators[]`, `chunk_size`, `overlap`
|
||||
|
||||
## Chunking Configuration Presets
|
||||
|
||||
**Location**: `crates/kreuzberg/src/chunking/mod.rs`
|
||||
|
||||
| Preset | Chunk Size | Overlap | Strategy | Use Case |
|
||||
| ------------ | ----------- | ------- | ---------- | ---------------------- |
|
||||
| **Balanced** | 512 tokens | 50 | Semantic | RAG sweet spot |
|
||||
| **Compact** | 256 tokens | 32 | Fixed-Size | Dense vectors |
|
||||
| **Extended** | 1024 tokens | 100 | Recursive | Full context |
|
||||
| **Minimal** | 128 tokens | 16 | (default) | Lightweight embeddings |
|
||||
|
||||
Usage: set `config.chunking.preset = Some("balanced")` in `ExtractionConfig`.
|
||||
|
||||
## Embedding Generation with FastEmbed
|
||||
|
||||
**Location**: `crates/kreuzberg/src/embeddings.rs`
|
||||
|
||||
### Model Selection
|
||||
|
||||
| Model | Dimensions | Notes |
|
||||
| ----------------------------------- | ---------- | -------------------------------- |
|
||||
| `BAAI/bge-small-en-v1.5` (default) | 384 | Fast, excellent for RAG |
|
||||
| `BAAI/bge-small-zh-v1.5` | 384 | Chinese optimized |
|
||||
| `BAAI/bge-base-en-v1.5` | 768 | Better quality, slower |
|
||||
| `jinaai/jina-embeddings-v2-base-en` | 768 | Long context (up to 8192 tokens) |
|
||||
| `Custom(path)` | varies | Custom ONNX model path |
|
||||
|
||||
### Embedding Pattern
|
||||
|
||||
`TextEmbeddingManager` provides singleton-cached models per config. Pattern:
|
||||
|
||||
1. `get_or_init_model()` -- lazy-loads ONNX model (downloads if needed), caches in `Arc<RwLock<HashMap>>`
|
||||
2. `embed_chunks()` -- collects chunk texts, calls `model.embed(texts, batch_size)`, zips results back to `ChunkWithEmbedding`
|
||||
|
||||
Default config: `batch_size=256`, `device=CPU`, `parallel_requests=4`.
|
||||
|
||||
### ONNX Runtime Requirement
|
||||
|
||||
Embeddings require ONNX Runtime. Feature-gated via:
|
||||
|
||||
```toml
|
||||
[features]
|
||||
embeddings = ["dep:fastembed", "dep:ort"]
|
||||
```
|
||||
|
||||
Install: `brew install onnxruntime` (macOS) / `apt install libonnxruntime libonnxruntime-dev` (Linux). Verify: `echo $ORT_DYLIB_PATH`.
|
||||
|
||||
## RAG Integration Pattern
|
||||
|
||||
The full extraction-to-RAG pipeline:
|
||||
|
||||
1. **Extract**: `extract_file(path, config)` -> `ExtractionResult`
|
||||
2. **Chunk**: Apply preset strategy to `result.content` -> `Vec<Chunk>`
|
||||
3. **Embed**: If embedding config present, `TextEmbeddingManager::embed_chunks()` -> `Vec<ChunkWithEmbedding>`
|
||||
4. **Output**: `RagDocument { file_path, metadata, chunks }` ready for vector DB ingestion
|
||||
|
||||
See `ChunkWithEmbedding` struct in `types.rs`: contains `text`, `embedding: Vec<f32>`, `dimensions`, `norm`, `metadata`.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **Chunking is preprocessing** - Always chunk before embedding so every input fits within the embedding model's token limit (vector dimensions are fixed by the model, not by chunking)
|
||||
2. **Overlap prevents information loss** - Set overlap to 15-20% of chunk size
|
||||
3. **Embedding models are stateful** - Lazy load and cache to avoid repeated initialization
|
||||
4. **ONNX Runtime is required** - Gracefully degrade if not available (skip embeddings)
|
||||
5. **Batch embedding for performance** - Never embed single chunks; batch 50-1000 chunks
|
||||
6. **Normalize embeddings for search** - L2-normalize vectors so cosine similarity reduces to a dot product
|
||||
7. **Cache embedding results** - Don't re-embed identical text chunks
|
||||
8. **Model selection impacts quality** - bge-small (384) for speed, bge-base (768) for quality
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **extraction-pipeline-patterns** - Text extraction preceding chunking
|
||||
- **api-server-mcp** - Endpoint for chunking + embedding operations
|
||||
- **ocr-backend-management** - OCR text quality affects chunking success
|
||||
126
.ai-rulez/skills/extraction-pipeline-patterns/SKILL.md
Normal file
126
.ai-rulez/skills/extraction-pipeline-patterns/SKILL.md
Normal file
@@ -0,0 +1,126 @@
|
||||
---
|
||||
description: "Document extraction pipeline architecture and patterns"
|
||||
name: extraction-pipeline-patterns
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Extraction Pipeline Patterns
|
||||
|
||||
**Kreuzberg's format detection -> extraction -> fallback orchestration for 75+ file formats**
|
||||
|
||||
## Core Pipeline Architecture
|
||||
|
||||
The extraction pipeline (`crates/kreuzberg/src/core/pipeline.rs`, `crates/kreuzberg/src/extraction/`) orchestrates:
|
||||
|
||||
1. **Format Detection** - MIME type inference + extension validation -> select appropriate extractor
|
||||
2. **Intelligent Extraction** - Route to format-specific extractors (PDF, DOCX, Excel, HTML, images, archives, etc.)
|
||||
3. **Fallback Strategies** - Password-protected PDFs, OCR for images, nested archive handling, corrupted file recovery
|
||||
4. **Post-Processing Pipeline** - Validators, quality processing, chunking, custom hooks (see `core/pipeline.rs`)
|
||||
|
||||
## Format Detection Strategy
|
||||
|
||||
**Location**: `crates/kreuzberg/src/core/mime.rs`, `crates/kreuzberg/src/core/formats.rs`
|
||||
|
||||
Pattern: detect via magic bytes, validate extension alignment (prevent spoofing), route to extractor. Multiple extractors for same format -> choose highest confidence/specificity.
|
||||
|
||||
```rust
|
||||
// Pseudocode: core/mime.rs
|
||||
match (magic_bytes(content), extension) {
|
||||
(Some(fmt), Some(ext)) if aligned -> Ok(fmt),
|
||||
(Some(fmt), Some(ext)) if misaligned -> Err(FormatMismatch),
|
||||
(Some(fmt), None) -> Ok(fmt), // magic bytes only
|
||||
(None, Some(ext)) -> Ok(from_extension(ext)),
|
||||
_ -> Err(UnknownFormat),
|
||||
}
|
||||
```
|
||||
|
||||
## Extraction Modules (75+ Formats)
|
||||
|
||||
| Category | Extractors | Key Modules |
|
||||
| ------------ | ------------------------------------------------ | ---------------------------------------------------- |
|
||||
| **Office** | DOCX, XLSX, XLSM, XLSB, XLS, PPTX, ODP, ODS | `extraction/{docx,excel,pptx}.rs` |
|
||||
| **PDF** | Standard + encrypted, password attempts | `pdf/` subdirectory (13 files) |
|
||||
| **Images** | PNG, JPG, TIFF, WebP, JP2, SVG (OCR-enabled) | `extraction/image.rs` + `ocr/` |
|
||||
| **Web** | HTML, XHTML, XML, SVG (DOM parsing) | `extraction/html.rs` (67KB - complex table handling) |
|
||||
| **Email** | EML, MSG (headers, body, attachments, threading) | `extraction/email.rs` |
|
||||
| **Archives** | ZIP, TAR, GZ, 7Z (recursive extraction) | `extraction/archive.rs` (31KB) |
|
||||
| **Markdown** | MD, TXT, RST, Org Mode, RTF | `extraction/markdown.rs` |
|
||||
| **Academic** | LaTeX, BibTeX, JATS, Jupyter, DocBook | `extraction/{structured,xml}.rs` |
|
||||
|
||||
## Extraction Dispatcher
|
||||
|
||||
```rust
|
||||
// Pseudocode: extraction/mod.rs
|
||||
let format = detect_format(source.bytes, source.extension);
|
||||
let result = match format {
|
||||
Pdf -> extract_pdf(source, config),
|
||||
Docx -> extract_docx(source, config),
|
||||
Image -> extract_image_with_ocr_fallback(source, config),
|
||||
Archive -> extract_archive_recursive(source, config),
|
||||
_ -> extract_with_plugin(format, source, config),
|
||||
};
|
||||
run_pipeline(result, config) // post-processing always runs
|
||||
```
|
||||
|
||||
## Fallback Strategies
|
||||
|
||||
- **Password-Protected PDFs**: Try primary password -> secondary password list -> return `is_encrypted=true` in metadata on failure
|
||||
- **OCR Fallback**: If image text extraction confidence < threshold, trigger OCR backend; return both results with scores
|
||||
- **Nested Archives**: Recursive extraction with configurable depth limit; flatten or preserve hierarchy
|
||||
- **Corrupted File Recovery**: Stream-based parsing, emit content up to error point, include error location in metadata
|
||||
|
||||
## Configuration Integration
|
||||
|
||||
**Location**: `crates/kreuzberg/src/core/config.rs`, `crates/kreuzberg/src/core/config_validation.rs`
|
||||
|
||||
`ExtractionConfig` holds format-specific configs (`pdf`, `image`, `html`, `office`), fallback orchestration (`fallback`), and post-processing (`postprocessor`, `chunking`, `keywords`). See struct definition in `config.rs`.
|
||||
|
||||
## Plugin System Integration
|
||||
|
||||
**Location**: `crates/kreuzberg/src/plugins/`
|
||||
|
||||
- **CustomExtractor**: Override built-in format extractors
|
||||
- **PostProcessor**: Modify results after extraction (Early/Middle/Late stages)
|
||||
- **Validator**: Fail-fast validation (e.g., minimum text length)
|
||||
- **OCRBackend**: Swap OCR engine
|
||||
|
||||
Plugin registry loaded at startup, cached for zero-cost lookup.
|
||||
|
||||
## Feature Flag Strategy
|
||||
|
||||
**Location**: `Cargo.toml` (workspace), `crates/kreuzberg/Cargo.toml`, `FEATURE_MATRIX.md`
|
||||
|
||||
20+ features across 9 language bindings. Key feature groups:
|
||||
|
||||
| Group | Features | Notes |
|
||||
| -------- | ------------------------------------------------------------------------------------ | --------------------------------- |
|
||||
| OCR      | `tesseract` (default), `tesseract-static`, `ocr-minimal`                             | Treat as mutually exclusive       |
|
||||
| Formats | `pdf`, `pdf-minimal`, `office`, `office-minimal` | |
|
||||
| AI/ML | `embeddings` (requires ONNX), `keywords-yake`, `keywords-rake`, `language-detection` | |
|
||||
| Server | `api` (Axum), `mcp`, `tokio-runtime`, `lite-runtime` | |
|
||||
| Bindings | `python-bindings`, `ruby-bindings`, `php-bindings`, `node-bindings`, `wasm` | |
|
||||
|
||||
Conditional compilation: modules are gated with `#[cfg(feature = "...")]`. At runtime, `validate_config()` warns if a requested feature is not compiled in.
|
||||
|
||||
### Feature Flag Critical Rules
|
||||
|
||||
1. **Never mix conflicting features** - e.g., `ocr-minimal` + `tesseract` should error at compile time
|
||||
2. **Always provide feature diagnostics** - Config validation must warn if feature unavailable
|
||||
3. **Default to maximum feature set** - Unless embedded/minimal specifically requested
|
||||
4. **Test all feature combinations** - Matrix testing in CI catches regressions
|
||||
5. **WASM is incompatible with some features** - Embeddings, keywords, and OCR are unavailable on WASM targets
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **Always use format detection** before routing to extractors (prevent confusion attacks)
|
||||
2. **Stream-based parsing** for PDFs/archives to handle multi-GB files
|
||||
3. **Post-pipeline is mandatory**: All extraction results flow through `run_pipeline()` for validators/hooks
|
||||
4. **Plugin overrides are order-dependent**: Plugins registered first take priority
|
||||
5. **Fallback timeouts**: Set reasonable OCR/archive extraction timeouts (config-driven)
|
||||
6. **Metadata preservation**: Include format detection confidence, extraction method used, any fallbacks applied
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **ocr-backend-management** - OCR engine selection and image preprocessing
|
||||
- **chunking-embeddings** - Post-extraction text splitting with FastEmbed
|
||||
- **api-server-mcp** - Axum endpoint for extraction pipeline exposure and MCP server
|
||||
78
.ai-rulez/skills/format-specific-extraction/SKILL.md
Normal file
78
.ai-rulez/skills/format-specific-extraction/SKILL.md
Normal file
@@ -0,0 +1,78 @@
|
||||
---
|
||||
name: format-specific-extraction
|
||||
description: "Format-specific document extraction workflows"
|
||||
priority: high
|
||||
---
|
||||
|
||||
# Format-Specific Extraction Workflows
|
||||
|
||||
## Office XML (DOCX/PPTX/ODT)
|
||||
|
||||
```text
|
||||
ZIP archive → Security validation → XML parsing → Text + tables + metadata
|
||||
```
|
||||
|
||||
1. `ZipBombValidator::new(limits).validate(&mut archive)?`
|
||||
2. Extract XML files from archive (`word/document.xml`, `ppt/slides/*.xml`, `content.xml`)
|
||||
3. Parse with `quick-xml::Reader` (streaming) + `DepthValidator` + `StringGrowthValidator`
|
||||
4. Extract metadata via `crate::extraction::office_metadata::extract_metadata()`
|
||||
5. See: `extractors/docx.rs`, `extractors/pptx.rs`, `extractors/odt.rs`
|
||||
|
||||
## PDF
|
||||
|
||||
```text
|
||||
Bytes → pdf_oxide → Per-page text + OCR fallback → Tables → Metadata
|
||||
```
|
||||
|
||||
1. `pdf_oxide::PdfDocument::from_bytes(content)?`
|
||||
2. Check if needs OCR: `config.force_ocr || !has_searchable_text()`
|
||||
3. Extract text per page, tables if `config.pages` enabled
|
||||
4. Feature-gated: `#[cfg(feature = "pdf")]`
|
||||
5. See: `extractors/pdf/mod.rs`
|
||||
|
||||
## Archives (ZIP/TAR/7z/GZIP)
|
||||
|
||||
```text
|
||||
Validate → Extract metadata → Extract plaintext files only
|
||||
```
|
||||
|
||||
1. `ZipBombValidator` BEFORE any extraction
|
||||
2. Extract metadata (file list, sizes)
|
||||
3. Extract text content from plaintext files
|
||||
4. Use `build_archive_result()` helper
|
||||
5. See: `extractors/archive.rs`, `extraction/archive/*.rs`
|
||||
|
||||
## Structured Text (JSON/YAML/TOML/XML)
|
||||
|
||||
```text
|
||||
Detect format from MIME → Parse → Pretty-print → Metadata
|
||||
```
|
||||
|
||||
Single `StructuredExtractor` handles multiple MIME types. Parse with format-specific library, pretty-print to text.
|
||||
See: `extractors/structured.rs`
|
||||
|
||||
## Email (EML/MSG)
|
||||
|
||||
```text
|
||||
Parse headers → Extract body (text/html) → Process attachments
|
||||
```
|
||||
|
||||
See: `extraction/email.rs`, `extractors/email.rs`
|
||||
|
||||
## Common Helpers
|
||||
|
||||
| Helper | Location | Purpose |
|
||||
| ------------------------------------- | --------------------------- | ------------------------------ |
|
||||
| `office_metadata::extract_metadata()` | `extraction/office.rs` | Office XML metadata |
|
||||
| `cells_to_markdown()` | `extraction/mod.rs` | Convert cell grid to GFM table |
|
||||
| `build_archive_result()` | `extraction/archive/mod.rs` | Standard archive result |
|
||||
|
||||
## Adding a New Format
|
||||
|
||||
1. Add MIME type to `EXT_TO_MIME` in `core/mime.rs`
|
||||
2. Create extractor implementing `DocumentExtractor` trait
|
||||
3. Set `supported_mime_types()` and `priority()` (default: 50)
|
||||
4. Register in `extractors/mod.rs` → `register_default_extractors()`
|
||||
5. Feature-gate if optional: `#[cfg(feature = "my-format")]`
|
||||
6. Apply security validators for user content
|
||||
7. Add tests with fixture files
|
||||
97
.ai-rulez/skills/plugin-architecture-patterns/SKILL.md
Normal file
97
.ai-rulez/skills/plugin-architecture-patterns/SKILL.md
Normal file
@@ -0,0 +1,97 @@
|
||||
---
|
||||
name: plugin-architecture-patterns
|
||||
description: "Plugin architecture, registration, and trait patterns"
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Plugin Architecture & Registration
|
||||
|
||||
## Plugin Types
|
||||
|
||||
| Type | Trait | Location |
|
||||
| ------------------ | --------------------------- | ---------------------------- |
|
||||
| Document Extractor | `DocumentExtractor: Plugin` | `plugins/extractor/trait.rs` |
|
||||
| OCR Backend | `OcrBackend: Plugin` | `plugins/ocr/trait.rs` |
|
||||
| Post Processor | `PostProcessor: Plugin` | `plugins/processor/trait.rs` |
|
||||
| Validator | `Validator: Plugin` | `plugins/validator/trait.rs` |
|
||||
|
||||
## DocumentExtractor Implementation
|
||||
|
||||
```rust
|
||||
use crate::plugins::{DocumentExtractor, Plugin};
|
||||
use async_trait::async_trait;
|
||||
|
||||
pub struct MyExtractor;
|
||||
|
||||
impl Plugin for MyExtractor {
|
||||
fn name(&self) -> &str { "my-extractor" }
|
||||
fn version(&self) -> String { env!("CARGO_PKG_VERSION").to_string() }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for MyExtractor {
|
||||
async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
||||
-> Result<ExtractionResult> { /* ... */ }
|
||||
|
||||
fn supported_mime_types(&self) -> &[&str] { &["application/x-custom"] }
|
||||
fn priority(&self) -> i32 { 50 }
|
||||
|
||||
// WASM support (optional)
|
||||
fn as_sync_extractor(&self) -> Option<&dyn SyncExtractor> { None }
|
||||
}
|
||||
```
|
||||
|
||||
## Priority System
|
||||
|
||||
| Range | Use |
|
||||
| ------ | ------------------------- |
|
||||
| 0-25 | Fallback/low-quality |
|
||||
| 26-49 | Alternative extractors |
|
||||
| **50** | **Default (built-in)** |
|
||||
| 51-75 | Premium/enhanced |
|
||||
| 76-100 | Specialized/high-priority |
|
||||
|
||||
Registry selects **highest priority** extractor for each MIME type. Override built-ins with priority > 50.
|
||||
|
||||
## Registration
|
||||
|
||||
```rust
|
||||
// In extractors/mod.rs → register_default_extractors()
|
||||
let registry = get_document_extractor_registry();
|
||||
let mut registry = registry.write()
|
||||
.map_err(|e| KreuzbergError::Other(format!("Registry lock poisoned: {}", e)))?;
|
||||
registry.register(Arc::new(MyExtractor::new()))?;
|
||||
```
|
||||
|
||||
## Feature-Gated Registration
|
||||
|
||||
```rust
|
||||
#[cfg(feature = "office")]
|
||||
{
|
||||
registry.register(Arc::new(DocxExtractor::new()))?;
|
||||
registry.register(Arc::new(PptxExtractor::new()))?;
|
||||
}
|
||||
```
|
||||
|
||||
## PostProcessor Pattern
|
||||
|
||||
```rust
|
||||
impl PostProcessor for MyProcessor {
|
||||
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
|
||||
-> Result<()> {
|
||||
result.content = process_content(&result.content);
|
||||
Ok(())
|
||||
}
|
||||
fn stage(&self) -> ProcessorStage { ProcessorStage::Middle }
|
||||
}
|
||||
```
|
||||
|
||||
Stages run in order: `Early` → `Middle` → `Late`. Failures are isolated: one processor failing does not block the others.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. All plugins **MUST be `Send + Sync`**
|
||||
2. Feature gate with `#[cfg(feature = "...")]` for optional formats
|
||||
3. Use `#[async_trait]` for `DocumentExtractor`
|
||||
4. Initialization via `ensure_initialized()` (lazy, called before first extraction)
|
||||
5. Plugin names: kebab-case (e.g., `"pdf-extractor"`)
|
||||
34
.cargo/config.toml
Normal file
34
.cargo/config.toml
Normal file
@@ -0,0 +1,34 @@
|
||||
# This file is auto-generated by alef. DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# Re-generate with: alef scaffold
|
||||
|
||||
[build]
|
||||
incremental = true
|
||||
jobs = 4
|
||||
|
||||
[net]
|
||||
git-fetch-with-cli = true
|
||||
|
||||
[registries.crates-io]
|
||||
protocol = "sparse"
|
||||
|
||||
# Required for PyO3 / ext-php-rs cdylibs: Python and Zend C-API symbols are
|
||||
# resolved at runtime when the host loads the extension, not at link time.
|
||||
# macOS ld is strict and rejects unresolved symbols by default.
|
||||
[target.'cfg(target_os = "macos")']
|
||||
rustflags = ["-C", "link-arg=-Wl,-undefined,dynamic_lookup"]
|
||||
|
||||
[target.x86_64-pc-windows-msvc]
|
||||
linker = "rust-lld"
|
||||
|
||||
[target.i686-pc-windows-msvc]
|
||||
linker = "rust-lld"
|
||||
|
||||
[target.aarch64-unknown-linux-gnu]
|
||||
linker = "aarch64-linux-gnu-gcc"
|
||||
|
||||
[target.x86_64-unknown-linux-musl]
|
||||
linker = "musl-gcc"
|
||||
|
||||
[target.wasm32-unknown-unknown]
|
||||
rustflags = ["-C", "target-feature=+bulk-memory", "--cfg", "getrandom_backend=\"wasm_js\""]
|
||||
8
.clang-format
Normal file
8
.clang-format
Normal file
@@ -0,0 +1,8 @@
|
||||
---
|
||||
BasedOnStyle: LLVM
|
||||
IndentWidth: 4
|
||||
ColumnLimit: 100
|
||||
BreakBeforeBraces: Attach
|
||||
AllowShortFunctionsOnASingleLine: Empty
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
SortIncludes: true
|
||||
32
.dockerignore
Normal file
32
.dockerignore
Normal file
@@ -0,0 +1,32 @@
|
||||
# =============================================================================
|
||||
# .dockerignore — Docker builds need only Cargo.toml, Cargo.lock, selected crates/, the Elixir NIF crate, and docker/.
|
||||
# Exclude everything else to minimize context and speed up builds.
|
||||
# =============================================================================
|
||||
|
||||
# Start by ignoring everything
|
||||
*
|
||||
|
||||
# Allow only what Docker builds need
|
||||
!Cargo.toml
|
||||
!Cargo.lock
|
||||
!crates/kreuzberg/
|
||||
!crates/kreuzberg-cli/
|
||||
!crates/kreuzberg-tesseract/
|
||||
!crates/kreuzberg-paddle-ocr/
|
||||
!crates/kreuzberg-ffi/
|
||||
!crates/kreuzberg-py/
|
||||
!crates/kreuzberg-node/
|
||||
!crates/kreuzberg-wasm/
|
||||
!packages/elixir/native/kreuzberg_nif/
|
||||
!docker/
|
||||
|
||||
# Re-exclude build artifacts inside allowed dirs
|
||||
**/target/
|
||||
**/.git/
|
||||
**/node_modules/
|
||||
**/__pycache__/
|
||||
**/*.pyc
|
||||
**/*.so
|
||||
**/*.dylib
|
||||
**/*.dll
|
||||
**/*.node
|
||||
182
.editorconfig
Normal file
182
.editorconfig
Normal file
@@ -0,0 +1,182 @@
|
||||
# EditorConfig is awesome: https://EditorConfig.org
|
||||
|
||||
# top-most EditorConfig file
|
||||
root = true
|
||||
|
||||
# All files
|
||||
[*]
|
||||
charset = utf-8
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
end_of_line = lf
|
||||
|
||||
# Code files
|
||||
[*.{cs,go,rs,py,js,ts,tsx,jsx,php,rb}]
|
||||
indent_style = space
|
||||
|
||||
# C# files
|
||||
[*.cs]
|
||||
indent_size = 4
|
||||
|
||||
# Organize usings
|
||||
dotnet_sort_system_directives_first = true
|
||||
dotnet_separate_import_directive_groups = false
|
||||
|
||||
# this. and Me. preferences
|
||||
dotnet_style_qualification_for_field = false:warning
|
||||
dotnet_style_qualification_for_property = false:warning
|
||||
dotnet_style_qualification_for_method = false:warning
|
||||
dotnet_style_qualification_for_event = false:warning
|
||||
|
||||
# Language keywords vs BCL types preferences
|
||||
dotnet_style_predefined_type_for_locals_parameters_members = true:warning
|
||||
dotnet_style_predefined_type_for_member_access = true:warning
|
||||
|
||||
# Parentheses preferences
|
||||
dotnet_style_parentheses_in_arithmetic_binary_operators = always_for_clarity:suggestion
|
||||
dotnet_style_parentheses_in_relational_binary_operators = always_for_clarity:suggestion
|
||||
dotnet_style_parentheses_in_other_binary_operators = always_for_clarity:suggestion
|
||||
dotnet_style_parentheses_in_other_operators = never_if_unnecessary:suggestion
|
||||
|
||||
# Modifier preferences
|
||||
dotnet_style_require_accessibility_modifiers = always:warning
|
||||
dotnet_style_readonly_field = true:warning
|
||||
csharp_preferred_modifier_order = public,private,protected,internal,static,extern,new,virtual,abstract,sealed,override,readonly,unsafe,volatile,async:suggestion
|
||||
|
||||
# Expression-level preferences
|
||||
dotnet_style_object_initializer = true:suggestion
|
||||
dotnet_style_collection_initializer = true:suggestion
|
||||
dotnet_style_explicit_tuple_names = true:warning
|
||||
dotnet_style_prefer_inferred_tuple_names = true:suggestion
|
||||
dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion
|
||||
dotnet_style_prefer_auto_properties = true:suggestion
|
||||
dotnet_style_prefer_conditional_expression_over_assignment = true:silent
|
||||
dotnet_style_prefer_conditional_expression_over_return = true:silent
|
||||
dotnet_style_prefer_compound_assignment = true:suggestion
|
||||
dotnet_style_prefer_simplified_interpolation = true:suggestion
|
||||
dotnet_style_prefer_simplified_boolean_expressions = true:suggestion
|
||||
|
||||
# Null-checking preferences
|
||||
dotnet_style_coalesce_expression = true:warning
|
||||
dotnet_style_null_propagation = true:warning
|
||||
dotnet_style_prefer_is_null_check_over_reference_equality_method = true:warning
|
||||
|
||||
# C# Code Style Rules
|
||||
# var preferences
|
||||
csharp_style_var_for_built_in_types = true:suggestion
|
||||
csharp_style_var_when_type_is_apparent = true:suggestion
|
||||
csharp_style_var_elsewhere = true:suggestion
|
||||
|
||||
# Expression-bodied members
|
||||
csharp_style_expression_bodied_methods = when_on_single_line:suggestion
|
||||
csharp_style_expression_bodied_constructors = false:silent
|
||||
csharp_style_expression_bodied_operators = when_on_single_line:suggestion
|
||||
csharp_style_expression_bodied_properties = when_on_single_line:suggestion
|
||||
csharp_style_expression_bodied_indexers = when_on_single_line:suggestion
|
||||
csharp_style_expression_bodied_accessors = when_on_single_line:suggestion
|
||||
csharp_style_expression_bodied_lambdas = when_on_single_line:suggestion
|
||||
csharp_style_expression_bodied_local_functions = when_on_single_line:suggestion
|
||||
|
||||
# Pattern matching preferences
|
||||
csharp_style_pattern_matching_over_is_with_cast_check = true:warning
|
||||
csharp_style_pattern_matching_over_as_with_null_check = true:warning
|
||||
csharp_style_prefer_switch_expression = true:suggestion
|
||||
csharp_style_prefer_pattern_matching = true:suggestion
|
||||
csharp_style_prefer_not_pattern = true:suggestion
|
||||
|
||||
# Null-checking preferences
|
||||
csharp_style_throw_expression = true:suggestion
|
||||
csharp_style_conditional_delegate_call = true:warning
|
||||
|
||||
# Code block preferences
|
||||
csharp_prefer_braces = true:warning
|
||||
csharp_prefer_simple_using_statement = true:suggestion
|
||||
|
||||
# Expression preferences
|
||||
csharp_prefer_simple_default_expression = true:suggestion
|
||||
csharp_style_pattern_local_over_anonymous_function = true:suggestion
|
||||
csharp_style_inlined_variable_declaration = true:suggestion
|
||||
csharp_style_deconstructed_variable_declaration = true:suggestion
|
||||
csharp_style_prefer_index_operator = true:suggestion
|
||||
csharp_style_prefer_range_operator = true:suggestion
|
||||
csharp_style_implicit_object_creation_when_type_is_apparent = true:suggestion
|
||||
|
||||
# C# Formatting Rules
|
||||
# New line preferences
|
||||
csharp_new_line_before_open_brace = all
|
||||
csharp_new_line_before_else = true
|
||||
csharp_new_line_before_catch = true
|
||||
csharp_new_line_before_finally = true
|
||||
csharp_new_line_before_members_in_object_initializers = true
|
||||
csharp_new_line_before_members_in_anonymous_types = true
|
||||
csharp_new_line_between_query_expression_clauses = true
|
||||
|
||||
# Indentation preferences
|
||||
csharp_indent_case_contents = true
|
||||
csharp_indent_switch_labels = true
|
||||
csharp_indent_labels = no_change
|
||||
csharp_indent_block_contents = true
|
||||
csharp_indent_braces = false
|
||||
csharp_indent_case_contents_when_block = false
|
||||
|
||||
# Space preferences
|
||||
csharp_space_after_cast = false
|
||||
csharp_space_after_keywords_in_control_flow_statements = true
|
||||
csharp_space_between_parentheses = false
|
||||
csharp_space_before_colon_in_inheritance_clause = true
|
||||
csharp_space_after_colon_in_inheritance_clause = true
|
||||
csharp_space_around_binary_operators = before_and_after
|
||||
csharp_space_between_method_declaration_parameter_list_parentheses = false
|
||||
csharp_space_between_method_declaration_empty_parameter_list_parentheses = false
|
||||
csharp_space_between_method_declaration_name_and_open_parenthesis = false
|
||||
csharp_space_between_method_call_parameter_list_parentheses = false
|
||||
csharp_space_between_method_call_empty_parameter_list_parentheses = false
|
||||
csharp_space_between_method_call_name_and_opening_parenthesis = false
|
||||
csharp_space_after_comma = true
|
||||
csharp_space_after_dot = false
|
||||
csharp_space_after_semicolon_in_for_statement = true
|
||||
csharp_space_before_semicolon_in_for_statement = false
|
||||
csharp_space_around_declaration_statements = false
|
||||
csharp_space_before_open_square_brackets = false
|
||||
csharp_space_between_empty_square_brackets = false
|
||||
csharp_space_between_square_brackets = false
|
||||
|
||||
# Wrap preferences
|
||||
csharp_preserve_single_line_statements = false
|
||||
csharp_preserve_single_line_blocks = true
|
||||
|
||||
# Using directive preferences
|
||||
csharp_using_directive_placement = outside_namespace:warning
|
||||
|
||||
# Go files
|
||||
[*.go]
|
||||
indent_style = tab
|
||||
indent_size = 4
|
||||
|
||||
# Rust files
|
||||
[*.rs]
|
||||
indent_size = 4
|
||||
|
||||
# Python files
|
||||
[*.py]
|
||||
indent_size = 4
|
||||
|
||||
# JavaScript/TypeScript files
|
||||
[*.{js,ts,tsx,jsx}]
|
||||
indent_size = 2
|
||||
|
||||
# Ruby files
|
||||
[*.rb]
|
||||
indent_size = 2
|
||||
|
||||
# PHP files
|
||||
[*.php]
|
||||
indent_size = 4
|
||||
|
||||
# YAML files
|
||||
[*.{yml,yaml}]
|
||||
indent_size = 2
|
||||
|
||||
# Markdown files
|
||||
[*.md]
|
||||
trim_trailing_whitespace = false
|
||||
37
.gh-actions-updater.toml
Normal file
37
.gh-actions-updater.toml
Normal file
@@ -0,0 +1,37 @@
|
||||
[scan]
|
||||
include = [
|
||||
".github/workflows/**/*.yml",
|
||||
".github/workflows/**/*.yaml",
|
||||
".github/actions/**/action.yml",
|
||||
".github/actions/**/action.yaml",
|
||||
"action.yml",
|
||||
"action.yaml",
|
||||
]
|
||||
exclude = []
|
||||
recursive = false
|
||||
|
||||
[cache]
|
||||
enabled = true
|
||||
ttl = "6h"
|
||||
|
||||
[update]
|
||||
mode = "latest-tag"
|
||||
# erlef/setup-beam: publisher exposes the major-minor float tag (e.g. v1.24)
|
||||
# but its `dist/index.js` ships only on the patch tag (v1.24.0). The float
|
||||
# resolves to a commit with no bundled JS, so the action exits in <1s with
|
||||
# `failure` and no log message. Pin to patch tags only.
|
||||
exclude = ["erlef/setup-beam"]
|
||||
include_prereleases = false
|
||||
preserve_major = true
|
||||
missing_ref = "warn"
|
||||
|
||||
[output]
|
||||
format = "human"
|
||||
color = "auto"
|
||||
|
||||
[github]
|
||||
api_url = "https://api.github.com"
|
||||
|
||||
[performance]
|
||||
# Omit threads to use Rayon's available-core default.
|
||||
# threads = 8
|
||||
20
.gitattributes
vendored
Normal file
20
.gitattributes
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# Generated by alef scaffold.
|
||||
crates/kreuzberg-ffi/** linguist-generated=true
|
||||
crates/kreuzberg-node/** linguist-generated=true
|
||||
crates/kreuzberg-php/** linguist-generated=true
|
||||
crates/kreuzberg-php/src/** linguist-generated=true
|
||||
crates/kreuzberg-py/** linguist-generated=true
|
||||
crates/kreuzberg-wasm/** linguist-generated=true
|
||||
e2e/** linguist-generated=true
|
||||
packages/csharp/src/** linguist-generated=true
|
||||
packages/dart/** linguist-generated=true
|
||||
packages/elixir/** linguist-generated=true
|
||||
packages/go/v5/** linguist-generated=true
|
||||
packages/java/** linguist-generated=true
|
||||
packages/kotlin-android/** linguist-generated=true
|
||||
packages/python/** linguist-generated=true
|
||||
packages/r/** linguist-generated=true
|
||||
packages/ruby/** linguist-generated=true
|
||||
packages/swift/** linguist-generated=true
|
||||
packages/zig/** linguist-generated=true
|
||||
test_apps/** linguist-generated=true
|
||||
10
.github/CODEOWNERS
vendored
Normal file
10
.github/CODEOWNERS
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# Default owner — everything
|
||||
* @Goldziher
|
||||
|
||||
# Zensical config and documentation
|
||||
/zensical.toml @Goldziher @pratik-mahalle @v-tan
|
||||
/docs/ @Goldziher @pratik-mahalle @v-tan
|
||||
*.md @Goldziher @pratik-mahalle @v-tan
|
||||
|
||||
# Rust crates
|
||||
/crates/ @Goldziher @kh3rld
|
||||
28
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
28
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
name: Bug Report
|
||||
description: Report a bug or unexpected behavior
|
||||
title: "bug: "
|
||||
labels: ["bug"]
|
||||
projects: ["kreuzberg-dev/1"]
|
||||
body:
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Description
|
||||
description: What happened? What did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: steps-to-reproduce
|
||||
attributes:
|
||||
label: Steps to reproduce
|
||||
description: Minimal steps to reproduce the issue.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: reproduction-files
|
||||
attributes:
|
||||
label: Relevant files and configuration
|
||||
description: >-
|
||||
Any configuration files, input files, or code snippets needed to
|
||||
reproduce the issue.
|
||||
render: text
|
||||
1
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
1
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1 @@
|
||||
blank_issues_enabled: true
|
||||
20
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
20
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
name: Documentation Issue
|
||||
description: Report missing, unclear, or incorrect documentation
|
||||
title: "docs: "
|
||||
labels: ["documentation"]
|
||||
projects: ["kreuzberg-dev/1"]
|
||||
body:
|
||||
- type: textarea
|
||||
id: what
|
||||
attributes:
|
||||
label: What
|
||||
description: What documentation is missing, unclear, or incorrect?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: why
|
||||
attributes:
|
||||
label: Why
|
||||
description: Why does this need to change?
|
||||
validations:
|
||||
required: true
|
||||
18
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
18
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
name: Feature Request
|
||||
description: Suggest a new feature or improvement
|
||||
title: "feat: "
|
||||
labels: ["enhancement"]
|
||||
projects: ["kreuzberg-dev/1"]
|
||||
body:
|
||||
- type: textarea
|
||||
id: what
|
||||
attributes:
|
||||
label: What is the proposed feature?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: why
|
||||
attributes:
|
||||
label: Why would this be a good addition?
|
||||
validations:
|
||||
required: true
|
||||
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
## Related
|
||||
|
||||
<!-- Link issues or discussions if applicable -->
|
||||
|
||||
## Description
|
||||
|
||||
<!-- What does this PR do? -->
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] CI passing
|
||||
- [ ] Tests added where applicable
|
||||
9
.github/actionlint.yaml
vendored
Normal file
9
.github/actionlint.yaml
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
self-hosted-runner:
|
||||
labels:
|
||||
- runner-small
|
||||
- runner-medium
|
||||
- runner-medium-arm64
|
||||
- runner-large
|
||||
- runner-large-spot
|
||||
- runner-medium-arm64-spot
|
||||
- runner-gpu-l4
|
||||
313
.github/actions/cache-benchmark-harness/action.yml
vendored
Normal file
313
.github/actions/cache-benchmark-harness/action.yml
vendored
Normal file
@@ -0,0 +1,313 @@
|
||||
name: Cache Benchmark Harness Binary
|
||||
description: >
|
||||
Build and cache the benchmark-harness binary with intelligent caching based on source hashes.
|
||||
Generates cache keys based on harness source + kreuzberg dependency + Cargo files,
|
||||
restores from cache if available, builds if needed, and saves to cache.
|
||||
Validates artifacts after restore or build to ensure integrity.
|
||||
|
||||
inputs:
|
||||
cache-version:
|
||||
description: "Manual version for cache invalidation"
|
||||
required: false
|
||||
default: "v1"
|
||||
|
||||
build-profile:
|
||||
description: "Build profile (release, debug)"
|
||||
required: false
|
||||
default: "release"
|
||||
|
||||
outputs:
|
||||
cache-hit:
|
||||
description: "Boolean indicating exact cache hit"
|
||||
value: ${{ steps.cache-restore.outputs.cache-hit }}
|
||||
|
||||
cache-key:
|
||||
description: "The cache key used"
|
||||
value: ${{ steps.generate-cache-key.outputs.cache-key }}
|
||||
|
||||
binary-path:
|
||||
description: "Path to the built/cached benchmark-harness binary"
|
||||
value: ${{ steps.validate-binary.outputs.binary-path }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
# Validate inputs
#
# Fails fast when `build-profile` is not one of the supported cargo profiles.
# Inputs are passed through `env` instead of being interpolated into the
# script text, so their values can never be parsed as shell syntax.
- name: Validate inputs
  shell: bash
  env:
    BUILD_PROFILE: ${{ inputs.build-profile }}
    CACHE_VERSION: ${{ inputs.cache-version }}
  run: |
    set -euo pipefail

    # Validate build profile with an exact comparison per element. A substring
    # match against the joined list would also accept values such as
    # "release debug".
    valid_profiles=("release" "debug")
    profile_ok=false
    for candidate in "${valid_profiles[@]}"; do
      if [[ "$candidate" == "${BUILD_PROFILE:-}" ]]; then
        profile_ok=true
        break
      fi
    done

    if [[ "$profile_ok" != "true" ]]; then
      echo "❌ Error: build-profile must be one of: ${valid_profiles[*]}"
      exit 1
    fi

    echo "✓ Validation passed"
    echo " Build profile: $BUILD_PROFILE"
    echo " Cache version: ${CACHE_VERSION:-}"
|
||||
|
||||
# Compute hash for benchmark-harness sources
#
# Publishes the `harness-hash` step output, taken from the hex-only line(s)
# printed by scripts/ci/cache/compute-hash.sh.
- name: Compute benchmark-harness source hash
  id: harness-hash
  shell: bash
  run: |
    set -euo pipefail

    echo "=== Computing Benchmark Harness Source Hash ==="

    # Compute hash for harness source files and Cargo.toml.
    # The script output is captured first so that a script failure is reported
    # together with its output instead of aborting silently under `set -e`.
    if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh \
      "tools/benchmark-harness/src/**" \
      "tools/benchmark-harness/Cargo.toml" \
      2>&1); then
      echo "❌ Failed to compute harness source hash"
      echo "$HASH_OUTPUT"
      exit 1
    fi

    # Keep only non-empty, hex-only lines. `|| true` stops a non-matching grep
    # (exit status 1) from killing the step before the check below can explain
    # what went wrong.
    HARNESS_HASH=$(grep -E '^[a-f0-9]+$' <<< "$HASH_OUTPUT" || true)

    if [[ -z "$HARNESS_HASH" ]]; then
      echo "❌ Failed to compute harness source hash"
      exit 1
    fi

    echo "harness-hash=$HARNESS_HASH" >> "$GITHUB_OUTPUT"
    echo "✓ Harness source hash: $HARNESS_HASH"
|
||||
|
||||
# Compute hash for kreuzberg dependency
#
# Publishes the `kreuzberg-hash` step output, taken from the hex-only line(s)
# printed by scripts/ci/cache/compute-hash.sh for the crates/kreuzberg directory.
- name: Compute kreuzberg dependency hash
  id: kreuzberg-hash
  shell: bash
  run: |
    set -euo pipefail

    echo "=== Computing Kreuzberg Dependency Hash ==="

    # Compute hash for kreuzberg crate (dependency).
    # The script output is captured first so that a script failure is reported
    # together with its output instead of aborting silently under `set -e`.
    if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh --dirs \
      "crates/kreuzberg" \
      2>&1); then
      echo "❌ Failed to compute kreuzberg dependency hash"
      echo "$HASH_OUTPUT"
      exit 1
    fi

    # Keep only non-empty, hex-only lines. `|| true` stops a non-matching grep
    # (exit status 1) from killing the step before the check below can explain
    # what went wrong.
    KREUZBERG_HASH=$(grep -E '^[a-f0-9]+$' <<< "$HASH_OUTPUT" || true)

    if [[ -z "$KREUZBERG_HASH" ]]; then
      echo "❌ Failed to compute kreuzberg dependency hash"
      exit 1
    fi

    echo "kreuzberg-hash=$KREUZBERG_HASH" >> "$GITHUB_OUTPUT"
    echo "✓ Kreuzberg dependency hash: $KREUZBERG_HASH"
|
||||
|
||||
# Compute hash for Cargo files
#
# Publishes the `cargo-hash` step output, taken from the hex-only line(s)
# printed by scripts/ci/cache/compute-hash.sh for Cargo.lock.
- name: Compute Cargo files hash
  id: cargo-hash
  shell: bash
  run: |
    set -euo pipefail

    echo "=== Computing Cargo Files Hash ==="

    # Compute hash for Cargo.lock.
    # The script output is captured first so that a script failure is reported
    # together with its output instead of aborting silently under `set -e`.
    if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh --files Cargo.lock 2>&1); then
      echo "❌ Failed to compute Cargo files hash"
      echo "$HASH_OUTPUT"
      exit 1
    fi

    # Keep only non-empty, hex-only lines. `|| true` stops a non-matching grep
    # (exit status 1) from killing the step before the check below can explain
    # what went wrong.
    CARGO_HASH=$(grep -E '^[a-f0-9]+$' <<< "$HASH_OUTPUT" || true)

    if [[ -z "$CARGO_HASH" ]]; then
      echo "❌ Failed to compute Cargo files hash"
      exit 1
    fi

    echo "cargo-hash=$CARGO_HASH" >> "$GITHUB_OUTPUT"
    echo "✓ Cargo files hash: $CARGO_HASH"
|
||||
|
||||
# Generate cache key
|
||||
- name: Generate cache key
|
||||
id: generate-cache-key
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
|
||||
KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
|
||||
CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
|
||||
CACHE_VERSION: ${{ inputs.cache-version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Cache Key Generated ==="
|
||||
|
||||
# Build cache key following format:
|
||||
# harness-{profile}-{platform}-src-{harness-hash}-kreuzberg-{kreuzberg-hash}-cargo-{cargo-hash}-{cache-version}
|
||||
CACHE_KEY="harness-${BUILD_PROFILE}-$(uname -m)-src-${HARNESS_HASH}-kreuzberg-${KREUZBERG_HASH}-cargo-${CARGO_HASH}-${CACHE_VERSION}"
|
||||
|
||||
echo "cache-key=$CACHE_KEY" >> "$GITHUB_OUTPUT"
|
||||
|
||||
echo "Full key: $CACHE_KEY"
|
||||
echo ""
|
||||
echo "Key components:"
|
||||
echo " Profile: $BUILD_PROFILE"
|
||||
echo " Platform: $(uname -m)"
|
||||
echo " Harness hash: $HARNESS_HASH"
|
||||
echo " Kreuzberg hash: $KREUZBERG_HASH"
|
||||
echo " Cargo hash: $CARGO_HASH"
|
||||
echo " Cache version: $CACHE_VERSION"
|
||||
|
||||
# Determine target path based on profile
|
||||
- name: Determine target paths
|
||||
id: target-paths
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Determining Target Paths ==="
|
||||
|
||||
case "$BUILD_PROFILE" in
|
||||
release)
|
||||
TARGET_DIR="target/release"
|
||||
;;
|
||||
debug)
|
||||
TARGET_DIR="target/debug"
|
||||
;;
|
||||
*)
|
||||
echo "❌ Invalid build profile: $BUILD_PROFILE"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "target-dir=$TARGET_DIR" >> "$GITHUB_OUTPUT"
|
||||
echo "✓ Target directory: $TARGET_DIR"
|
||||
|
||||
# Detect architecture for cache keys (shell expansion doesn't work in YAML with: context)
|
||||
- name: Detect architecture
|
||||
id: detect-arch
|
||||
shell: bash
|
||||
run: echo "arch=$(uname -m)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Restore from cache
|
||||
- name: Restore benchmark-harness binary from cache
|
||||
id: cache-restore
|
||||
uses: kreuzberg-dev/actions/cache-binding-artifact@v1
|
||||
with:
|
||||
binding-name: benchmark-harness
|
||||
cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
|
||||
cache-restore-keys: |
|
||||
harness-${{ inputs.build-profile }}-${{ steps.detect-arch.outputs.arch }}-src-
|
||||
harness-${{ inputs.build-profile }}-${{ steps.detect-arch.outputs.arch }}-
|
||||
harness-${{ inputs.build-profile }}-
|
||||
cache-paths: |
|
||||
${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
|
||||
operation: restore
|
||||
|
||||
# Log cache hit status
|
||||
- name: Log cache hit status
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [[ "${{ steps.cache-restore.outputs.cache-hit }}" == "true" ]]; then
|
||||
echo "✓ Cache HIT - benchmark-harness binary found in cache"
|
||||
else
|
||||
echo "✗ Cache MISS - Building benchmark-harness from source"
|
||||
fi
|
||||
|
||||
# Build if cache miss
|
||||
- name: Build benchmark-harness
|
||||
id: build
|
||||
if: steps.cache-restore.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Building Benchmark Harness ==="
|
||||
echo "Profile: $BUILD_PROFILE"
|
||||
|
||||
# Determine cargo build profile argument
|
||||
case "$BUILD_PROFILE" in
|
||||
release)
|
||||
BUILD_ARG="--release"
|
||||
;;
|
||||
debug)
|
||||
# Debug is default, no flag needed
|
||||
BUILD_ARG=""
|
||||
;;
|
||||
*)
|
||||
echo "❌ Invalid build profile: $BUILD_PROFILE"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
# Build benchmark-harness
|
||||
echo "Running: cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG"
|
||||
if ! cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG; then
|
||||
echo "❌ Build failed for benchmark-harness"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Build succeeded"
|
||||
|
||||
# Validate binary exists and is executable
|
||||
- name: Validate benchmark-harness binary
|
||||
id: validate-binary
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
TARGET_DIR: ${{ steps.target-paths.outputs.target-dir }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Validating Benchmark Harness Binary ==="
|
||||
|
||||
BINARY_PATH="${TARGET_DIR}/benchmark-harness"
|
||||
|
||||
# Check if binary exists
|
||||
if [[ ! -f "$BINARY_PATH" ]]; then
|
||||
echo "❌ Binary not found at: $BINARY_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if binary is executable
|
||||
if [[ ! -x "$BINARY_PATH" ]]; then
|
||||
echo "❌ Binary is not executable: $BINARY_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get binary size and info
|
||||
BINARY_SIZE=$(ls -lh "$BINARY_PATH" | awk '{print $5}')
|
||||
BINARY_PERMS=$(ls -l "$BINARY_PATH" | awk '{print $1}')
|
||||
|
||||
echo "binary-path=$BINARY_PATH" >> "$GITHUB_OUTPUT"
|
||||
|
||||
echo "✓ Binary validation passed"
|
||||
echo " Path: $BINARY_PATH"
|
||||
echo " Size: $BINARY_SIZE"
|
||||
echo " Permissions: $BINARY_PERMS"
|
||||
|
||||
# Save to cache if build occurred
|
||||
- name: Save benchmark-harness binary to cache
|
||||
if: steps.cache-restore.outputs.cache-hit != 'true'
|
||||
uses: kreuzberg-dev/actions/cache-binding-artifact@v1
|
||||
with:
|
||||
binding-name: benchmark-harness
|
||||
cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
|
||||
cache-paths: |
|
||||
${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
|
||||
operation: save
|
||||
|
||||
# Summary
#
# Prints the inputs, cache result and hashes for this run. Because of
# `if: always()` this step also runs when an earlier step (including input
# validation) failed, so every value is handed over through `env` rather than
# being interpolated into the script text.
- name: Summary
  if: always()
  shell: bash
  env:
    BUILD_PROFILE: ${{ inputs.build-profile }}
    CACHE_HIT: ${{ steps.cache-restore.outputs.cache-hit == 'true' && 'Yes' || 'No' }}
    CACHE_KEY: ${{ steps.generate-cache-key.outputs.cache-key }}
    BINARY_PATH: ${{ steps.validate-binary.outputs.binary-path }}
    HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
    KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
    CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
  run: |
    set -euo pipefail

    # `${VAR:-}` keeps `set -u` from failing when an earlier step was skipped
    # or failed and therefore produced no output.
    echo ""
    echo "=== Build and Cache Summary ==="
    echo "Build Profile: ${BUILD_PROFILE:-}"
    echo "Platform: $(uname -m)"
    echo "Cache Hit: ${CACHE_HIT:-}"
    echo "Cache Key: ${CACHE_KEY:-}"
    echo "Binary Path: ${BINARY_PATH:-}"
    echo ""
    echo "Hashes:"
    echo " Harness: ${HARNESS_HASH:-}"
    echo " Kreuzberg: ${KREUZBERG_HASH:-}"
    echo " Cargo: ${CARGO_HASH:-}"
|
||||
105
.github/actions/install-system-deps/action.yml
vendored
Normal file
105
.github/actions/install-system-deps/action.yml
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
name: Install System Dependencies
|
||||
description: |
|
||||
Install and cache platform-specific dependencies required for document conversion.
|
||||
Includes: Tesseract OCR, fonts, and build tools.
|
||||
Features robust caching with architecture/version awareness, timeout handling, and retry logic.
|
||||
|
||||
inputs:
|
||||
enable-retry:
|
||||
description: Enable retry logic with exponential backoff
|
||||
required: false
|
||||
default: "true"
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Detect Tesseract version (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
id: detect-tesseract-macos
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/detect-tesseract-macos.sh
|
||||
|
||||
- name: Cache Tesseract & tessdata (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
id: cache-tesseract-macos
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
/usr/local/opt/tesseract/
|
||||
/usr/local/Cellar/tesseract/
|
||||
/opt/homebrew/opt/tesseract/
|
||||
/opt/homebrew/Cellar/tesseract/
|
||||
key: tesseract-macos-${{ runner.arch }}-v5-${{ steps.detect-tesseract-macos.outputs.version }}
|
||||
restore-keys: |
|
||||
tesseract-macos-${{ runner.arch }}-v5-
|
||||
tesseract-macos-${{ runner.arch }}-
|
||||
|
||||
- name: Install dependencies (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/install-macos.sh
|
||||
|
||||
- name: Detect Tesseract version (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
id: detect-tesseract-linux
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/detect-tesseract-linux.sh
|
||||
|
||||
- name: Cache Tesseract data (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
id: cache-tesseract-linux
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
/usr/share/tesseract-ocr/5/tessdata/
|
||||
/usr/share/tesseract-ocr/tessdata/
|
||||
key: tesseract-linux-${{ runner.arch }}-v5-${{ steps.detect-tesseract-linux.outputs.version }}
|
||||
restore-keys: |
|
||||
tesseract-linux-${{ runner.arch }}-v5-
|
||||
tesseract-linux-${{ runner.arch }}-
|
||||
|
||||
- name: Install dependencies (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/install-linux.sh
|
||||
|
||||
- name: Cache Tesseract (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
id: cache-tesseract-windows
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
C:\Program Files\Tesseract-OCR
|
||||
C:\ProgramData\chocolatey\lib\tesseract
|
||||
key: tesseract-windows-${{ runner.arch }}-v5-data
|
||||
restore-keys: |
|
||||
tesseract-windows-${{ runner.arch }}-
|
||||
|
||||
- name: Cache LLVM (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
id: cache-llvm-windows
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
C:\Program Files\LLVM
|
||||
C:\ProgramData\chocolatey\lib\llvm
|
||||
key: llvm-windows-${{ runner.arch }}-v1
|
||||
|
||||
- name: Cache CMake (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
id: cache-cmake-windows
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
C:\Program Files\CMake
|
||||
C:\ProgramData\chocolatey\lib\cmake
|
||||
key: cmake-windows-${{ runner.arch }}-v1
|
||||
|
||||
- name: Install dependencies (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
shell: pwsh
|
||||
env:
|
||||
TESSERACT_CACHE_HIT: ${{ steps.cache-tesseract-windows.outputs.cache-hit }}
|
||||
LLVM_CACHE_HIT: ${{ steps.cache-llvm-windows.outputs.cache-hit }}
|
||||
CMAKE_CACHE_HIT: ${{ steps.cache-cmake-windows.outputs.cache-hit }}
|
||||
run: pwsh -File scripts/ci/install-system-deps/install-windows.ps1
|
||||
197
.github/actions/setup-layout-models/action.yml
vendored
Normal file
197
.github/actions/setup-layout-models/action.yml
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
name: Setup Layout Detection Models Cache
|
||||
description: Download and cache layout detection ONNX models (RT-DETR + TATR) for CI testing
|
||||
|
||||
inputs:
|
||||
cache-enabled:
|
||||
description: Enable model caching (set to false for cross-arch builds)
|
||||
required: false
|
||||
default: "true"
|
||||
models:
|
||||
description: Comma-separated list of models to setup (rtdetr,tatr)
|
||||
required: false
|
||||
default: "rtdetr,tatr"
|
||||
cache-key-suffix:
|
||||
description: Suffix for cache key to differentiate model sets
|
||||
required: false
|
||||
default: "layout-models-v2"
|
||||
|
||||
outputs:
|
||||
cache-hit:
|
||||
description: Whether models were restored from cache (true/false)
|
||||
value: ${{ steps.cache-models.outputs.cache-hit }}
|
||||
cache-dir:
|
||||
description: Path to the layout model cache directory
|
||||
value: ${{ steps.set-outputs.outputs.cache-dir }}
|
||||
models-available:
|
||||
description: Comma-separated list of available models
|
||||
value: ${{ steps.verify-models.outputs.available-models }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Setup cache directory
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p ~/.cache/kreuzberg/layout
|
||||
echo "Cache directory: $HOME/.cache/kreuzberg/layout"
|
||||
|
||||
- name: Restore layout models from cache
|
||||
if: inputs.cache-enabled == 'true'
|
||||
uses: actions/cache@v5
|
||||
id: cache-models
|
||||
with:
|
||||
path: ~/.cache/kreuzberg/layout
|
||||
key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-rtdetr_3bf2fb0e+tatr_c11f4033
|
||||
restore-keys: |
|
||||
${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
|
||||
${{ inputs.cache-key-suffix }}-${{ runner.os }}-
|
||||
${{ inputs.cache-key-suffix }}-
|
||||
|
||||
# Download the RT-DETR model with up to three attempts.
# Backoff before the 2nd/3rd attempt is 5s/15s (5 * 3^(attempt - 2)).
- name: Download RT-DETR model (rtdetr)
  if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
  shell: bash
  run: |
    MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/rtdetr/model.onnx"
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    MODEL_DIR="$CACHE_DIR/rtdetr"
    MODEL_FILE="$MODEL_DIR/model.onnx"

    echo "Downloading RT-DETR layout detection model from $MODEL_URL"
    mkdir -p "$MODEL_DIR"

    # Success is tracked explicitly: a failed or timed-out transfer can leave a
    # partial file behind, so file existence alone does not prove the download
    # worked.
    downloaded=false
    for attempt in 1 2 3; do
      if [ "$attempt" -gt 1 ]; then
        backoff=$((5 * 3 ** (attempt - 2)))
        echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
        sleep "$backoff"
      fi

      if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
        -o "$MODEL_FILE" "$MODEL_URL"; then
        echo "RT-DETR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
        downloaded=true
        break
      fi

      # Discard whatever the failed attempt wrote before retrying.
      rm -f "$MODEL_FILE"
    done

    if [ "$downloaded" != "true" ] || [ ! -f "$MODEL_FILE" ]; then
      echo "ERROR: Failed to download RT-DETR model after 3 attempts"
      exit 1
    fi
|
||||
|
||||
- name: Verify RT-DETR SHA256
|
||||
if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_FILE="$HOME/.cache/kreuzberg/layout/rtdetr/model.onnx"
|
||||
EXPECTED="3bf2fb0ee6df87435b7ae47f0f3930ec3dc97ec56fd824acc6d57bc7a6b89ef2"
|
||||
|
||||
if command -v sha256sum &>/dev/null; then
|
||||
ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
|
||||
else
|
||||
ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
|
||||
fi
|
||||
|
||||
if [ "$ACTUAL" != "$EXPECTED" ]; then
|
||||
echo "ERROR: RT-DETR SHA256 mismatch"
|
||||
echo " Expected: $EXPECTED"
|
||||
echo " Actual: $ACTUAL"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
fi
|
||||
echo "RT-DETR SHA256 verified"
|
||||
|
||||
# Download the TATR model with up to three attempts.
# Backoff before the 2nd/3rd attempt is 5s/15s (5 * 3^(attempt - 2)).
- name: Download TATR model (tatr)
  if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
  shell: bash
  run: |
    MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/tatr/model.onnx"
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    MODEL_DIR="$CACHE_DIR/tatr"
    MODEL_FILE="$MODEL_DIR/tatr.onnx"

    echo "Downloading TATR table recognition model from $MODEL_URL"
    mkdir -p "$MODEL_DIR"

    # Success is tracked explicitly: a failed or timed-out transfer can leave a
    # partial file behind, so file existence alone does not prove the download
    # worked.
    downloaded=false
    for attempt in 1 2 3; do
      if [ "$attempt" -gt 1 ]; then
        backoff=$((5 * 3 ** (attempt - 2)))
        echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
        sleep "$backoff"
      fi

      if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
        -o "$MODEL_FILE" "$MODEL_URL"; then
        echo "TATR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
        downloaded=true
        break
      fi

      # Discard whatever the failed attempt wrote before retrying.
      rm -f "$MODEL_FILE"
    done

    if [ "$downloaded" != "true" ] || [ ! -f "$MODEL_FILE" ]; then
      echo "ERROR: Failed to download TATR model after 3 attempts"
      exit 1
    fi
|
||||
|
||||
- name: Verify TATR SHA256
|
||||
if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_FILE="$HOME/.cache/kreuzberg/layout/tatr/tatr.onnx"
|
||||
EXPECTED="c11f4033da75e9c4d41c403ef356e89caa0a37a7d111b55461e7d5ba856bb6b6"
|
||||
|
||||
if command -v sha256sum &>/dev/null; then
|
||||
ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
|
||||
else
|
||||
ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
|
||||
fi
|
||||
|
||||
if [ "$ACTUAL" != "$EXPECTED" ]; then
|
||||
echo "ERROR: TATR SHA256 mismatch"
|
||||
echo " Expected: $EXPECTED"
|
||||
echo " Actual: $ACTUAL"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
fi
|
||||
echo "TATR SHA256 verified"
|
||||
|
||||
# Report which model files are present in the cache directory and publish
# them as the comma-separated `available-models` step output. Fails only when
# no model file is present at all.
- name: Verify downloaded models
  id: verify-models
  shell: bash
  run: |
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    AVAILABLE_MODELS=()
    TOTAL_SIZE=0

    echo "Checking for layout models in $CACHE_DIR"

    # `numfmt` is not available on every runner; fall back to raw byte counts.
    if [ -f "$CACHE_DIR/rtdetr/model.onnx" ]; then
      SIZE=$(wc -c < "$CACHE_DIR/rtdetr/model.onnx" | tr -d ' ')
      AVAILABLE_MODELS+=("rtdetr")
      TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
      echo " ✓ RT-DETR model: $(numfmt --to=iec-i --suffix=B "$SIZE" 2>/dev/null || echo "$SIZE bytes")"
    fi

    if [ -f "$CACHE_DIR/tatr/tatr.onnx" ]; then
      SIZE=$(wc -c < "$CACHE_DIR/tatr/tatr.onnx" | tr -d ' ')
      AVAILABLE_MODELS+=("tatr")
      TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
      echo " ✓ TATR model: $(numfmt --to=iec-i --suffix=B "$SIZE" 2>/dev/null || echo "$SIZE bytes")"
    fi

    if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
      echo "ERROR: No layout models found in cache directory after download"
      echo "available-models=" >> "$GITHUB_OUTPUT"
      exit 1
    fi

    # Join the array with commas (IFS is changed only inside the subshell).
    AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
    echo "✓ Total cached layout models: ${#AVAILABLE_MODELS[@]} ($(numfmt --to=iec-i --suffix=B "$TOTAL_SIZE" 2>/dev/null || echo "$TOTAL_SIZE bytes"))"
    echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Publish the layout cache directory as the `cache-dir` step output and export
# KREUZBERG_CACHE_DIR (the parent directory) to later steps of the job.
- name: Set cache directory output
  id: set-outputs
  shell: bash
  run: |
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    # The runner file paths are quoted so a path containing spaces cannot be
    # word-split by the shell.
    echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
    echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
    echo "Layout model cache configured at: $CACHE_DIR"
|
||||
46
.github/actions/setup-onnx-runtime/action.yml
vendored
Normal file
46
.github/actions/setup-onnx-runtime/action.yml
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
name: Setup ONNX Runtime
|
||||
description: Download and stage ONNX Runtime libraries for bindings
|
||||
inputs:
|
||||
ort-version:
|
||||
description: ONNX Runtime version to download
|
||||
required: true
|
||||
dest-dir:
|
||||
description: Directory (relative to workspace) where libraries should be copied
|
||||
required: false
|
||||
default: crates/kreuzberg-node
|
||||
arch-id:
|
||||
description: Override architecture (x64|arm64). Defaults to runner architecture.
|
||||
required: false
|
||||
default: ""
|
||||
strategy:
|
||||
description: "ORT linking strategy: 'system' (dynamic link, default) or 'bundled' (static link via ort-bundled cargo feature)"
|
||||
required: false
|
||||
default: system
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Cache ONNX Runtime
|
||||
id: cache-onnx
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
${{ runner.temp }}/onnxruntime
|
||||
key: onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-${{ inputs.ort-version }}
|
||||
restore-keys: |
|
||||
onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-
|
||||
onnx-v2-${{ runner.os }}-
|
||||
|
||||
# Inputs are handed to the script through environment variables so that their
# values are never parsed as shell syntax. Positional arguments are unchanged:
# version, destination directory, architecture override, linking strategy.
- name: Prepare ONNX Runtime (Linux)
  if: runner.os == 'Linux'
  shell: bash
  env:
    ORT_VERSION: ${{ inputs.ort-version }}
    DEST_DIR: ${{ inputs.dest-dir }}
    ARCH_ID: ${{ inputs.arch-id }}
    STRATEGY: ${{ inputs.strategy }}
  run: scripts/ci/actions/setup-onnx-runtime/linux.sh "${ORT_VERSION:-}" "${DEST_DIR:-}" "${ARCH_ID:-}" "${STRATEGY:-}"
|
||||
|
||||
# Inputs are handed to the script through environment variables so that their
# values are never parsed as shell syntax. Positional arguments are unchanged:
# version, destination directory, architecture override, linking strategy.
- name: Prepare ONNX Runtime (macOS)
  if: runner.os == 'macOS'
  shell: bash
  env:
    ORT_VERSION: ${{ inputs.ort-version }}
    DEST_DIR: ${{ inputs.dest-dir }}
    ARCH_ID: ${{ inputs.arch-id }}
    STRATEGY: ${{ inputs.strategy }}
  run: scripts/ci/actions/setup-onnx-runtime/macos.sh "${ORT_VERSION:-}" "${DEST_DIR:-}" "${ARCH_ID:-}" "${STRATEGY:-}"
|
||||
|
||||
# Inputs are handed to the script through environment variables so that their
# values are never parsed as PowerShell syntax. Positional arguments are
# unchanged: version, destination directory, architecture override, strategy.
- name: Prepare ONNX Runtime (Windows)
  if: runner.os == 'Windows'
  shell: pwsh
  env:
    ORT_VERSION: ${{ inputs.ort-version }}
    DEST_DIR: ${{ inputs.dest-dir }}
    ARCH_ID: ${{ inputs.arch-id }}
    STRATEGY: ${{ inputs.strategy }}
  run: scripts/ci/actions/setup-onnx-runtime/windows.ps1 "$env:ORT_VERSION" "$env:DEST_DIR" "$env:ARCH_ID" "$env:STRATEGY"
|
||||
202
.github/actions/setup-paddle-ocr-models/README.md
vendored
Normal file
202
.github/actions/setup-paddle-ocr-models/README.md
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
# Setup PaddleOCR Models Cache
|
||||
|
||||
GitHub Action to download and cache PaddleOCR ONNX models for CI testing and development.
|
||||
|
||||
## Overview
|
||||
|
||||
This action manages the setup of PaddleOCR PP-OCRv5 ONNX models used by the `kreuzberg-paddle-ocr` crate for optical character recognition testing. It:
|
||||
|
||||
- Downloads three model types (detection, classification, recognition) from Hugging Face
|
||||
- Caches models per OS and CPU architecture (Linux x86_64, Linux ARM64, macOS, Windows)
|
||||
- Provides environment variables for downstream use
|
||||
- Outputs cache hit status and available model information
|
||||
- Retries each download up to three times with backoff, and fails the step if a model still cannot be downloaded
|
||||
|
||||
## Models
|
||||
|
||||
The action downloads pre-converted ONNX format models from the `Kreuzberg/paddleocr-onnx-models` Hugging Face repository:
|
||||
|
||||
| Model Type | File | Size | Purpose |
|
||||
| -------------------- | ------------------------------------- | ------- | ----------------------------------------- |
|
||||
| Detection (det) | `PP-OCRv5_server_det_infer.onnx` | ~84 MB | Text location detection (PP-OCRv5 server) |
|
||||
| Classification (cls) | `ch_ppocr_mobile_v2.0_cls_infer.onnx` | ~0.6 MB | Text orientation classification |
|
||||
| Recognition (rec) | `rec/english/model.onnx` | ~8 MB | Text character recognition (PP-OCRv5) |
|
||||
|
||||
**Total cache size: ~93 MB per OS/architecture combination**
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
```
|
||||
|
||||
### With Custom Cache Suffix
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
with:
|
||||
cache-key-suffix: my-paddle-ocr-v5
|
||||
```
|
||||
|
||||
### Disable Caching
|
||||
|
||||
For cross-architecture builds where caching doesn't help:
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
with:
|
||||
cache-enabled: false
|
||||
```
|
||||
|
||||
### Download Specific Models Only
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
with:
|
||||
models: "det,rec" # Skip classification model
|
||||
```
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Required | Default |
|
||||
| ------------------ | --------------------------------------------------------------- | -------- | -------------------- |
|
||||
| `cache-enabled` | Enable model caching (set false for cross-arch builds) | No | `true` |
|
||||
| `models` | Comma-separated list of models to setup (det,cls,rec or subset) | No | `det,cls,rec` |
|
||||
| `cache-key-suffix` | Suffix for cache key to differentiate model sets | No | `paddle-ocr-v5-onnx` |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
| ------------------ | ---------------------------------------------------- |
|
||||
| `cache-hit` | Whether models were restored from cache (true/false) |
|
||||
| `cache-dir` | Path to the PaddleOCR model cache directory |
|
||||
| `models-available` | Comma-separated list of available models after setup |
|
||||
|
||||
## Outputs as Environment Variables
|
||||
|
||||
The action automatically exports:
|
||||
|
||||
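- `PADDLE_OCR_MODEL_CACHE`: Absolute path to the model cache directory (the action also exports `KREUZBERG_CACHE_DIR`, which points to its parent directory `~/.cache/kreuzberg`)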
|
||||
|
||||
## Cache Strategy
|
||||
|
||||
Models are cached using GitHub Actions cache with the following key structure:
|
||||
|
||||
```text
|
||||
paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4
|
||||
```
|
||||
|
||||
Cache restoration order (restore-keys):
|
||||
|
||||
1. Exact match: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4`
|
||||
2. OS-Architecture: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-`
|
||||
3. OS only: `paddle-ocr-v5-onnx-{OS}-`
|
||||
4. Any: `paddle-ocr-v5-onnx-`
|
||||
|
||||
## Example: CI Rust Workflow Integration
|
||||
|
||||
```yaml
|
||||
jobs:
|
||||
paddle-ocr-tests:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
id: paddle-models
|
||||
|
||||
- name: Run PaddleOCR tests
|
||||
run: cargo test --package kreuzberg-paddle-ocr
|
||||
env:
|
||||
PADDLE_OCR_MODEL_CACHE: ${{ steps.paddle-models.outputs.cache-dir }}
|
||||
|
||||
- name: Report cache status
|
||||
if: always()
|
||||
run: |
|
||||
echo "Cache hit: ${{ steps.paddle-models.outputs.cache-hit }}"
|
||||
echo "Available models: ${{ steps.paddle-models.outputs.models-available }}"
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The action downloads models sequentially and will fail if a required model download fails. After downloading:
|
||||
|
||||
- The verify step reports which models are actually available in the output
|
||||
- Downstream tests can check `models-available` to know what's available
|
||||
- If no model file is found at all, the verify step exits with an error and the action fails
|
||||
|
||||
## Download Sources
|
||||
|
||||
Models are downloaded from:
|
||||
|
||||
```text
|
||||
https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/
|
||||
```
|
||||
|
||||
If this repository becomes unavailable, each download step fails with an error after three attempts. Alternative sources can be configured by changing the `MODEL_URL` (and `DICT_URL`) shell variables in the action's download steps.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Models not being cached
|
||||
|
||||
1. Check that `cache-enabled` is not set to `false`
|
||||
2. Verify GitHub Actions cache is not full (max 10 GB per repository)
|
||||
3. Check runner OS and architecture match cache keys
|
||||
4. View cache in repository settings (Settings → Actions → Caches)
|
||||
|
||||
### Download timeouts
|
||||
|
||||
If downloads timeout:
|
||||
|
||||
- Increase the 600-second `--max-time` limit passed to `curl` in the action's download steps
|
||||
- Check Hugging Face API availability
|
||||
- Try reducing the number of models (`models: "det,rec"`)
|
||||
|
||||
### Verifying models are present
|
||||
|
||||
Check that all expected models exist in the correct directory structure:
|
||||
|
||||
```bash
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```text
|
||||
drwxr-xr-x det/
|
||||
drwxr-xr-x cls/
|
||||
drwxr-xr-x rec/
|
||||
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/det/
|
||||
-rw-r--r-- model.onnx (~84 MB)
|
||||
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/cls/
|
||||
-rw-r--r-- model.onnx (~0.6 MB)
|
||||
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/rec/english/
|
||||
-rw-r--r-- model.onnx (~8 MB)
|
||||
-rw-r--r-- dict.txt
|
||||
```
|
||||
|
||||
The directory structure must match what `ModelManager` expects in `model_manager.rs`.
|
||||
|
||||
## Performance Impact
|
||||
|
||||
- **First run (no cache)**: ~30-60 seconds (download time depends on network)
|
||||
- **Cached run**: <1 second (cache restore)
|
||||
- **Cache size**: ~93 MB per OS/architecture
|
||||
- **Network bandwidth**: ~93 MB download on cache miss
|
||||
|
||||
## Related Actions
|
||||
|
||||
- `.github/actions/setup-tesseract-cache` - Similar caching for Tesseract models
|
||||
- `.github/actions/cache-hf-fastembed` - Hugging Face model caching for fastembed
|
||||
- `.github/actions/setup-onnx-runtime` - ONNX Runtime setup for inference
|
||||
|
||||
## See Also
|
||||
|
||||
- [PaddleOCR Documentation](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
- [kreuzberg-paddle-ocr crate](../../../crates/kreuzberg-paddle-ocr)
|
||||
- [ModelManager source](../../../crates/kreuzberg/src/paddle_ocr/model_manager.rs)
|
||||
231
.github/actions/setup-paddle-ocr-models/action.yml
vendored
Normal file
231
.github/actions/setup-paddle-ocr-models/action.yml
vendored
Normal file
@@ -0,0 +1,231 @@
|
||||
name: Setup PaddleOCR Models Cache
|
||||
description: Download and cache PaddleOCR ONNX models for CI testing
|
||||
|
||||
inputs:
|
||||
cache-enabled:
|
||||
description: Enable model caching (set to false for cross-arch builds)
|
||||
required: false
|
||||
default: "true"
|
||||
models:
|
||||
description: Comma-separated list of models to setup (det,cls,rec or specific subset)
|
||||
required: false
|
||||
default: "det,cls,rec"
|
||||
cache-key-suffix:
|
||||
description: Suffix for cache key to differentiate model sets
|
||||
required: false
|
||||
default: "paddle-ocr-v5-onnx"
|
||||
|
||||
outputs:
|
||||
cache-hit:
|
||||
description: Whether models were restored from cache (true/false)
|
||||
value: ${{ steps.cache-models.outputs.cache-hit }}
|
||||
cache-dir:
|
||||
description: Path to the PaddleOCR model cache directory
|
||||
value: ${{ steps.set-outputs.outputs.cache-dir }}
|
||||
models-available:
|
||||
description: Comma-separated list of available models
|
||||
value: ${{ steps.verify-models.outputs.available-models }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Setup cache directory
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p ~/.cache/kreuzberg/paddle-ocr
|
||||
echo "Cache directory: $HOME/.cache/kreuzberg/paddle-ocr"
|
||||
|
||||
- name: Restore PaddleOCR models from cache
|
||||
if: inputs.cache-enabled == 'true'
|
||||
uses: actions/cache@v5
|
||||
id: cache-models
|
||||
with:
|
||||
path: ~/.cache/kreuzberg/paddle-ocr
|
||||
key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-v4
|
||||
restore-keys: |
|
||||
${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
|
||||
${{ inputs.cache-key-suffix }}-${{ runner.os }}-
|
||||
${{ inputs.cache-key-suffix }}-
|
||||
|
||||
- name: Download detection model (det)
|
||||
if: contains(inputs.models, 'det') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/PP-OCRv5_server_det_infer.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/det"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading detection model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "Detection model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download detection model after 3 attempts"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Download classification model (cls)
|
||||
if: contains(inputs.models, 'cls') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/ch_ppocr_mobile_v2.0_cls_infer.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/cls"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading classification model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "Classification model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download classification model after 3 attempts"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Download recognition model (rec/english)
|
||||
if: contains(inputs.models, 'rec') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/model.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/rec/english"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading English recognition model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "Recognition model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download recognition model after 3 attempts"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Download recognition dictionary (rec/english/dict.txt)
|
||||
if: contains(inputs.models, 'rec') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
DICT_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/dict.txt"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/rec/english"
|
||||
DICT_FILE="$MODEL_DIR/dict.txt"
|
||||
|
||||
echo "Downloading English recognition dictionary from $DICT_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$DICT_FILE" "$DICT_URL"; then
|
||||
echo "Dictionary downloaded successfully ($(du -h "$DICT_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download dictionary after 3 attempts"
|
||||
rm -f "$DICT_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Verify downloaded models
|
||||
id: verify-models
|
||||
shell: bash
|
||||
run: |
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
AVAILABLE_MODELS=()
|
||||
TOTAL_SIZE=0
|
||||
|
||||
echo "Checking for PaddleOCR models in $CACHE_DIR"
|
||||
|
||||
if [ -f "$CACHE_DIR/det/model.onnx" ]; then
|
||||
SIZE=$(wc -c < "$CACHE_DIR/det/model.onnx" | tr -d ' ')
|
||||
AVAILABLE_MODELS+=("det")
|
||||
TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
|
||||
echo " ✓ Detection model: $(numfmt --to=iec-i --suffix=B $SIZE 2>/dev/null || echo $SIZE bytes)"
|
||||
fi
|
||||
|
||||
if [ -f "$CACHE_DIR/cls/model.onnx" ]; then
|
||||
SIZE=$(wc -c < "$CACHE_DIR/cls/model.onnx" | tr -d ' ')
|
||||
AVAILABLE_MODELS+=("cls")
|
||||
TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
|
||||
echo " ✓ Classification model: $(numfmt --to=iec-i --suffix=B $SIZE 2>/dev/null || echo $SIZE bytes)"
|
||||
fi
|
||||
|
||||
if [ -f "$CACHE_DIR/rec/english/model.onnx" ]; then
|
||||
SIZE=$(wc -c < "$CACHE_DIR/rec/english/model.onnx" | tr -d ' ')
|
||||
AVAILABLE_MODELS+=("rec")
|
||||
TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
|
||||
echo " ✓ Recognition model (English): $(numfmt --to=iec-i --suffix=B $SIZE 2>/dev/null || echo $SIZE bytes)"
|
||||
fi
|
||||
|
||||
if [ -f "$CACHE_DIR/rec/english/dict.txt" ]; then
|
||||
SIZE=$(wc -c < "$CACHE_DIR/rec/english/dict.txt" | tr -d ' ')
|
||||
TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
|
||||
echo " ✓ Recognition dictionary (English): $(numfmt --to=iec-i --suffix=B $SIZE 2>/dev/null || echo $SIZE bytes)"
|
||||
fi
|
||||
|
||||
if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
|
||||
echo "ERROR: No models found in cache directory after download"
|
||||
echo "available-models=" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
fi
|
||||
|
||||
AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
|
||||
echo "✓ Total cached models: ${#AVAILABLE_MODELS[@]} ($(numfmt --to=iec-i --suffix=B $TOTAL_SIZE 2>/dev/null || echo $TOTAL_SIZE bytes))"
|
||||
echo "available-models=$AVAILABLE_MODELS_STR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set cache directory output
|
||||
id: set-outputs
|
||||
shell: bash
|
||||
run: |
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
echo "cache-dir=$CACHE_DIR" >> $GITHUB_OUTPUT
|
||||
echo "PADDLE_OCR_MODEL_CACHE=$CACHE_DIR" >> $GITHUB_ENV
|
||||
echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> $GITHUB_ENV
|
||||
|
||||
- name: Export cache environment
|
||||
shell: bash
|
||||
run: |
|
||||
echo "PADDLE_OCR_MODEL_CACHE=$HOME/.cache/kreuzberg/paddle-ocr" >> $GITHUB_ENV
|
||||
echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> $GITHUB_ENV
|
||||
echo "PaddleOCR model cache configured at: $HOME/.cache/kreuzberg/paddle-ocr"
|
||||
60
.github/actions/setup-tesseract-cache/action.yml
vendored
Normal file
60
.github/actions/setup-tesseract-cache/action.yml
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
name: Setup Tesseract Cache
|
||||
description: Manages kreuzberg-tesseract build cache per architecture
|
||||
|
||||
inputs:
|
||||
label:
|
||||
description: Platform label (e.g. linux-x86_64, linux-aarch64)
|
||||
required: true
|
||||
enable-cache:
|
||||
description: Enable tesseract caching (disable for cross-arch builds)
|
||||
required: false
|
||||
default: "true"
|
||||
rust-target:
|
||||
description: Rust target triple for per-target cache cleanup
|
||||
required: false
|
||||
default: ""
|
||||
|
||||
outputs:
|
||||
cache-dir:
|
||||
description: Tesseract cache directory path
|
||||
value: ${{ steps.set-outputs.outputs.cache-dir }}
|
||||
cache-enabled:
|
||||
description: Whether caching is enabled (true/false)
|
||||
value: ${{ steps.set-outputs.outputs.cache-enabled }}
|
||||
docker-options:
|
||||
description: Docker options for passing cache env vars
|
||||
value: ${{ steps.set-outputs.outputs.docker-options }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Clean cache directories (cache disabled)
|
||||
if: inputs.enable-cache != 'true'
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/clean-dirs.sh "${{ inputs.label }}"
|
||||
|
||||
- name: Setup cache directories
|
||||
if: inputs.enable-cache == 'true'
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/setup-dirs.sh "${{ inputs.label }}"
|
||||
|
||||
- name: Cache kreuzberg-tesseract build cache
|
||||
if: inputs.enable-cache == 'true'
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
.tesseract-cache/${{ inputs.label }}
|
||||
.xdg-cache/${{ inputs.label }}
|
||||
key: kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-${{ hashFiles('crates/kreuzberg-tesseract/Cargo.toml', 'crates/kreuzberg-tesseract/build.rs') }}
|
||||
restore-keys: |
|
||||
kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-
|
||||
|
||||
- name: Clean per-target Tesseract cache
|
||||
if: inputs.rust-target != ''
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/clean-target-cache.sh "${{ inputs.rust-target }}"
|
||||
|
||||
- name: Set outputs and environment
|
||||
id: set-outputs
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/set-outputs.sh "${{ inputs.label }}" "${{ inputs.enable-cache }}"
|
||||
67
.github/dependabot.yaml
vendored
Normal file
67
.github/dependabot.yaml
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
version: 2
|
||||
|
||||
multi-ecosystem-groups:
|
||||
dependencies:
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
|
||||
updates:
|
||||
- package-ecosystem: "cargo"
|
||||
# Explicitly list root only — packages/ruby/ext and packages/r/src have
|
||||
# standalone workspaces with path deps to vendored crates that only exist
|
||||
# at build time. Dependabot cannot resolve these paths.
|
||||
directories:
|
||||
- "/"
|
||||
ignore:
|
||||
- dependency-name: "kreuzberg"
|
||||
- dependency-name: "kreuzberg-ffi"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "pip"
|
||||
directories:
|
||||
- "/"
|
||||
- "/packages/python"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "npm"
|
||||
directories:
|
||||
- "/"
|
||||
- "/crates/kreuzberg-node"
|
||||
- "/crates/kreuzberg-wasm"
|
||||
- "/packages/typescript/core"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "bundler"
|
||||
directory: "/packages/ruby"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "composer"
|
||||
directories:
|
||||
- "/"
|
||||
- "/packages/php"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "gomod"
|
||||
directory: "/packages/go/v5"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "maven"
|
||||
directory: "/packages/java"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "nuget"
|
||||
directory: "/packages/csharp"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "mix"
|
||||
directory: "/packages/elixir"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
39
.github/documentation/runners.md
vendored
Normal file
39
.github/documentation/runners.md
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
# Custom GitHub Actions Runners
|
||||
|
||||
## Available Runners
|
||||
|
||||
| Runner Label | Architecture | Size | Ephemeral | Notes |
|
||||
| -------------------------- | ------------ | ------ | --------- | ---------------------------------------------------------- |
|
||||
| `runner-small` | x86_64 | Small | No | Light tasks: linting, formatting, validation |
|
||||
| `runner-medium` | x86_64 | Medium | No | Standard CI: tests, builds |
|
||||
| `runner-medium-arm64` | arm64 | Medium | No | ARM64 builds and tests |
|
||||
| `runner-large` | x86_64 | Large | No | Heavy workloads: benchmarks, coverage, release builds |
|
||||
| `runner-large-spot` | x86_64 | Large | Yes | Cost-optimized large jobs where interruption is acceptable |
|
||||
| `runner-medium-arm64-spot` | arm64 | Medium | Yes | Cost-optimized ARM64 jobs where interruption is acceptable |
|
||||
|
||||
## Spot Runners
|
||||
|
||||
Spot runners (`*-spot`) use ephemeral cloud instances provisioned on a best-effort basis. They are significantly cheaper but can be preempted at any time if the cloud provider reclaims capacity.
|
||||
|
||||
**Use spot runners for:**
|
||||
|
||||
- Jobs that can be retried without consequence (test suites, linting)
|
||||
- Non-time-critical workloads
|
||||
- PR validation where re-runs are acceptable
|
||||
|
||||
**Do not use spot runners for:**
|
||||
|
||||
- Benchmarks (preemption and noisy-neighbor effects skew results)
|
||||
- Release builds and publishing
|
||||
- Jobs requiring consistent, reproducible timing
|
||||
|
||||
## Choosing a Runner
|
||||
|
||||
| Workload | Recommended Runner |
|
||||
| ------------------------------- | -------------------------- |
|
||||
| Linting, formatting, validation | `runner-small` |
|
||||
| Unit tests, standard builds | `runner-medium` |
|
||||
| ARM64 cross-compilation / tests | `runner-medium-arm64` |
|
||||
| Benchmarks, coverage reports | `runner-large` |
|
||||
| Non-critical large builds | `runner-large-spot` |
|
||||
| Non-critical ARM64 builds | `runner-medium-arm64-spot` |
|
||||
1244
.github/workflows/benchmarks.yaml
vendored
Normal file
1244
.github/workflows/benchmarks.yaml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
74
.github/workflows/build-node-native.yml
vendored
Normal file
74
.github/workflows/build-node-native.yml
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
name: Build Node Native
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/kreuzberg-node/**"
|
||||
- "crates/kreuzberg/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/build-node-native.yml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/kreuzberg-node/**"
|
||||
- "crates/kreuzberg/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/build-node-native.yml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: build-node-native-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Build ${{ matrix.target }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-latest
|
||||
target: x86_64-unknown-linux-gnu
|
||||
- os: ubuntu-24.04-arm
|
||||
target: aarch64-unknown-linux-gnu
|
||||
- os: macos-13
|
||||
target: x86_64-apple-darwin
|
||||
- os: macos-latest
|
||||
target: aarch64-apple-darwin
|
||||
- os: windows-latest
|
||||
target: x86_64-pc-windows-msvc
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6.0.2
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
target: ${{ matrix.target }}
|
||||
|
||||
- uses: kreuzberg-dev/actions/setup-node-workspace@v1
|
||||
with:
|
||||
node-version: "24"
|
||||
|
||||
- name: Build NAPI binding
|
||||
uses: kreuzberg-dev/actions/build-node-napi@v1
|
||||
with:
|
||||
crate-dir: crates/kreuzberg-node
|
||||
build-command: pnpm exec napi build --release --target ${{ matrix.target }} --platform
|
||||
79
.github/workflows/ci-docker.yaml
vendored
Normal file
79
.github/workflows/ci-docker.yaml
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
name: CI Docker
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-docker-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
ORT_VERSION: "1.24.2"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
name: Docker (${{ matrix.variant }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
variant: [core, full, cli]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Free disk space
|
||||
uses: kreuzberg-dev/actions/free-disk-space-linux@v1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v4
|
||||
|
||||
- name: Build Docker image
|
||||
uses: docker/build-push-action@v7
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.${{ matrix.variant }}
|
||||
push: false
|
||||
load: true
|
||||
tags: kreuzberg:${{ matrix.variant }}
|
||||
build-args: ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
|
||||
cache-from: type=gha,scope=ci-docker-${{ matrix.variant }}
|
||||
cache-to: type=gha,mode=max,scope=ci-docker-${{ matrix.variant }}
|
||||
|
||||
- name: Save Docker image
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /tmp
|
||||
docker save kreuzberg:${{ matrix.variant }} | gzip > /tmp/kreuzberg-${{ matrix.variant }}.tar.gz
|
||||
ls -lh /tmp/kreuzberg-${{ matrix.variant }}.tar.gz
|
||||
|
||||
- name: Check image size
|
||||
uses: kreuzberg-dev/actions/check-docker-image-size@v1
|
||||
with:
|
||||
image: kreuzberg:${{ matrix.variant }}
|
||||
warn-mb: ${{ matrix.variant == 'cli' && '200' || '' }}
|
||||
label: "${{ matrix.variant }} image"
|
||||
|
||||
- name: Run feature tests
|
||||
if: matrix.variant != 'cli'
|
||||
run: scripts/ci/docker/run-feature-tests.sh "${{ matrix.variant }}"
|
||||
|
||||
- name: Run configuration tests
|
||||
if: matrix.variant != 'cli'
|
||||
run: scripts/ci/docker/run-config-tests.sh "${{ matrix.variant }}"
|
||||
|
||||
- name: Run API contract tests with schemathesis
|
||||
if: matrix.variant != 'cli'
|
||||
uses: kreuzberg-dev/actions/run-api-contract-tests@v1
|
||||
with:
|
||||
image: kreuzberg:${{ matrix.variant }}
|
||||
port: "8000"
|
||||
|
||||
- name: Run CLI tests
|
||||
if: matrix.variant == 'cli'
|
||||
run: scripts/ci/docker/run-cli-tests.sh
|
||||
102
.github/workflows/ci-docs.yaml
vendored
Normal file
102
.github/workflows/ci-docs.yaml
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
name: CI Docs
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "docs/**"
|
||||
- "packages/**/README.md"
|
||||
- "crates/*/README.md"
|
||||
- "packages/python/pyproject.toml"
|
||||
- "packages/typescript/package.json"
|
||||
- "packages/ruby/kreuzberg.gemspec"
|
||||
- "packages/php/composer.json"
|
||||
- "packages/go/v5/go.mod"
|
||||
- "packages/java/pom.xml"
|
||||
- "packages/csharp/**/Kreuzberg.csproj"
|
||||
- "packages/elixir/mix.exs"
|
||||
- "packages/r/DESCRIPTION"
|
||||
- "packages/dart/pubspec.yaml"
|
||||
- "zensical.toml"
|
||||
- "mkdocs.yml"
|
||||
- "alef.toml"
|
||||
- ".github/workflows/ci-docs.yaml"
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "docs/**"
|
||||
- "packages/**/README.md"
|
||||
- "crates/*/README.md"
|
||||
- "zensical.toml"
|
||||
- "pyproject.toml"
|
||||
- "alef.toml"
|
||||
- "CHANGELOG.md"
|
||||
- ".github/workflows/ci-docs.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: ci-docs-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Lint documentation + validate snippets
|
||||
uses: kreuzberg-dev/actions/lint-docs@v1
|
||||
with:
|
||||
working-directory: .
|
||||
strict: "true"
|
||||
validate-snippets: "true"
|
||||
alef-ref: v0.19.5
|
||||
|
||||
build:
|
||||
name: Build
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build documentation
|
||||
uses: kreuzberg-dev/actions/build-docs@v1
|
||||
with:
|
||||
working-directory: .
|
||||
strict: "true"
|
||||
|
||||
- name: Upload site artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: docs-site
|
||||
path: site/
|
||||
retention-days: 1
|
||||
|
||||
deploy:
|
||||
name: Deploy
|
||||
needs: [build, lint]
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Download site artifact
|
||||
uses: actions/download-artifact@v8
|
||||
with:
|
||||
name: docs-site
|
||||
path: site/
|
||||
|
||||
- name: Upload Pages artifact
|
||||
uses: actions/upload-pages-artifact@v5
|
||||
with:
|
||||
path: site
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v5
|
||||
345
.github/workflows/ci-e2e.yaml
vendored
Normal file
345
.github/workflows/ci-e2e.yaml
vendored
Normal file
@@ -0,0 +1,345 @@
|
||||
name: CI E2E
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/**"
|
||||
- "e2e/**"
|
||||
- "fixtures/**"
|
||||
- "alef.toml"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "Taskfile.yml"
|
||||
- ".github/workflows/ci-e2e.yaml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/**"
|
||||
- "e2e/**"
|
||||
- "fixtures/**"
|
||||
- "alef.toml"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "Taskfile.yml"
|
||||
- ".github/workflows/ci-e2e.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-e2e-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
RUST_MIN_STACK: 16777216
|
||||
ORT_VERSION: "1.24.2"
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
BUILD_PROFILE: "ci"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-ffi:
|
||||
name: Build FFI (${{ matrix.target }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: ${{ matrix.os == 'windows-latest' && 120 || 60 }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-24.04-arm
|
||||
target: aarch64-unknown-linux-gnu
|
||||
- os: ubuntu-latest
|
||||
target: x86_64-unknown-linux-gnu
|
||||
- os: macos-latest
|
||||
target: aarch64-apple-darwin
|
||||
- os: windows-latest
|
||||
target: x86_64-pc-windows-msvc
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: build-ffi-${{ matrix.target }}
|
||||
target: ${{ matrix.target }}
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup OpenSSL
|
||||
uses: kreuzberg-dev/actions/setup-openssl@v1
|
||||
|
||||
- name: Build FFI library
|
||||
uses: kreuzberg-dev/actions/build-rust-ffi@v1
|
||||
with:
|
||||
crate-name: kreuzberg-ffi
|
||||
|
||||
- name: Build CLI
|
||||
uses: kreuzberg-dev/actions/build-rust-cli@v1
|
||||
with:
|
||||
package-name: kreuzberg-cli
|
||||
binary-name: kreuzberg
|
||||
extra-cargo-args: --features all
|
||||
|
||||
- name: Upload FFI artifacts
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: ffi-${{ matrix.target }}
|
||||
path: |
|
||||
target/release/libkreuzberg_ffi.*
|
||||
target/release/kreuzberg_ffi.*
|
||||
crates/kreuzberg-ffi/include/kreuzberg.h
|
||||
crates/kreuzberg-ffi/kreuzberg-ffi.pc
|
||||
crates/kreuzberg-ffi/cmake/
|
||||
target/release/kreuzberg
|
||||
target/release/kreuzberg.exe
|
||||
retention-days: 7
|
||||
if-no-files-found: error
|
||||
|
||||
e2e-tests:
|
||||
name: E2E (${{ matrix.lang }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
needs: [build-ffi]
|
||||
runs-on: ubuntu-24.04-arm
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- lang: python
|
||||
python-version: "3.13"
|
||||
test-cmd: "pip install maturin && cd packages/python && maturin develop --release && cd ../../e2e/python && python3 -m pytest tests/ -q"
|
||||
- lang: node
|
||||
node-version: "24"
|
||||
test-cmd: "cd crates/kreuzberg-node && npm run build && cd ../../e2e/node && npx vitest run"
|
||||
- lang: go
|
||||
go-version: "1.26"
|
||||
test-cmd: "cd e2e/go && go test ./... -count=1 -v"
|
||||
- lang: ruby
|
||||
ruby-version: "3.4"
|
||||
test-cmd: "cd e2e/ruby && bundle exec rspec"
|
||||
- lang: java
|
||||
java-version: "25"
|
||||
test-cmd: "cd packages/java && mvn -q package -DskipTests && cd ../../e2e/java && mvn test -q"
|
||||
- lang: csharp
|
||||
dotnet-version: "10.0.x"
|
||||
test-cmd: "cd e2e/csharp && dotnet test"
|
||||
- lang: php
|
||||
php-version: "8.4"
|
||||
test-cmd: 'cd crates/kreuzberg-php && cargo build --release && echo "extension=$(pwd)/../../target/release/libkreuzberg_php.so" | sudo tee -a "$(php -r ''echo php_ini_loaded_file();'')" >/dev/null && cd ../../e2e/php && composer install -q && vendor/bin/phpunit'
|
||||
- lang: elixir
|
||||
elixir-version: "1.19"
|
||||
otp-version: "28"
|
||||
test-cmd: "cd e2e/elixir && KREUZBERG_BUILD=true mix deps.get && KREUZBERG_BUILD=true mix test"
|
||||
- lang: wasm
|
||||
node-version: "24"
|
||||
test-cmd: 'curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh && export PATH="$HOME/.cargo/bin:$PATH" && export RUSTFLAGS=''--cfg getrandom_backend="wasm_js"'' && cd crates/kreuzberg-wasm && wasm-pack build --release --target web --out-dir ../../packages/wasm/pkg && cd ../../e2e/wasm && npm install && npm test'
|
||||
- lang: rust
|
||||
test-cmd: "cd e2e/rust && cargo test"
|
||||
- lang: r
|
||||
r-version: "4.3"
|
||||
test-cmd: "cd e2e/r && Rscript run_tests.R"
|
||||
- lang: dart
|
||||
dart-version: "3.11"
|
||||
test-cmd: "cargo build --release -p kreuzberg-dart && mkdir -p packages/dart/rust/target/release && { cp target/release/libkreuzberg_dart.* packages/dart/rust/target/release/ 2>/dev/null || true; } && cd packages/dart && dart pub get && cd ../../e2e/dart && dart pub get && dart test"  # `|| true` is scoped to the best-effort cp so a failed cargo build still fails the job
|
||||
- lang: kotlin_android
|
||||
java-version: "25"
|
||||
test-cmd: "cd e2e/kotlin_android && gradle test --no-daemon"
|
||||
- lang: swift
|
||||
swift-version: "6.0"
|
||||
test-cmd: "cd e2e/swift_e2e && swift test"
|
||||
- lang: zig
|
||||
zig-version: "0.16.0"
|
||||
test-cmd: 'FFI_ABS="$PWD/target/release" && cd e2e/zig && zig build test -Dffi_path="$FFI_ABS"'
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: e2e-${{ matrix.lang }}
|
||||
|
||||
- name: Download FFI artifacts
|
||||
uses: actions/download-artifact@v8
|
||||
with:
|
||||
name: ffi-aarch64-unknown-linux-gnu
|
||||
path: ffi-artifacts
|
||||
|
||||
- name: Stage FFI artifacts
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p target/release crates/kreuzberg-ffi/include crates/kreuzberg-ffi/cmake
|
||||
if [ -d ffi-artifacts/target/release ]; then
|
||||
cp -r ffi-artifacts/target/release/. target/release/
|
||||
fi
|
||||
if [ -d ffi-artifacts/crates/kreuzberg-ffi/include ]; then
|
||||
cp -r ffi-artifacts/crates/kreuzberg-ffi/include/. crates/kreuzberg-ffi/include/
|
||||
fi
|
||||
if [ -d ffi-artifacts/crates/kreuzberg-ffi/cmake ]; then
|
||||
cp -r ffi-artifacts/crates/kreuzberg-ffi/cmake/. crates/kreuzberg-ffi/cmake/
|
||||
fi
|
||||
if [ -f ffi-artifacts/crates/kreuzberg-ffi/kreuzberg-ffi.pc ]; then
|
||||
cp ffi-artifacts/crates/kreuzberg-ffi/kreuzberg-ffi.pc crates/kreuzberg-ffi/
|
||||
fi
|
||||
chmod +x target/release/libkreuzberg_ffi.so 2>/dev/null || true
|
||||
ls -la target/release/
|
||||
if [ -f target/release/libkreuzberg_ffi.so ]; then
|
||||
sudo cp target/release/libkreuzberg_ffi.so /usr/local/lib/
|
||||
sudo ldconfig
|
||||
fi
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup OpenSSL
|
||||
uses: kreuzberg-dev/actions/setup-openssl@v1
|
||||
|
||||
- name: Setup ONNX Runtime
|
||||
uses: ./.github/actions/setup-onnx-runtime
|
||||
with:
|
||||
ort-version: ${{ env.ORT_VERSION }}
|
||||
|
||||
- name: Setup Tesseract cache
|
||||
uses: ./.github/actions/setup-tesseract-cache
|
||||
with:
|
||||
label: e2e-${{ matrix.lang }}
|
||||
|
||||
- name: Install WASI SDK
|
||||
if: matrix.lang == 'wasm'
|
||||
uses: kreuzberg-dev/actions/install-wasi-sdk@v1
|
||||
|
||||
- name: Setup Python
|
||||
if: matrix.python-version
|
||||
uses: kreuzberg-dev/actions/setup-python-env@v1
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache-prefix: e2e-py-${{ matrix.python-version }}
|
||||
|
||||
- name: Setup Node
|
||||
if: matrix.node-version
|
||||
uses: kreuzberg-dev/actions/setup-node-workspace@v1
|
||||
|
||||
- name: Setup Go
|
||||
if: matrix.go-version
|
||||
uses: actions/setup-go@v6
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
|
||||
- name: Setup Ruby
|
||||
if: matrix.ruby-version
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: ${{ matrix.ruby-version }}
|
||||
bundler-cache: true
|
||||
working-directory: e2e/ruby
|
||||
|
||||
- name: Setup Java
|
||||
if: matrix.java-version
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
distribution: temurin
|
||||
java-version: ${{ matrix.java-version }}
|
||||
|
||||
- name: Setup Android SDK
|
||||
if: matrix.lang == 'kotlin_android'
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
api-level: 35
|
||||
build-tools-version: "35.0.0"
|
||||
|
||||
- name: Setup Gradle
|
||||
if: matrix.lang == 'kotlin_android'
|
||||
uses: kreuzberg-dev/actions/setup-gradle@v1
|
||||
with:
|
||||
gradle-version: "9.1.0"
|
||||
|
||||
- name: Setup .NET
|
||||
if: matrix.dotnet-version
|
||||
uses: actions/setup-dotnet@v5
|
||||
with:
|
||||
dotnet-version: ${{ matrix.dotnet-version }}
|
||||
|
||||
- name: Setup PHP
|
||||
if: matrix.php-version
|
||||
uses: kreuzberg-dev/actions/setup-php@v1
|
||||
with:
|
||||
php-version: ${{ matrix.php-version }}
|
||||
tools: composer
|
||||
|
||||
- name: Setup Elixir
|
||||
if: matrix.elixir-version
|
||||
uses: kreuzberg-dev/actions/setup-elixir@v1
|
||||
with:
|
||||
elixir-version: ${{ matrix.elixir-version }}
|
||||
otp-version: ${{ matrix.otp-version }}
|
||||
|
||||
- name: Setup R
|
||||
if: matrix.r-version
|
||||
uses: kreuzberg-dev/actions/setup-r@v1
|
||||
with:
|
||||
r-version: ${{ matrix.r-version }}
|
||||
|
||||
- name: Install R test packages
|
||||
if: matrix.lang == 'r'
|
||||
run: R -e 'install.packages(c("testthat","jsonlite","devtools"), repos="https://cloud.r-project.org")'
|
||||
|
||||
- name: Setup Dart
|
||||
if: matrix.dart-version
|
||||
uses: dart-lang/setup-dart@v1
|
||||
with:
|
||||
sdk: ${{ matrix.dart-version }}
|
||||
|
||||
- name: Setup Swift
|
||||
if: matrix.swift-version
|
||||
uses: kreuzberg-dev/actions/setup-swift@v1
|
||||
with:
|
||||
swift-version: ${{ matrix.swift-version }}
|
||||
|
||||
- name: Setup Zig
|
||||
if: matrix.zig-version
|
||||
uses: kreuzberg-dev/actions/setup-zig@v1
|
||||
with:
|
||||
version: ${{ matrix.zig-version }}
|
||||
|
||||
- name: Setup library paths for FFI bindings
  # Only the bindings listed here get the staged FFI library on their search paths.
  if: |
    matrix.lang == 'go' || matrix.lang == 'java' ||
    matrix.lang == 'csharp' || matrix.lang == 'elixir' ||
    matrix.lang == 'r' || matrix.lang == 'kotlin_android' ||
    matrix.lang == 'swift' || matrix.lang == 'zig'
  shell: bash
  run: |
    # Prepend the staged FFI locations. The ':' separator is added only when a
    # previous value exists; a trailing ':' would add an empty entry, which the
    # loader and pkg-config treat as the current directory.
    export PKG_CONFIG_PATH="${PWD}/crates/kreuzberg-ffi${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}"
    export LD_LIBRARY_PATH="${PWD}/target/release${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    # Persist both variables for the following steps of this job.
    echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH}" >> "$GITHUB_ENV"
    echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Install Task
|
||||
uses: kreuzberg-dev/actions/install-task@v1
|
||||
|
||||
- name: Compile Ruby native extension
|
||||
if: matrix.lang == 'ruby'
|
||||
working-directory: packages/ruby
|
||||
run: bundle install && bundle exec rake compile
|
||||
|
||||
- name: Run tests
|
||||
run: ${{ matrix.test-cmd }}
|
||||
shell: bash
|
||||
env:
|
||||
PKG_CONFIG_PATH: ${{ env.PKG_CONFIG_PATH }}
|
||||
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}
|
||||
DYLD_LIBRARY_PATH: ${{ env.DYLD_LIBRARY_PATH || '' }}
|
||||
TESSDATA_PREFIX: "/usr/share/tesseract-ocr/5/tessdata"
|
||||
112
.github/workflows/ci-gpu.yaml
vendored
Normal file
112
.github/workflows/ci-gpu.yaml
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
name: CI GPU
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-gpu-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
RUST_MIN_STACK: 16777216
|
||||
ORT_VERSION: "1.24.2"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build test binary"
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup Rust
|
||||
uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
toolchain: "1.95"
|
||||
|
||||
- name: Cache Cargo
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
~/.cargo/git
|
||||
target
|
||||
key: gpu-build-${{ runner.os }}-${{ hashFiles('Cargo.lock') }}
|
||||
restore-keys: |
|
||||
gpu-build-${{ runner.os }}-
|
||||
|
||||
- name: Build GPU test binary
|
||||
uses: kreuzberg-dev/actions/build-gpu-test-binary@v1
|
||||
with:
|
||||
package: kreuzberg
|
||||
test-name: gpu_acceleration
|
||||
features: "paddle-ocr,layout-detection,embeddings,pdf,ocr,ort-dynamic"
|
||||
output-name: gpu-acceleration-test
|
||||
|
||||
- name: Upload test binary
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: gpu-test-binary
|
||||
path: gpu-acceleration-test
|
||||
retention-days: 1
|
||||
|
||||
gpu-tests:
|
||||
name: "GPU Tests (CUDA)"
|
||||
needs: build
|
||||
runs-on: runner-gpu-l4
|
||||
timeout-minutes: 15
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Verify GPU
|
||||
run: |
|
||||
nvidia-smi || {
|
||||
echo "ERROR: nvidia-smi failed — no GPU detected"
|
||||
exit 1
|
||||
}
|
||||
echo "GPU detected:"
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
|
||||
- name: Download test binary
|
||||
uses: actions/download-artifact@v8  # same major-version pin as the other workflows
|
||||
with:
|
||||
name: gpu-test-binary
|
||||
|
||||
- name: Download ONNX Runtime (GPU/CUDA)
|
||||
uses: kreuzberg-dev/actions/setup-onnx-runtime-gpu@v1
|
||||
with:
|
||||
version: ${{ env.ORT_VERSION }}
|
||||
|
||||
- name: Setup PaddleOCR models
|
||||
uses: ./.github/actions/setup-paddle-ocr-models
|
||||
|
||||
- name: Clear stale layout model cache (self-hosted runner persistence)
|
||||
run: |
|
||||
rm -rf "$HOME/.cache/kreuzberg/layout"
|
||||
echo "Cleared layout model cache"
|
||||
|
||||
- name: Run GPU tests
|
||||
run: |
|
||||
chmod +x gpu-acceleration-test
|
||||
./gpu-acceleration-test --ignored --nocapture
|
||||
env:
|
||||
RUST_LOG: "kreuzberg=debug"
|
||||
TEST_DOCUMENTS_DIR: ${{ github.workspace }}/test_documents
|
||||
107
.github/workflows/ci-lint.yaml
vendored
Normal file
107
.github/workflows/ci-lint.yaml
vendored
Normal file
@@ -0,0 +1,107 @@
|
||||
name: CI Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-lint-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ubuntu-24.04-arm
|
||||
timeout-minutes: 60
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: lint
|
||||
|
||||
- name: Setup Python
|
||||
uses: kreuzberg-dev/actions/setup-python-env@v1
|
||||
with:
|
||||
python-version: "3.13"
|
||||
cache-prefix: lint-py
|
||||
install-command: "uv sync --group dev --no-install-project --no-install-workspace --frozen"
|
||||
|
||||
- name: Setup Node
|
||||
uses: kreuzberg-dev/actions/setup-node-workspace@v1
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v6
|
||||
with:
|
||||
go-version: "1.26"
|
||||
|
||||
- name: Setup Java
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
distribution: temurin
|
||||
java-version: "25"
|
||||
|
||||
- name: Setup Elixir
|
||||
uses: kreuzberg-dev/actions/setup-elixir@v1
|
||||
|
||||
- name: Setup Ruby
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: "3.4"
|
||||
bundler-cache: true
|
||||
working-directory: packages/ruby
|
||||
|
||||
- name: Setup PHP
|
||||
uses: kreuzberg-dev/actions/setup-php@v1
|
||||
|
||||
- name: Setup .NET
|
||||
uses: actions/setup-dotnet@v5
|
||||
with:
|
||||
dotnet-version: "10.0.x"
|
||||
|
||||
- name: Setup R
|
||||
uses: kreuzberg-dev/actions/setup-r@v1
|
||||
with:
|
||||
r-version: "release"
|
||||
install-deps: "false"
|
||||
|
||||
- name: Install Task
|
||||
uses: kreuzberg-dev/actions/install-task@v1
|
||||
|
||||
- name: Setup Helm
|
||||
uses: azure/setup-helm@v5
|
||||
|
||||
- name: Setup kubeconform
|
||||
uses: bmuschko/setup-kubeconform@v1
|
||||
|
||||
- name: Install alef CLI
|
||||
uses: kreuzberg-dev/actions/install-alef@v1
|
||||
|
||||
- name: Run all prek hooks
|
||||
uses: j178/prek-action@v2
|
||||
with:
|
||||
cache: false
|
||||
extra-args: --all-files
|
||||
|
||||
- name: Validate C header
|
||||
shell: bash
|
||||
run: |
|
||||
HEADER="crates/kreuzberg-ffi/include/kreuzberg.h"
|
||||
if [ ! -f "$HEADER" ]; then
|
||||
echo "::error::C header not found at $HEADER — run 'task alef:generate'"
|
||||
exit 1
|
||||
fi
|
||||
echo "C header verified at $HEADER"
|
||||
79
.github/workflows/ci-mobile.yaml
vendored
Normal file
79
.github/workflows/ci-mobile.yaml
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
name: CI Mobile
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/dart/**"
|
||||
- "packages/swift/**"
|
||||
- "packages/kotlin-android/**"
|
||||
- ".github/workflows/ci-mobile.yaml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/dart/**"
|
||||
- "packages/swift/**"
|
||||
- "packages/kotlin-android/**"
|
||||
- ".github/workflows/ci-mobile.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-mobile-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
# Mobile feature subsets (Android drops ORT-requiring features) leave some
|
||||
# functions only used in the full-feature graph; -A dead_code keeps the
|
||||
# cross-compile check honest about other classes of warnings without choking
|
||||
# on these.
|
||||
RUSTFLAGS: "-D warnings -A dead_code -A unpredictable-function-pointer-comparisons -A mismatched-lifetime-syntaxes"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
android-check:
|
||||
name: Android cargo check (${{ matrix.abi }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
abi: [arm64-v8a, x86_64]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: ci-mobile-android-${{ matrix.abi }}
|
||||
- uses: kreuzberg-dev/actions/setup-android-ndk@v1
|
||||
- name: cargo ndk check kreuzberg-dart
|
||||
run: cargo ndk --target ${{ matrix.abi }} --platform 21 -- check -p kreuzberg-dart
|
||||
- name: cargo ndk check kreuzberg-ffi
|
||||
run: cargo ndk --target ${{ matrix.abi }} --platform 21 -- check -p kreuzberg-ffi
|
||||
|
||||
ios-check:
|
||||
name: iOS cargo check (${{ matrix.target }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: macos-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
target: [aarch64-apple-ios, aarch64-apple-ios-sim]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
target: ${{ matrix.target }}
|
||||
cache-key-prefix: ci-mobile-ios-${{ matrix.target }}
|
||||
- name: cargo check kreuzberg-dart
|
||||
run: cargo check -p kreuzberg-dart --target ${{ matrix.target }}
|
||||
- name: cargo check kreuzberg-swift
|
||||
run: cargo check -p kreuzberg-swift --target ${{ matrix.target }}
|
||||
103
.github/workflows/ci-rust.yaml
vendored
Normal file
103
.github/workflows/ci-rust.yaml
vendored
Normal file
@@ -0,0 +1,103 @@
|
||||
name: CI Rust
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/ci-rust.yaml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/ci-rust.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-rust-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
RUST_MIN_STACK: 16777216
|
||||
ORT_VERSION: "1.24.2"
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
BUILD_PROFILE: "ci"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
rust:
|
||||
name: Rust (${{ matrix.os }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-24.04-arm
|
||||
- os: macos-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Free disk space
|
||||
if: runner.os == 'Linux'
|
||||
uses: kreuzberg-dev/actions/free-disk-space-linux@v1
|
||||
with:
|
||||
show-initial: "false"
|
||||
show-final: "true"
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: rust-${{ matrix.os }}
|
||||
use-sccache: "true"
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup OpenSSL
|
||||
uses: kreuzberg-dev/actions/setup-openssl@v1
|
||||
|
||||
- name: Setup ONNX Runtime
|
||||
uses: ./.github/actions/setup-onnx-runtime
|
||||
with:
|
||||
ort-version: ${{ env.ORT_VERSION }}
|
||||
|
||||
- name: Setup Tesseract cache
|
||||
uses: ./.github/actions/setup-tesseract-cache
|
||||
with:
|
||||
label: ${{ matrix.os }}
|
||||
|
||||
- name: Install Task
|
||||
uses: kreuzberg-dev/actions/install-task@v1
|
||||
|
||||
- name: Run clippy
|
||||
run: cargo clippy --workspace --exclude kreuzberg-ffi --exclude kreuzberg-py --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm --exclude kreuzberg-dart --exclude kreuzberg-swift --exclude kreuzberg_nif -- -D warnings
|
||||
shell: bash
|
||||
|
||||
- name: Run tests
|
||||
run: task rust:test:ci
|
||||
shell: bash
|
||||
env:
|
||||
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH || '' }}
|
||||
DYLD_LIBRARY_PATH: ${{ env.DYLD_LIBRARY_PATH || '' }}
|
||||
DYLD_FALLBACK_LIBRARY_PATH: ${{ env.DYLD_FALLBACK_LIBRARY_PATH || '' }}
|
||||
|
||||
- name: Check no-default-features
|
||||
run: cargo check -p kreuzberg --no-default-features
|
||||
shell: bash
|
||||
1303
.github/workflows/profiling.yaml
vendored
Normal file
1303
.github/workflows/profiling.yaml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
262
.github/workflows/publish-docker.yaml
vendored
Normal file
262
.github/workflows/publish-docker.yaml
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
name: Publish Docker Images
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "Release tag to build (e.g., v4.3.6)"
|
||||
required: true
|
||||
type: string
|
||||
dry_run:
|
||||
description: "Prepare artifacts without publishing"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
ref:
|
||||
description: "Git ref (branch, tag, or commit) to build; defaults to the tag"
|
||||
required: false
|
||||
type: string
|
||||
force_republish:
|
||||
description: "Force re-publish even if artifacts already exist"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
release:
|
||||
types: [published]
|
||||
repository_dispatch:
|
||||
types: [publish-docker]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ (github.event_name == 'workflow_dispatch' && (github.event.inputs.ref || github.event.inputs.tag)) || github.ref || github.run_id }}
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ORT_VERSION: "1.24.2"
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
prepare:
|
||||
name: Prepare metadata
|
||||
if: ${{ github.event_name != 'release' || !github.event.release.prerelease }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
outputs:
|
||||
tag: ${{ steps.meta.outputs.tag }}
|
||||
version: ${{ steps.meta.outputs.version }}
|
||||
ref: ${{ steps.meta.outputs.ref }}
|
||||
dry_run: ${{ steps.meta.outputs.dry_run }}
|
||||
force_republish: ${{ steps.meta.outputs.force_republish }}
|
||||
checkout_ref: ${{ steps.meta.outputs.checkout_ref }}
|
||||
target_sha: ${{ steps.meta.outputs.target_sha }}
|
||||
is_tag: ${{ steps.meta.outputs.is_tag }}
|
||||
release_docker: ${{ steps.meta.outputs.release_docker }}
|
||||
steps:
|
||||
- name: Checkout code (default)
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Resolve release metadata
|
||||
id: meta
|
||||
uses: kreuzberg-dev/actions/prepare-release-metadata@v1
|
||||
with:
|
||||
tag: ${{ inputs.tag }}
|
||||
ref: ${{ inputs.ref }}
|
||||
targets: docker
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
force-republish: ${{ inputs.force_republish }}
|
||||
|
||||
- name: Re-checkout at target ref
|
||||
if: ${{ steps.meta.outputs.checkout_ref != '' }}
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ steps.meta.outputs.checkout_ref }}
|
||||
fetch-depth: 0
|
||||
submodules: recursive
|
||||
|
||||
- name: Show metadata
|
||||
env:
|
||||
META_TAG: ${{ steps.meta.outputs.tag }}
|
||||
META_VERSION: ${{ steps.meta.outputs.version }}
|
||||
META_REF: ${{ steps.meta.outputs.ref }}
|
||||
META_DRY_RUN: ${{ steps.meta.outputs.dry_run }}
|
||||
META_FORCE_REPUBLISH: ${{ steps.meta.outputs.force_republish }}
|
||||
META_CHECKOUT_REF: ${{ steps.meta.outputs.checkout_ref }}
|
||||
META_TARGET_SHA: ${{ steps.meta.outputs.target_sha }}
|
||||
META_IS_TAG: ${{ steps.meta.outputs.is_tag }}
|
||||
META_RELEASE_DOCKER: ${{ steps.meta.outputs.release_docker }}
|
||||
run: |
|
||||
{
|
||||
echo "## Release Metadata"
|
||||
echo "- **Tag**: \`$META_TAG\`"
|
||||
echo "- **Version**: \`$META_VERSION\`"
|
||||
echo "- **Ref**: \`$META_REF\`"
|
||||
echo "- **Dry Run**: \`$META_DRY_RUN\`"
|
||||
echo "- **Force Republish**: \`$META_FORCE_REPUBLISH\`"
|
||||
echo "- **Checkout Ref**: \`$META_CHECKOUT_REF\`"
|
||||
echo "- **Target SHA**: \`$META_TARGET_SHA\`"
|
||||
echo "- **Is Tag**: \`$META_IS_TAG\`"
|
||||
echo "- **Release Docker**: \`$META_RELEASE_DOCKER\`"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
check-docker:
|
||||
name: Check if Docker image tag exists
|
||||
needs: prepare
|
||||
if: ${{ needs.prepare.outputs.release_docker == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: read
|
||||
outputs:
|
||||
core_exists: ${{ steps.core.outputs.exists }}
|
||||
full_exists: ${{ steps.full.outputs.exists }}
|
||||
cli_exists: ${{ steps.cli.outputs.exists }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ needs.prepare.outputs.checkout_ref || needs.prepare.outputs.tag }}  # prefer the resolved checkout ref (as the publish job does); fall back to the tag
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check core image tag
|
||||
id: core
|
||||
env:
|
||||
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}-core
|
||||
SUMMARY_LABEL: core
|
||||
run: scripts/publish/check-docker-tag.sh
|
||||
|
||||
- name: Check full image tag
|
||||
id: full
|
||||
env:
|
||||
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}
|
||||
SUMMARY_LABEL: full
|
||||
run: scripts/publish/check-docker-tag.sh
|
||||
|
||||
- name: Check CLI image tag
|
||||
id: cli
|
||||
env:
|
||||
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg-cli:${{ needs.prepare.outputs.version }}
|
||||
SUMMARY_LABEL: cli
|
||||
run: scripts/publish/check-docker-tag.sh
|
||||
|
||||
publish-docker:
|
||||
name: Publish Docker image (${{ matrix.variant }})
|
||||
needs:
|
||||
- prepare
|
||||
- check-docker
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 360
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- variant: core
|
||||
dockerfile: docker/Dockerfile.core
|
||||
image: ghcr.io/kreuzberg-dev/kreuzberg
|
||||
tag_suffix: "-core"
|
||||
extra_tag: "core"
|
||||
- variant: full
|
||||
dockerfile: docker/Dockerfile.full
|
||||
image: ghcr.io/kreuzberg-dev/kreuzberg
|
||||
tag_suffix: ""
|
||||
extra_tag: "latest"
|
||||
- variant: cli
|
||||
dockerfile: docker/Dockerfile.cli
|
||||
image: ghcr.io/kreuzberg-dev/kreuzberg-cli
|
||||
tag_suffix: ""
|
||||
extra_tag: "latest"
|
||||
if: ${{ needs.prepare.outputs.release_docker == 'true' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ needs.prepare.outputs.checkout_ref }}
|
||||
fetch-depth: 0
|
||||
submodules: recursive
|
||||
|
||||
- name: Free up disk space
|
||||
uses: kreuzberg-dev/actions/free-disk-space-linux@v1
|
||||
|
||||
- name: Ensure target commit
  if: ${{ needs.prepare.outputs.target_sha != '' }}
  # Pass the SHA through the environment and quote it, rather than
  # interpolating the expression into the shell command text.
  env:
    TARGET_SHA: ${{ needs.prepare.outputs.target_sha }}
  run: git checkout --progress --force "$TARGET_SHA"
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v4
|
||||
|
||||
- name: Skip because tag already exists
|
||||
if: ${{ needs.prepare.outputs.force_republish != 'true' && ((matrix.variant == 'core' && needs.check-docker.outputs.core_exists == 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists == 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists == 'true')) }}
|
||||
run: echo "Docker tag already exists for variant ${{ matrix.variant }}; skipping publish." >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Build AMD64 test image
|
||||
if: ${{ needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true') }}
|
||||
run: docker build -f ${{ matrix.dockerfile }} --build-arg ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }} -t kreuzberg-publish:${{ matrix.variant }}-test .
|
||||
|
||||
- name: Run Docker tests
|
||||
if: ${{ needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true') }}
|
||||
run: python3 scripts/ci/docker/test_docker.py --image kreuzberg-publish:${{ matrix.variant }}-test --variant ${{ matrix.variant }} --verbose
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract Docker metadata
|
||||
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
|
||||
id: docker_meta
|
||||
uses: docker/metadata-action@v6
|
||||
with:
|
||||
images: ${{ matrix.image }}
|
||||
tags: |
|
||||
type=raw,value=${{ needs.prepare.outputs.version }}${{ matrix.tag_suffix }}
|
||||
type=raw,value=${{ matrix.extra_tag }}
|
||||
|
||||
- name: Build and push image
|
||||
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
|
||||
uses: docker/build-push-action@v7
|
||||
with:
|
||||
context: .
|
||||
file: ${{ matrix.dockerfile }}
|
||||
push: true
|
||||
build-args: |
|
||||
ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
|
||||
tags: ${{ steps.docker_meta.outputs.tags }}
|
||||
labels: |
|
||||
${{ steps.docker_meta.outputs.labels }}
|
||||
org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
org.opencontainers.image.description=Kreuzberg document intelligence - ${{ matrix.variant }} variant
|
||||
org.opencontainers.image.licenses=MIT
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha,scope=publish-docker-${{ matrix.variant }}  # must match the cache-to scope, otherwise the cache written here is never read
|
||||
cache-to: type=gha,mode=max,scope=publish-docker-${{ matrix.variant }}
|
||||
|
||||
- name: Docker dry-run summary
|
||||
if: ${{ needs.prepare.outputs.dry_run == 'true' }}
|
||||
env:
|
||||
IMAGE: ${{ matrix.image }}
|
||||
VERSION: ${{ needs.prepare.outputs.version }}
|
||||
TAG_SUFFIX: ${{ matrix.tag_suffix }}
|
||||
run: scripts/publish/docker/dry-run-summary.sh
|
||||
|
||||
- name: Clean up local Docker images
|
||||
if: ${{ always() }}
|
||||
run: docker rmi kreuzberg-publish:${{ matrix.variant }}-test || true
|
||||
108
.github/workflows/publish-helm.yaml
vendored
Normal file
108
.github/workflows/publish-helm.yaml
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
name: Publish Helm Chart
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "Release tag to build (e.g., v4.3.6)"
|
||||
required: true
|
||||
type: string
|
||||
dry_run:
|
||||
description: "Prepare artifacts without publishing"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
release:
|
||||
types: [published]
|
||||
repository_dispatch:
|
||||
types: [publish-helm]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.tag) || github.ref || github.run_id }}
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish-helm:
|
||||
name: Publish Helm chart to GHCR
|
||||
if: ${{ github.event_name != 'release' || !github.event.release.prerelease }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Resolve version
|
||||
id: meta
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
TAG="${{ inputs.tag }}"
|
||||
elif [[ "${{ github.event_name }}" == "release" ]]; then
|
||||
TAG="${{ github.event.release.tag_name }}"
|
||||
elif [[ "${{ github.event_name }}" == "repository_dispatch" ]]; then
|
||||
TAG="${{ github.event.client_payload.tag }}"
|
||||
fi
|
||||
|
||||
VERSION="${TAG#v}"
|
||||
DRY_RUN="${{ inputs.dry_run || 'false' }}"
|
||||
|
||||
{
|
||||
echo "tag=${TAG}"
|
||||
echo "version=${VERSION}"
|
||||
echo "dry_run=${DRY_RUN}"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
{
|
||||
echo "## Helm Publish Metadata"
|
||||
echo "- **Tag**: \`${TAG}\`"
|
||||
echo "- **Version**: \`${VERSION}\`"
|
||||
echo "- **Dry Run**: \`${DRY_RUN}\`"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Setup Helm
|
||||
uses: azure/setup-helm@v5
|
||||
|
||||
- name: Lint chart
|
||||
run: helm lint --strict charts/kreuzberg/
|
||||
|
||||
- name: Update Chart.yaml version
|
||||
run: |
|
||||
sed -i "s/^version:.*/version: ${{ steps.meta.outputs.version }}/" charts/kreuzberg/Chart.yaml
|
||||
sed -i "s/^appVersion:.*/appVersion: \"${{ steps.meta.outputs.version }}\"/" charts/kreuzberg/Chart.yaml
|
||||
{
|
||||
echo "### Chart.yaml"
|
||||
echo '```yaml'
|
||||
cat charts/kreuzberg/Chart.yaml
|
||||
echo '```'
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Package chart
|
||||
run: |
|
||||
helm package charts/kreuzberg/ --destination .helm-packages/
|
||||
echo "### Packaged" >> "$GITHUB_STEP_SUMMARY"
|
||||
ls -lh .helm-packages/ >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
if: ${{ steps.meta.outputs.dry_run != 'true' }}
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Push chart to GHCR
|
||||
if: ${{ steps.meta.outputs.dry_run != 'true' }}
|
||||
run: |
|
||||
helm push .helm-packages/kreuzberg-${{ steps.meta.outputs.version }}.tgz oci://ghcr.io/kreuzberg-dev/charts
|
||||
echo "### Published" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "Chart pushed to \`oci://ghcr.io/kreuzberg-dev/charts/kreuzberg:${{ steps.meta.outputs.version }}\`" >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Dry-run summary
|
||||
if: ${{ steps.meta.outputs.dry_run == 'true' }}
|
||||
run: |
|
||||
echo "### Dry Run" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "Would have pushed \`kreuzberg-${{ steps.meta.outputs.version }}.tgz\` to \`oci://ghcr.io/kreuzberg-dev/charts\`" >> "$GITHUB_STEP_SUMMARY"
|
||||
46
.github/workflows/publish-pubdev.yaml
vendored
Normal file
46
.github/workflows/publish-pubdev.yaml
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
name: Publish pub.dev
|
||||
|
||||
# pub.dev OIDC trusted publishing rejects tokens originating from `release`
|
||||
# events; only `push` and `workflow_dispatch` are accepted.
|
||||
#
|
||||
# Because the kreuzberg Dart package embeds platform-specific native binaries
|
||||
# (Android JNI, iOS XCFramework, server libs for linux/macos/windows), we
|
||||
# cannot just rebuild here — those artifacts are produced by the main
|
||||
# `publish.yaml` workflow. Instead, the main workflow's `trigger-pubdev` job
|
||||
# dispatches this workflow with the run_id of the main workflow, and this
|
||||
# workflow downloads the `dart-package-assembled` artifact from that run.
|
||||
#
|
||||
# One-time setup: on pub.dev → kreuzberg package → Admin → Automated publishing,
|
||||
# set the workflow path to `.github/workflows/publish-pubdev.yaml`.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
run_id:
|
||||
description: "GitHub Actions run ID of publish.yaml that produced the dart-package-assembled artifact"
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
actions: read
|
||||
|
||||
env:
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
jobs:
|
||||
publish-pub:
|
||||
name: Publish pub.dev
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: dart-package-assembled
|
||||
path: packages/dart
|
||||
run-id: ${{ inputs.run_id }}
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- uses: kreuzberg-dev/actions/publish-pub@v1
|
||||
with:
|
||||
package-dir: packages/dart
|
||||
2345
.github/workflows/publish.yaml
vendored
Normal file
2345
.github/workflows/publish.yaml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
10
.github/workflows/validate-issues.yml
vendored
Normal file
10
.github/workflows/validate-issues.yml
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
name: Validate Issues
|
||||
|
||||
on:
|
||||
issues:
|
||||
types: [opened, edited]
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
uses: kreuzberg-dev/actions/.github/workflows/reusable-validate-issues.yml@v1
|
||||
secrets: inherit
|
||||
10
.github/workflows/validate-pr.yml
vendored
Normal file
10
.github/workflows/validate-pr.yml
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
name: Validate PR
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, edited, synchronize]
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
uses: kreuzberg-dev/actions/.github/workflows/reusable-validate-pr.yml@v1
|
||||
secrets: inherit
|
||||
296
.gitignore
vendored
Normal file
296
.gitignore
vendored
Normal file
@@ -0,0 +1,296 @@
|
||||
# Build artifacts
|
||||
target/
|
||||
build/
|
||||
dist/
|
||||
# Ad-hoc PDF/image debug binaries
|
||||
/check_pdf_oxide
|
||||
/check_*
|
||||
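# Hand-authored WASM JS distribution files (intentionally committed). NOTE: the `dist/` rule above ignores the whole directory, and Git cannot re-include a file whose parent directory is excluded, so these negations only document intent; new files here need `git add -f`.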
|
||||
!crates/kreuzberg-wasm/dist/index.js
|
||||
!crates/kreuzberg-wasm/dist/extraction/files.js
|
||||
!crates/kreuzberg-wasm/dist/ocr/enabler.js
|
||||
.lycheecache
|
||||
dist-musl/
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
*.pdb
|
||||
*.node
|
||||
*.whl
|
||||
*.tar.gz
|
||||
*.gem
|
||||
*.jar
|
||||
wheels/
|
||||
**/bin/
|
||||
!**/src/bin/
|
||||
**/obj/
|
||||
|
||||
# Rust
|
||||
*.rs.bk
|
||||
*.profraw
|
||||
rust-coverage.lcov
|
||||
rust_out
|
||||
|
||||
# Python
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.egg-info/
|
||||
__pycache__/
|
||||
.venv/
|
||||
.mypy_cache/
|
||||
.ruff_cache/
|
||||
.pytest_cache/
|
||||
.hypothesis/
|
||||
.coverage
|
||||
.coverage*
|
||||
htmlcov/
|
||||
share/python-wheels/
|
||||
prompt_template.egg-info/
|
||||
packages/python/kreuzberg/kreuzberg-cli
|
||||
|
||||
# Node.js / WASM
|
||||
node_modules/
|
||||
*.tsbuildinfo
|
||||
npm-debug.log*
|
||||
pnpm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
# NAPI-RS generated files (index.d.ts is committed for TypeScript type resolution)
|
||||
crates/kreuzberg-node/index.js
|
||||
crates/kreuzberg-node/*.tgz
|
||||
.wrangler/
|
||||
.pnpm-store/
|
||||
|
||||
# Ruby
|
||||
packages/ruby/.bundle/
|
||||
packages/ruby/vendor/
|
||||
!packages/ruby/vendor/rb-sys/
|
||||
!vendor/rb-sys/build/
|
||||
!vendor/rb-sys/build/**
|
||||
*.Cache
|
||||
|
||||
# PHP
|
||||
vendor/
|
||||
php-kreuzberg.ini
|
||||
packages/php/php-kreuzberg.ini
|
||||
packages/php/tests.xml
|
||||
packages/php/var/
|
||||
.php-cs-fixer.cache
|
||||
|
||||
# Java / C# / .NET
|
||||
*.suo
|
||||
*.user
|
||||
*.cscfg
|
||||
*.cobertura.xml
|
||||
TestResults/
|
||||
|
||||
# Generated test apps (e2e-generator --mode published)
|
||||
test_apps/
|
||||
|
||||
# R
|
||||
packages/r/src/.cargo/
|
||||
.Rhistory
|
||||
.RData
|
||||
|
||||
# Elixir
|
||||
.nox/
|
||||
|
||||
# Go
|
||||
packages/go/v*/install
|
||||
|
||||
# C FFI
|
||||
crates/kreuzberg-ffi/tests/c/test_*
|
||||
!crates/kreuzberg-ffi/tests/c/test_*.c
|
||||
crates/kreuzberg-ffi/tests/c/*.o
|
||||
crates/kreuzberg-ffi/tests/c/*.dSYM/
|
||||
e2e/c/test_*
|
||||
!e2e/c/test_*.c
|
||||
e2e/c/*.o
|
||||
e2e/c/*.dSYM/
|
||||
|
||||
# Generated pkg-config
|
||||
*.pc
|
||||
|
||||
# Coverage
|
||||
coverage/
|
||||
coverage.info
|
||||
coverage.json
|
||||
coverage.lcov
|
||||
coverage.out
|
||||
coverage.xml
|
||||
packages/*/.coverage
|
||||
packages/*/coverage.json
|
||||
packages/*/coverage.out
|
||||
packages/*/htmlcov/
|
||||
packages/*/test-results/
|
||||
packages/*/TestResults/
|
||||
|
||||
# IDE / Editor
|
||||
.idea/
|
||||
.vscode/
|
||||
.run/
|
||||
.DS_Store
|
||||
|
||||
# CI / Tools
|
||||
.cache/
|
||||
.tmp/
|
||||
*temp/
|
||||
*.temp
|
||||
*.tmp
|
||||
*.log
|
||||
.mvn/
|
||||
.tox/
|
||||
.artifacts/
|
||||
.remote-cache/
|
||||
.rumdl_cache/
|
||||
docker-logs/
|
||||
|
||||
# Benchmarks
|
||||
benchmark-results/
|
||||
benchmark_results.json
|
||||
benchmarks/results/
|
||||
!benchmarks/baselines/
|
||||
aggregated-results/
|
||||
results/
|
||||
reports/
|
||||
tests/benchmarks/results/
|
||||
profiling-results/
|
||||
profiling-results-*/
|
||||
|
||||
# Local perf-iteration artifacts (flamegraphs/ is committed; intermediate outputs are not)
|
||||
profile.json.gz
|
||||
profile.pb.gz
|
||||
*.heap
|
||||
bench/
|
||||
|
||||
# Docs
|
||||
docs/_build/
|
||||
docs/build/
|
||||
docs/benchmarks/charts/
|
||||
site/
|
||||
|
||||
# Elixir build artifacts
|
||||
packages/elixir/_build/
|
||||
packages/elixir/deps/
|
||||
packages/elixir/native/kreuzberg_nif/Cargo.lock
|
||||
e2e/elixir/_build/
|
||||
e2e/elixir/deps/
|
||||
e2e/elixir/mix.lock
|
||||
e2e/elixir/config/
|
||||
test_documents/_build/
|
||||
test_documents/deps/
|
||||
|
||||
# Maven plugin artifacts
|
||||
*.versionsBackup
|
||||
|
||||
# Gradle (Kotlin e2e)
|
||||
e2e/kotlin/.gradle/
|
||||
e2e/kotlin/build/
|
||||
e2e/kotlin_android/.gradle/
|
||||
e2e/kotlin_android/build/
|
||||
|
||||
# Ruby native build artifacts
|
||||
packages/ruby/lib/*.bundle
|
||||
packages/ruby/tmp/
|
||||
|
||||
# Tests
|
||||
test_report.json
|
||||
tests/e2e/logs/
|
||||
tests/e2e/test_report.json
|
||||
temp-test-*
|
||||
packages/csharp/test_html_debug.cs
|
||||
|
||||
# AI / MCP / Agent config
|
||||
.claude/
|
||||
.codex/
|
||||
.gemini/
|
||||
.kreuzberg/
|
||||
.mcp.json
|
||||
.playwright-mcp/
|
||||
.fastembed_cache/
|
||||
*/.fastembed_cache/
|
||||
AGENTS.md
|
||||
CLAUDE.md
|
||||
GEMINI.md
|
||||
.cursorrules
|
||||
.windsurfrules
|
||||
[Tt][Oo][Dd][Oo]*
|
||||
|
||||
# Git worktrees
|
||||
.worktrees/
|
||||
|
||||
# Misc
|
||||
.env
|
||||
output.txt
|
||||
examples.txt
|
||||
requirements.txt
|
||||
docker-compose.yaml
|
||||
/tmp/kreuzberg-docker-test-results.json
|
||||
tools/benchmark-harness/datasets/
|
||||
tools/benchmark-harness/vendored/docling/md/iso_21111_10.md
|
||||
visualizations/
|
||||
|
||||
|
||||
# Additional generated artifacts
|
||||
.alef/
|
||||
*.pyd
|
||||
.gems/
|
||||
vendor/bundle/
|
||||
*.h.bak
|
||||
*.test
|
||||
*.class
|
||||
bin/
|
||||
obj/
|
||||
*.nupkg
|
||||
pkg/
|
||||
|
||||
|
||||
# Local dev artifacts
|
||||
docs/demo-dev.html
|
||||
docs/serve.json
|
||||
demo-loaded.png
|
||||
scratch/
|
||||
|
||||
|
||||
.php-cs-fixer.*
|
||||
|
||||
e2e/zig/.zig-cache/
|
||||
|
||||
# Swift e2e build artifacts (Swift Package Manager)
|
||||
e2e/swift/.build/
|
||||
e2e/swift/.swiftpm/
|
||||
e2e/swift/Package.resolved
|
||||
e2e/swift_e2e/.build/
|
||||
e2e/swift_e2e/.swiftpm/
|
||||
e2e/swift_e2e/Package.resolved
|
||||
|
||||
# Dart e2e build artifacts
|
||||
e2e/dart/.dart_tool/
|
||||
e2e/dart/build/
|
||||
e2e/dart/pubspec.lock
|
||||
|
||||
# Dart FRB codegen cache (incremental build optimization)
|
||||
packages/dart/rust/.frb_codegen_hash
|
||||
|
||||
# Go e2e build artifacts
|
||||
e2e/go/vendor/
|
||||
|
||||
erl_crash.dump
|
||||
|
||||
|
||||
.gradle/
|
||||
|
||||
packages/go/v5/.lib/
|
||||
|
||||
.build/
|
||||
|
||||
packages/r/src/*.o
|
||||
|
||||
# BEGIN ai-rulez (DO NOT EDIT - managed by ai-rulez)
|
||||
.agents/
|
||||
.cursor/
|
||||
.github/agents/
|
||||
.github/commands/
|
||||
.github/copilot-instructions.md
|
||||
.github/skills/
|
||||
# END ai-rulez
|
||||
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
[submodule "test_documents"]
|
||||
path = test_documents
|
||||
url = https://github.com/kreuzberg-dev/test_documents.git
|
||||
157
.golangci.yml
Normal file
157
.golangci.yml
Normal file
@@ -0,0 +1,157 @@
|
||||
version: "2"
|
||||
|
||||
run:
|
||||
timeout: 5m
|
||||
issues-exit-code: 1
|
||||
tests: true
|
||||
concurrency: 4
|
||||
modules-download-mode: readonly
|
||||
allow-serial-runners: false
|
||||
allow-parallel-runners: true
|
||||
|
||||
linters:
|
||||
default: none
|
||||
enable:
|
||||
- errcheck
|
||||
- govet
|
||||
- ineffassign
|
||||
- staticcheck
|
||||
- unused
|
||||
- revive
|
||||
- gocyclo
|
||||
- goconst
|
||||
- gocritic
|
||||
- gosec
|
||||
- misspell
|
||||
- nakedret
|
||||
settings:
|
||||
errcheck:
|
||||
check-type-assertions: true
|
||||
check-blank: true
|
||||
exclude-functions:
|
||||
- (net/http.ResponseWriter).Write
|
||||
- (io.Closer).Close
|
||||
- fmt.Fprintf
|
||||
- fmt.Printf
|
||||
- fmt.Println
|
||||
- os.Setenv
|
||||
- os.Unsetenv
|
||||
goconst:
|
||||
min-len: 3
|
||||
min-occurrences: 3
|
||||
gocyclo:
|
||||
min-complexity: 50
|
||||
gocritic:
|
||||
disabled-checks:
|
||||
- dupSubExpr
|
||||
gosec:
|
||||
excludes:
|
||||
- G101 # Skip the hardcoded-credentials check (too many false positives)
|
||||
govet:
|
||||
enable-all: true
|
||||
disable:
|
||||
- shadow
|
||||
misspell:
|
||||
locale: US
|
||||
nakedret:
|
||||
max-func-lines: 30
|
||||
revive:
|
||||
confidence: 0.8
|
||||
severity: warning
|
||||
enable-all-rules: false
|
||||
rules:
|
||||
- name: blank-imports
|
||||
- name: context-keys-type
|
||||
- name: time-naming
|
||||
- name: var-declaration
|
||||
- name: unexported-return
|
||||
- name: errorf
|
||||
- name: context-as-argument
|
||||
- name: dot-imports
|
||||
- name: error-return
|
||||
- name: error-strings
|
||||
- name: error-naming
|
||||
- name: if-return
|
||||
- name: increment-decrement
|
||||
- name: var-naming
|
||||
- name: range
|
||||
- name: receiver-naming
|
||||
- name: indent-error-flow
|
||||
- name: exported
|
||||
disabled: true
|
||||
- name: package-comments
|
||||
disabled: true
|
||||
exclusions:
|
||||
generated: lax
|
||||
rules:
|
||||
- linters:
|
||||
- goconst
|
||||
path: _test\.go
|
||||
- linters:
|
||||
- gocyclo
|
||||
path: _test\.go
|
||||
- linters:
|
||||
- gosec
|
||||
path: _test\.go
|
||||
- linters:
|
||||
- revive
|
||||
path: _test\.go
|
||||
text: "context-as-argument"
|
||||
- linters:
|
||||
- goconst
|
||||
- revive
|
||||
- errcheck
|
||||
- govet
|
||||
path: _test\.go
|
||||
text: "unusedwrite:"
|
||||
- linters:
|
||||
- govet
|
||||
text: "unsafeptr:"
|
||||
- linters:
|
||||
- govet
|
||||
text: "fieldalignment:"
|
||||
- linters:
|
||||
- errcheck
|
||||
path: _test\.go
|
||||
paths:
|
||||
- vendor
|
||||
- build
|
||||
- deployments
|
||||
- third_party$
|
||||
- builtin$
|
||||
- examples$
|
||||
- tools/benchmark-harness/scripts
|
||||
|
||||
issues:
|
||||
max-issues-per-linter: 0
|
||||
max-same-issues: 0
|
||||
uniq-by-line: true
|
||||
new: false
|
||||
exclude:
|
||||
- 'Error return value of `\(\*github\.com/goccy/go-json\.Encoder\)\.Encode` is not checked'
|
||||
- 'Error return value of `w\.Write` is not checked'
|
||||
- 'Error return value of `resp\.Body\.Close` is not checked'
|
||||
- 'Error return value of `res\.Body\.Close` is not checked'
|
||||
- 'Error return value of `r\.Body\.Read` is not checked'
|
||||
- 'Error return value of `os\.Setenv` is not checked'
|
||||
- 'Error return value of `os\.Unsetenv` is not checked'
|
||||
- 'shadow: declaration of "err" shadows declaration'
|
||||
- "unusedwrite: unused write to field"
|
||||
- 'Error return value of `c\.provider\.Delete` is not checked'
|
||||
- 'Error return value of `provider\.Close` is not checked'
|
||||
- 'Error return value of `natsClient\.Close` is not checked'
|
||||
- 'Error return value of `cacheProvider\.Close` is not checked'
|
||||
- 'Error return value of `processor\.Close` is not checked'
|
||||
- 'Error return value of `sub\.Unsubscribe` is not checked'
|
||||
- 'Error return value of `json\.Marshal` is not checked'
|
||||
- 'Error return value of `strconv\.'
|
||||
- 'Error return value of `fmt\.Sscanf` is not checked'
|
||||
- "Error return value is not checked"
|
||||
|
||||
formatters:
|
||||
exclusions:
|
||||
generated: lax
|
||||
paths:
|
||||
- third_party$
|
||||
- builtin$
|
||||
- examples$
|
||||
61
.lychee.toml
Normal file
61
.lychee.toml
Normal file
@@ -0,0 +1,61 @@
|
||||
# Configuration for the lychee link checker.
|
||||
# https://github.com/lycheeverse/lychee
|
||||
|
||||
# root_dir for resolving root-relative links is set dynamically in
|
||||
# `.task/tools/docs.yml` via `--root-dir "$PWD/docs"` — must be absolute,
|
||||
# so we do not hardcode it here.
|
||||
|
||||
# Exclude common patterns that produce false positives
|
||||
exclude = [
|
||||
# Localhost / loopback addresses
|
||||
"localhost",
|
||||
"127\\.0\\.0\\.1",
|
||||
"0\\.0\\.0\\.0",
|
||||
|
||||
# Placeholder / example domains
|
||||
"example\\.com",
|
||||
"example\\.org",
|
||||
|
||||
# Template variables (e.g. ${var}, {{ var }})
|
||||
"\\$\\{",
|
||||
"\\{\\{",
|
||||
|
||||
# GitHub edit links (require auth)
|
||||
"github\\.com/.*/edit/",
|
||||
|
||||
# GitHub raw links for new/renamed files (404 until pushed)
|
||||
"github\\.com/.*/raw/",
|
||||
|
||||
# Docs site self-references (404 until deployed, or unreachable from CI)
|
||||
"kreuzberg\\.dev",
|
||||
|
||||
# PyPI project pages (rate-limited)
|
||||
"pypi\\.org/project/",
|
||||
|
||||
# crates.io (rate-limited)
|
||||
"crates\\.io/crates/",
|
||||
|
||||
# npm (rate-limited)
|
||||
"npmjs\\.com/package/",
|
||||
]
|
||||
|
||||
# Accept these HTTP status codes as valid
|
||||
accept = [200, 204, 301, 302, 429]
|
||||
|
||||
# Connection timeout in seconds
|
||||
timeout = 30
|
||||
|
||||
# Max retries per link
|
||||
max_retries = 3
|
||||
|
||||
# Max concurrent requests
|
||||
max_concurrency = 10
|
||||
|
||||
# Do not check email addresses
|
||||
include_mail = false
|
||||
|
||||
# Do not require HTTPS
|
||||
require_https = false
|
||||
|
||||
# Cache results to speed up repeated runs
|
||||
cache = true
|
||||
28
.oxfmtrc.json
Normal file
28
.oxfmtrc.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"printWidth": 120,
|
||||
"useTabs": true,
|
||||
"tabWidth": 4,
|
||||
"semi": true,
|
||||
"singleQuote": false,
|
||||
"trailingComma": "all",
|
||||
"ignorePatterns": [
|
||||
"**/*.md",
|
||||
"**/*.mdx",
|
||||
"**/dist/**",
|
||||
"**/coverage/**",
|
||||
"**/node_modules/**",
|
||||
"**/pkg/**",
|
||||
"**/.next/**",
|
||||
"**/*.min.js"
|
||||
],
|
||||
"overrides": [
|
||||
{
|
||||
"files": ["**/*.json", "**/*.jsonc"],
|
||||
"options": {
|
||||
"useTabs": true,
|
||||
"tabWidth": 4,
|
||||
"trailingComma": "none"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
56
.oxlintrc.json
Normal file
56
.oxlintrc.json
Normal file
@@ -0,0 +1,56 @@
|
||||
{
|
||||
"plugins": ["typescript"],
|
||||
"env": {
|
||||
"browser": true,
|
||||
"esnext": true,
|
||||
"node": true
|
||||
},
|
||||
"globals": {
|
||||
"describe": "readonly",
|
||||
"it": "readonly",
|
||||
"test": "readonly",
|
||||
"expect": "readonly",
|
||||
"beforeEach": "readonly",
|
||||
"afterEach": "readonly",
|
||||
"beforeAll": "readonly",
|
||||
"afterAll": "readonly",
|
||||
"vi": "readonly"
|
||||
},
|
||||
"categories": {
|
||||
"correctness": "error",
|
||||
"suspicious": "error",
|
||||
"perf": "warn"
|
||||
},
|
||||
"ignorePatterns": [
|
||||
"**/_generated/**",
|
||||
"**/dist/**",
|
||||
"**/coverage/**",
|
||||
"**/node_modules/**",
|
||||
"**/pkg/**",
|
||||
"**/.next/**",
|
||||
"**/*.min.js",
|
||||
"docs/snippets/**/*.ts"
|
||||
],
|
||||
"rules": {
|
||||
"no-console": "off",
|
||||
"no-debugger": "error",
|
||||
"prefer-const": "error",
|
||||
"eqeqeq": ["error", "always", { "null": "ignore" }],
|
||||
"no-unused-vars": "off",
|
||||
"no-underscore-dangle": "off",
|
||||
"@typescript-eslint/no-unused-vars": "warn",
|
||||
"@typescript-eslint/no-explicit-any": "error",
|
||||
"@typescript-eslint/no-non-null-assertion": "error",
|
||||
"@typescript-eslint/consistent-type-imports": "error"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"files": ["**/*.spec.ts", "**/*.spec.tsx", "**/*.test.ts", "**/*.test.tsx", "**/tests/**"],
|
||||
"rules": {
|
||||
"@typescript-eslint/no-explicit-any": "off",
|
||||
"@typescript-eslint/no-non-null-assertion": "off",
|
||||
"no-unused-vars": "off"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
201
.pre-commit-config.yaml
Normal file
201
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,201 @@
|
||||
default_install_hook_types:
|
||||
- pre-commit
|
||||
- commit-msg
|
||||
default_stages:
|
||||
- pre-commit
|
||||
exclude: ^docs/snippets/|vendor/|node_modules/|target/|invalid*|dist/|artifacts/|test_documents/|scripts/ci/|^e2e/|^packages/|^crates/kreuzberg-(py|node|ffi|php|wasm)/|^docs/reference/api-|\.cache/|\.venv/|rust-vendor/
|
||||
repos:
|
||||
- repo: https://github.com/Goldziher/gitfluff
|
||||
rev: v0.8.0
|
||||
hooks:
|
||||
- id: gitfluff-lint
|
||||
args:
|
||||
- --write
|
||||
stages:
|
||||
- commit-msg
|
||||
- repo: https://github.com/kreuzberg-dev/pre-commit-hooks
|
||||
rev: v1.2.3
|
||||
hooks:
|
||||
- id: ai-rulez-generate
|
||||
- id: gh-actions-updater
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-merge-conflict
|
||||
- id: check-added-large-files
|
||||
exclude: uv.lock|docs/benchmarks/app/
|
||||
- id: detect-private-key
|
||||
- id: check-json
|
||||
- id: check-yaml
|
||||
args:
|
||||
- --allow-multiple-documents
|
||||
- --unsafe
|
||||
exclude: ^charts/
|
||||
- id: check-toml
|
||||
- id: check-case-conflict
|
||||
- id: pyproject-fmt
|
||||
- id: ruff
|
||||
- id: ruff-format
|
||||
- id: cargo-sort
|
||||
exclude: ^(crates/kreuzberg-(wasm|node|py|php|ffi)|packages/(dart|swift)/rust)/Cargo\.toml$
|
||||
- id: cargo-fmt
|
||||
args:
|
||||
- --all
|
||||
- id: cargo-check
|
||||
args:
|
||||
- --workspace
|
||||
- --exclude=kreuzberg-ffi
|
||||
- --exclude=kreuzberg-py
|
||||
- --exclude=kreuzberg-node
|
||||
- --exclude=kreuzberg-php
|
||||
- --exclude=kreuzberg-wasm
|
||||
- --exclude=kreuzberg-dart
|
||||
- --exclude=kreuzberg-swift
|
||||
- --all-features
|
||||
- --all-targets
|
||||
- id: cargo-clippy
|
||||
args:
|
||||
- --fix
|
||||
- --allow-dirty
|
||||
- --allow-staged
|
||||
- --workspace
|
||||
- --exclude=kreuzberg-ffi
|
||||
- --exclude=kreuzberg-py
|
||||
- --exclude=kreuzberg-node
|
||||
- --exclude=kreuzberg-php
|
||||
- --exclude=kreuzberg-wasm
|
||||
- --exclude=kreuzberg-dart
|
||||
- --exclude=kreuzberg-swift
|
||||
- --all-features
|
||||
- --all-targets
|
||||
- --
|
||||
- -D
|
||||
- warnings
|
||||
- id: cargo-machete
|
||||
args:
|
||||
- crates/
|
||||
- packages/
|
||||
- tools/
|
||||
exclude: ^e2e/
|
||||
# - id: alef-docs-fresh # TEMP DISABLED — Step 0 lockdown active
|
||||
- id: cargo-deny
|
||||
args:
|
||||
- check
|
||||
- id: shfmt
|
||||
args:
|
||||
- -w
|
||||
- -i
|
||||
- '2'
|
||||
- id: shellcheck
|
||||
args:
|
||||
- -x
|
||||
- id: clang-format
|
||||
args:
|
||||
- --style=file
|
||||
files: ^crates/kreuzberg-ffi/tests/c/
|
||||
- id: clang-tidy
|
||||
files: ^crates/kreuzberg-ffi/tests/c/
|
||||
- id: checkstyle
|
||||
args:
|
||||
- -c
|
||||
- packages/java/checkstyle.xml
|
||||
- -p
|
||||
- packages/java/checkstyle.properties
|
||||
exclude: ^(\.mvn/wrapper/|e2e/|tools/e2e-generator/e2e/|crates/kreuzberg-wasm/e2e/|packages/java/src/)
|
||||
- id: javadoc-lint
|
||||
files: ^packages/java/src/main/.*\.java$
|
||||
- id: java-verify
|
||||
files: ^packages/java/
|
||||
- id: mypy
|
||||
files: ^(lib|src|kreuzberg|tools)/.*\.py$
|
||||
exclude: e2e/|tests/|scripts/
|
||||
- id: pydocstyle
|
||||
files: ^packages/python/.*\.py$
|
||||
exclude: ^packages/python/(tests/|.*_native\.py$)
|
||||
- id: go-fmt
|
||||
exclude: ^(e2e/|test_apps/|tools/)
|
||||
- id: golangci-lint
|
||||
exclude: ^(e2e/|test_apps/|tools/)
|
||||
- id: govulncheck
|
||||
exclude: ^(e2e/|test_apps/|tools/)
|
||||
- id: rubocop
|
||||
files: ^packages/ruby/.*\.rb$
|
||||
- id: rubocop-lint
|
||||
files: ^packages/ruby/.*\.(rb|rbs)$
|
||||
- id: steep
|
||||
files: ^packages/ruby/.*\.(rb|rbs)$
|
||||
- id: dotnet-format
|
||||
files: ^packages/csharp/.*\.cs$
|
||||
- id: dotnet-format-check
|
||||
files: ^packages/csharp/.*\.cs$
|
||||
- id: cs-xmldoc-lint
|
||||
files: ^packages/csharp/Kreuzberg/.*\.cs$
|
||||
- id: php-cs-fixer
|
||||
files: ^packages/php/.*\.php$
|
||||
- id: phpstan
|
||||
files: ^packages/php/.*\.php$
|
||||
- id: phpdoc-lint
|
||||
files: ^packages/php/src/.*\.php$
|
||||
- id: mix-format
|
||||
files: ^packages/elixir/
|
||||
- id: mix-credo
|
||||
files: ^packages/elixir/
|
||||
- id: roxygen2-check
|
||||
files: ^packages/r/R/.*\.R$
|
||||
- id: air-format
|
||||
files: ^packages/r/.*\.[Rr]$
|
||||
- id: air-check
|
||||
files: ^packages/r/.*\.[Rr]$
|
||||
- id: lintr
|
||||
files: ^packages/r/.*\.[Rr]$
|
||||
- id: ktfmt
|
||||
args:
|
||||
- --kotlinlang-style
|
||||
files: ^packages/kotlin-android/.*\.kts?$
|
||||
- id: detekt
|
||||
args:
|
||||
- --build-upon-default-config
|
||||
files: ^packages/kotlin-android/.*\.kts?$
|
||||
- id: ktlint
|
||||
files: ^packages/kotlin-android/.*\.kts?$
|
||||
- id: swift-format
|
||||
files: ^packages/swift/.*\.swift$
|
||||
- id: swiftlint
|
||||
files: ^packages/swift/.*\.swift$
|
||||
- id: dart-format
|
||||
files: ^packages/dart/.*\.dart$
|
||||
- id: dart-analyze
|
||||
files: ^packages/dart/.*\.dart$
|
||||
- id: dart-doc-lint
|
||||
files: ^packages/dart/lib/.*\.dart$
|
||||
exclude: ^packages/dart/lib/src/rust/
|
||||
- id: zig-fmt
|
||||
files: ^packages/zig/.*\.zig$
|
||||
- id: zig-build-check
|
||||
files: ^packages/zig/.*\.zig$
|
||||
- id: helm-lint
|
||||
files: ^charts/.*/(Chart\.yaml|templates/.+\.ya?ml)$
|
||||
- id: kubeconform
|
||||
args:
|
||||
- -strict
|
||||
- -summary
|
||||
files: ^charts/.*/templates/.+\.ya?ml$
|
||||
exclude: ^charts/.*/(values|Chart)\.ya?ml$
|
||||
- id: textlint
|
||||
files: ^docs/.*\.md$
|
||||
exclude: ^docs/(reference/|api-|snippets/|migration/)
|
||||
args:
|
||||
- --fix
|
||||
- --config
|
||||
- .textlintrc.json
|
||||
- id: oxlint
|
||||
- id: oxfmt
|
||||
- id: cppcheck
|
||||
- id: rumdl-fmt
|
||||
exclude: ^templates/
|
||||
- id: cpd
|
||||
# - id: alef-verify # TEMP DISABLED — Step 0 lockdown active
|
||||
# - id: alef-sync-versions # TEMP DISABLED — Step 0 lockdown active
|
||||
- id: actionlint
|
||||
- id: typos
|
||||
args:
|
||||
- --force-exclude
|
||||
47
.rumdl.toml
Normal file
47
.rumdl.toml
Normal file
@@ -0,0 +1,47 @@
|
||||
# rumdl — Rust-based markdown linter
|
||||
# https://github.com/rvben/rumdl
|
||||
|
||||
respect-gitignore = true
|
||||
exclude = [
|
||||
"node_modules",
|
||||
"target",
|
||||
"dist",
|
||||
"vendor",
|
||||
# Snippets carry alef snippet-runner directives (e.g. <!-- snippet:skip -->)
|
||||
# that must sit directly above the fence. Reflowing them with rumdl breaks
|
||||
# the directive contract.
|
||||
"docs/snippets",
|
||||
]
|
||||
|
||||
# MD013: Disable line-length enforcement (tables and code blocks can be long)
|
||||
# MD041: Don't require first line to be an H1
|
||||
# MD046: Disable code block style — Zensical tabs/admonitions indent fenced
|
||||
# blocks, which rumdl misidentifies as indented code blocks
|
||||
# MD051: Disable cross-file link fragment checking (incompatible with Zensical
|
||||
# HTML processing — Zensical strips <span> tags from heading IDs)
|
||||
# MD013: Line length (tables/code can be long)
|
||||
# MD033: Inline HTML (Zensical uses HTML extensively)
|
||||
# MD036: Emphasis as heading (intentional style in docs/READMEs)
|
||||
# MD041: First line H1 not required
|
||||
# MD046: Code block style (Zensical tabs indent fenced blocks)
|
||||
# MD051: Link fragment checking (incompatible with Zensical anchor generation)
|
||||
# MD076: Blank lines between list items (intentional formatting in READMEs)
|
||||
# MD030: Spaces after list markers (MkDocs Material grid cards require 3 spaces)
|
||||
# MD035: Horizontal rule style (grid cards use indented --- as card separators)
|
||||
disable = [
|
||||
"MD012",
|
||||
"MD013",
|
||||
"MD024",
|
||||
"MD030",
|
||||
"MD033",
|
||||
"MD035",
|
||||
"MD036",
|
||||
"MD041",
|
||||
"MD046",
|
||||
"MD051",
|
||||
"MD076",
|
||||
]
|
||||
|
||||
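# MD024: Allow duplicate heading names if they are not siblings (NOTE: MD024 is also listed in `disable` above, so this setting only takes effect once it is removed from that list)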
|
||||
[MD024]
|
||||
siblings_only = true
|
||||
3
.shellcheckrc
Normal file
3
.shellcheckrc
Normal file
@@ -0,0 +1,3 @@
|
||||
# Disable SC1091: Not following sourced files
|
||||
# This is expected for shared utility libraries in scripts/lib/
|
||||
disable=SC1091
|
||||
89
.task/config/platforms.yml
Normal file
89
.task/config/platforms.yml
Normal file
@@ -0,0 +1,89 @@
|
||||
version: "3"
|
||||
internal: true
|
||||
|
||||
includes:
|
||||
vars: ./vars.yml
|
||||
|
||||
vars:
|
||||
EXE_EXT:
|
||||
sh: |
|
||||
if [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]] || [[ "$OSTYPE" == "win32" ]]; then
|
||||
echo ".exe"
|
||||
else
|
||||
echo ""
|
||||
fi
|
||||
|
||||
LIB_EXT:
|
||||
sh: |
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
echo "dylib"
|
||||
elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]] || [[ "$OSTYPE" == "win32" ]]; then
|
||||
echo "dll"
|
||||
else
|
||||
echo "so"
|
||||
fi
|
||||
|
||||
LIB_PREFIX:
|
||||
sh: |
|
||||
if [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]] || [[ "$OSTYPE" == "win32" ]]; then
|
||||
echo ""
|
||||
else
|
||||
echo "lib"
|
||||
fi
|
||||
|
||||
RUST_TARGET:
|
||||
sh: |
|
||||
ARCH=$(uname -m)
|
||||
OS_TYPE="$OSTYPE"
|
||||
case "$ARCH" in
|
||||
x86_64|x64)
|
||||
ARCH_STR="x86_64"
|
||||
;;
|
||||
aarch64|arm64)
|
||||
ARCH_STR="aarch64"
|
||||
;;
|
||||
armv7l|armv7)
|
||||
ARCH_STR="armv7"
|
||||
;;
|
||||
*)
|
||||
ARCH_STR="$ARCH"
|
||||
;;
|
||||
esac
|
||||
|
||||
if [[ "$OS_TYPE" == "darwin"* ]]; then
|
||||
echo "${ARCH_STR}-apple-darwin"
|
||||
elif [[ "$OS_TYPE" == "linux-gnu"* ]] || [[ "$OS_TYPE" == "linux"* ]]; then
|
||||
echo "${ARCH_STR}-unknown-linux-gnu"
|
||||
elif [[ "$OS_TYPE" == "msys" ]] || [[ "$OS_TYPE" == "cygwin" ]] || [[ "$OS_TYPE" == "win32" ]]; then
|
||||
echo "${ARCH_STR}-pc-windows-msvc"
|
||||
else
|
||||
echo "${ARCH_STR}-unknown-unknown"
|
||||
fi
|
||||
|
||||
IS_WINDOWS: "{{.IS_WINDOWS}}"
|
||||
IS_MACOS: "{{.IS_MACOS}}"
|
||||
IS_LINUX: "{{.IS_LINUX}}"
|
||||
|
||||
RUBY_FULL_PATH:
|
||||
sh: |
|
||||
if command -v ruby >/dev/null 2>&1; then
|
||||
command -v ruby
|
||||
elif [[ "$OSTYPE" == "darwin"* ]] && [[ -f "/opt/homebrew/opt/ruby/bin/ruby" ]]; then
|
||||
echo "/opt/homebrew/opt/ruby/bin/ruby"
|
||||
else
|
||||
echo "ruby"
|
||||
fi
|
||||
|
||||
CARGO_BIN:
|
||||
sh: command -v cargo 2>/dev/null || echo "cargo"
|
||||
|
||||
RUSTC_BIN:
|
||||
sh: command -v rustc 2>/dev/null || echo "rustc"
|
||||
|
||||
SHELL_EXT:
|
||||
sh: |
|
||||
if [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]] || [[ "$OSTYPE" == "win32" ]]; then
|
||||
echo ".ps1"
|
||||
else
|
||||
echo ".sh"
|
||||
fi
|
||||
118
.task/config/vars.yml
Normal file
118
.task/config/vars.yml
Normal file
@@ -0,0 +1,118 @@
|
||||
# Shared Taskfile variables: versions, project directories and host detection.
version: "3"
internal: true

vars:
  # Version extraction from Cargo.toml (workspace.package.version).
  # The key is anchored to the start of the line so that keys which merely
  # contain "version = " (for example rust-version, or a dependency table)
  # can never be picked up instead of the package version.
  VERSION:
    sh: |
      grep -m 1 -E '^[[:space:]]*version[[:space:]]*=' Cargo.toml | sed -E 's/^[[:space:]]*version[[:space:]]*=[[:space:]]*"([^"]*)".*/\1/'

  # Build profile (dev/release/ci) - default to release
  BUILD_PROFILE: '{{.BUILD_PROFILE | default "release"}}'

  # Kreuzberg-specific versions
  ORT_VERSION: "1.24.1"

  # Toolchain versions
  GOLANGCI_LINT_VERSION: "latest"

  # Logging
  RUST_LOG: "info"

  # Root project directories (absolute paths)
  ROOT: "{{.ROOT_DIR}}"
  CRATES_DIR: "{{.ROOT_DIR}}/crates"
  PACKAGES_DIR: "{{.ROOT_DIR}}/packages"
  SCRIPTS_DIR: "{{.ROOT_DIR}}/scripts"
  TOOLS_DIR: "{{.ROOT_DIR}}/tools"
  TARGET_DIR: "{{.ROOT_DIR}}/target"

  # OS Detection: uname first, $OSTYPE only when uname gives no recognised name.
  OS:
    sh: |
      case "$(uname -s 2>/dev/null || echo 'unknown')" in
        Darwin*)
          echo "darwin"
          ;;
        Linux*)
          echo "linux"
          ;;
        MINGW*|MSYS*|CYGWIN*)
          echo "windows"
          ;;
        *)
          if [[ "$OSTYPE" == "darwin"* ]]; then
            echo "darwin"
          elif [[ "$OSTYPE" == "linux-gnu"* ]] || [[ "$OSTYPE" == "linux"* ]]; then
            echo "linux"
          elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]] || [[ "$OSTYPE" == "win32" ]]; then
            echo "windows"
          else
            echo "unknown"
          fi
          ;;
      esac

  # OS Boolean helpers ("true"/"false").
  # Derived from OS above so the four values always agree, instead of probing
  # $OSTYPE on its own (a bash-provided variable that other shells may not set).
  IS_WINDOWS: '{{eq .OS "windows"}}'
  IS_MACOS: '{{eq .OS "darwin"}}'
  IS_LINUX: '{{eq .OS "linux"}}'

  # Architecture detection
  ARCH:
    sh: |
      ARCH=$(uname -m)
      case "$ARCH" in
        x86_64|x64)
          echo "x86_64"
          ;;
        aarch64|arm64)
          echo "arm64"
          ;;
        armv7l|armv7)
          echo "armv7"
          ;;
        armv6l|armv6)
          echo "armv6"
          ;;
        i686|i386)
          echo "i386"
          ;;
        *)
          echo "$ARCH"
          ;;
      esac

  # Number of CPUs available.
  # Probes the tools directly rather than branching on $OSTYPE: nproc, then
  # sysctl (macOS), then NUMBER_OF_PROCESSORS (Windows), then a default of 4.
  NUM_CPUS:
    sh: |
      if command -v nproc >/dev/null 2>&1; then
        nproc
      elif sysctl -n hw.ncpu >/dev/null 2>&1; then
        sysctl -n hw.ncpu
      else
        echo "${NUMBER_OF_PROCESSORS:-4}"
      fi

  MAKE_JOBS: "{{.NUM_CPUS}}"
|
||||
20
.task/languages/csharp.yml
Normal file
20
.task/languages/csharp.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# C# dependency maintenance tasks (dotnet-outdated).
version: "3"
internal: true

tasks:
  # NOTE(review): update runs in packages/csharp while upgrade runs in
  # packages/csharp/Kreuzberg - confirm both directories are intended.
  # NOTE(review): desc says "within major versions" but the command passes
  # --version-lock Minor; verify against dotnet-outdated's documentation.
  update:
    desc: "Update C# dependencies within major versions (dotnet outdated -u --version-lock Minor)"
    silent: false
    cmds:
      - cmd: cd packages/csharp && dotnet outdated -u --version-lock Minor
        ignore_error: false

  upgrade:
    desc: "Upgrade C# dependencies to latest including breaking changes (dotnet outdated -u)"
    silent: false
    cmds:
      - cmd: |
          # A missing dotnet CLI is treated as a successful no-op.
          if ! command -v dotnet >/dev/null 2>&1; then
            echo "Dotnet not found, skipping C# upgrade"
            exit 0
          fi
          cd packages/csharp/Kreuzberg && dotnet outdated -u
        ignore_error: false
|
||||
20
.task/languages/dart.yml
Normal file
20
.task/languages/dart.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Dart dependency maintenance tasks (dart pub).
version: "3"
internal: true

tasks:
  update:
    desc: "Update Dart dependencies within major versions (dart pub upgrade)"
    silent: false
    cmds:
      - cmd: cd packages/dart && dart pub upgrade
        ignore_error: false

  upgrade:
    desc: "Upgrade Dart dependencies to latest including breaking changes (dart pub upgrade --major-versions)"
    silent: false
    cmds:
      - cmd: |
          # A missing dart CLI is treated as a successful no-op.
          if ! command -v dart >/dev/null 2>&1; then
            echo "Dart not found, skipping Dart upgrade"
            exit 0
          fi
          cd packages/dart && dart pub upgrade --major-versions
        ignore_error: false
|
||||
20
.task/languages/elixir.yml
Normal file
20
.task/languages/elixir.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Elixir dependency maintenance tasks (mix).
version: "3"
internal: true

tasks:
  update:
    desc: "Update Elixir dependencies within major versions (mix deps.update --all)"
    silent: false
    cmds:
      - cmd: |
          cd packages/elixir && mix deps.update --all
        ignore_error: false

  upgrade:
    desc: "Upgrade Elixir dependencies to latest including breaking changes (mix hex.outdated --all + mix deps.update)"
    silent: false
    cmds:
      - cmd: |
          # A missing mix CLI is treated as a successful no-op.
          command -v mix >/dev/null 2>&1 || { echo "Elixir not found, skipping Elixir upgrade"; exit 0; }
          # Stop here if the package directory is missing. In the former one-line
          # chain the "|| true" also absorbed a failed cd, after which
          # deps.update would have run in the wrong directory.
          cd packages/elixir || exit 1
          # The outdated report is informational only; its exit status is ignored.
          mix hex.outdated --all || true
          mix deps.update --all
        ignore_error: false
|
||||
20
.task/languages/go.yml
Normal file
20
.task/languages/go.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Go dependency maintenance tasks for the module in packages/go/v5.
version: "3"
internal: true

tasks:
  update:
    desc: "Update Go dependencies within major versions (go get -u=patch + go mod tidy)"
    silent: false
    cmds:
      - cmd: cd packages/go/v5 && go get -u=patch ./... && go mod tidy
        ignore_error: false

  upgrade:
    desc: "Upgrade Go dependencies to latest including breaking changes (go get -u + go mod tidy)"
    silent: false
    cmds:
      - cmd: |
          # A missing go toolchain is treated as a successful no-op.
          if ! command -v go >/dev/null 2>&1; then
            echo "Go not found, skipping Go upgrade"
            exit 0
          fi
          cd packages/go/v5 && go get -u ./... && go mod tidy
        ignore_error: false
|
||||
19
.task/languages/java.yml
Normal file
19
.task/languages/java.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
# Java dependency maintenance tasks (Maven versions plugin).
version: "3"
internal: true

tasks:
  update:
    desc: "Update Java dependencies within major versions (mvn versions:use-next-versions)"
    silent: false
    cmds:
      - cmd: |
          # Only a missing Maven is a successful no-op. The guard is separate from
          # the build command so that a failing cd or mvn run fails the task
          # instead of being reported as "Maven not found".
          command -v mvn >/dev/null 2>&1 || { echo "Maven not found, skipping Java update"; exit 0; }
          cd packages/java && mvn versions:use-next-versions -DgenerateBackupPoms=false
        ignore_error: false

  upgrade:
    desc: "Upgrade Java dependencies to latest including breaking changes (mvn versions:use-latest-releases)"
    silent: false
    cmds:
      - cmd: |
          # Same guard as update: skip only when Maven is absent, fail on real errors.
          command -v mvn >/dev/null 2>&1 || { echo "Maven not found, skipping Java upgrade"; exit 0; }
          cd packages/java && mvn versions:use-latest-releases -DgenerateBackupPoms=false
        ignore_error: false
|
||||
20
.task/languages/kotlin_android.yml
Normal file
20
.task/languages/kotlin_android.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Kotlin/Android dependency maintenance tasks (Gradle wrapper).
version: "3"
internal: true

tasks:
  update:
    desc: "Update Kotlin/Android dependencies within major versions (gradlew dependencyUpdates)"
    silent: false
    cmds:
      - cmd: cd packages/kotlin-android && ./gradlew dependencyUpdates
        ignore_error: false

  upgrade:
    desc: "Upgrade Kotlin/Android dependencies to latest including breaking changes (gradlew useLatestVersions)"
    silent: false
    cmds:
      - cmd: |
          # A missing Gradle wrapper is treated as a successful no-op.
          if [ ! -f packages/kotlin-android/gradlew ]; then
            echo "Kotlin-Android gradlew not found, skipping upgrade"
            exit 0
          fi
          cd packages/kotlin-android && ./gradlew useLatestVersions
        ignore_error: false
|
||||
19
.task/languages/node.yml
Normal file
19
.task/languages/node.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
# Node.js dependency maintenance tasks (pnpm), run from the task's working directory.
version: "3"
internal: true

tasks:
  update:
    desc: "Update Node.js dependencies within major versions (pnpm up)"
    silent: false
    cmds:
      - cmd: pnpm up
        ignore_error: false

  upgrade:
    desc: "Upgrade Node.js dependencies to latest including breaking changes (pnpm up --latest)"
    silent: false
    cmds:
      - cmd: pnpm up --latest
        ignore_error: false
|
||||
20
.task/languages/php.yml
Normal file
20
.task/languages/php.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# PHP dependency maintenance tasks (Composer).
version: "3"
internal: true

tasks:
  update:
    desc: "Update PHP dependencies within major versions (composer update)"
    silent: false
    cmds:
      - cmd: cd packages/php && composer update
        ignore_error: false

  upgrade:
    desc: "Upgrade PHP dependencies to latest including breaking changes (composer update --with-all-dependencies)"
    silent: false
    cmds:
      - cmd: |
          # A missing composer CLI is treated as a successful no-op.
          if ! command -v composer >/dev/null 2>&1; then
            echo "Composer not found, skipping PHP upgrade"
            exit 0
          fi
          cd packages/php && composer update --with-all-dependencies
        ignore_error: false
|
||||
22
.task/languages/python.yml
Normal file
22
.task/languages/python.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
# Python dependency maintenance tasks (uv), run inside PYTHON_PKG.
version: "3"
internal: true

vars:
  PYTHON_PKG: "packages/python"

tasks:
  # NOTE(review): confirm that the uv version in use accepts
  # --upgrade-package "*" and --no-prerelease as passed below.
  update:
    desc: "Update Python dependencies within major versions (uv sync --upgrade-package)"
    silent: false
    cmds:
      - cmd: >-
          cd {{.PYTHON_PKG}} && uv sync --no-install-project --no-install-workspace --all-extras --upgrade-package "*" --no-prerelease
        ignore_error: false

  upgrade:
    desc: "Upgrade Python dependencies to latest including breaking changes (uv sync --upgrade)"
    silent: false
    cmds:
      - cmd: >-
          cd {{.PYTHON_PKG}} && uv sync --no-install-project --no-install-workspace --all-extras --upgrade
        ignore_error: false
|
||||
20
.task/languages/r.yml
Normal file
20
.task/languages/r.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# R dependency maintenance tasks (Rscript).
version: "3"
internal: true

tasks:
  update:
    desc: "Update R dependencies within major versions (devtools::update_packages(check.built = TRUE))"
    silent: false
    cmds:
      - cmd: >-
          cd packages/r && Rscript -e "devtools::update_packages(check.built = TRUE)"
        ignore_error: false

  upgrade:
    desc: "Upgrade R dependencies to latest including breaking changes (update.packages())"
    silent: false
    cmds:
      - cmd: |
          # A missing Rscript is treated as a successful no-op.
          if ! command -v Rscript >/dev/null 2>&1; then
            echo "R not found, skipping R upgrade"
            exit 0
          fi
          cd packages/r && Rscript -e 'options(repos = c(CRAN = "https://cloud.r-project.org")); update.packages(ask = FALSE)'
        ignore_error: false
|
||||
20
.task/languages/ruby.yml
Normal file
20
.task/languages/ruby.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Ruby dependency maintenance tasks (Bundler).
version: "3"
internal: true

tasks:
  update:
    desc: "Update Ruby dependencies within major versions (bundle update --conservative)"
    silent: false
    cmds:
      - cmd: cd packages/ruby && bundle update --conservative
        ignore_error: false

  upgrade:
    desc: "Upgrade Ruby dependencies to latest including breaking changes (bundle update)"
    silent: false
    cmds:
      - cmd: |
          # A missing bundle CLI is treated as a successful no-op.
          if ! command -v bundle >/dev/null 2>&1; then
            echo "Bundle not found, skipping Ruby upgrade"
            exit 0
          fi
          cd packages/ruby && bundle update
        ignore_error: false
|
||||
247
.task/languages/rust.yml
Normal file
247
.task/languages/rust.yml
Normal file
@@ -0,0 +1,247 @@
|
||||
# Rust tasks: toolchain setup, builds, tests, linting, docs, cache cleanup,
# dependency maintenance, E2E and mobile target checks.
version: "3"
internal: true

includes:
  platforms: ../config/platforms.yml

vars:
  # Each of the first three keeps a value supplied from outside and only falls back to its default.
  RUST_LOG: '{{.RUST_LOG | default "info"}}'
  BUILD_PROFILE: '{{.BUILD_PROFILE | default "release"}}'
  RUST_BACKTRACE: '{{.RUST_BACKTRACE | default "1"}}'
  CARGO_TERM_COLOR: "always"

tasks:
  # ---------------------------------------------------------------- toolchain
  install:
    desc: "Install Rust toolchain and components"
    cmds:
      - rustup update stable
      - rustup component add rustfmt clippy
      # Windows: install lld-link to avoid PATH conflicts with Git Bash's link.exe
      - cmd: rustup component add llvm-tools
        platforms: [windows]
      - cargo install cargo-llvm-cov --locked
      - cargo install cargo-edit --locked
      - cargo --version
      - rustc --version

  # ------------------------------------------------------------------- builds
  # Dispatches to build:<profile>, using "release" when BUILD_PROFILE is empty.
  build:
    desc: "Build all Rust crates with {{.BUILD_PROFILE}} profile"
    cmds:
      - task: build:{{.BUILD_PROFILE | default "release"}}

  build:dev:
    desc: "Build all Rust crates in debug mode"
    cmds:
      - cmd: cargo build --workspace --all-features --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm
        platforms: [linux, darwin]
      # Note: exclude benchmark-harness on Windows as jemalloc doesn't build with MSVC
      - cmd: cargo build --workspace --all-features --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm --exclude benchmark-harness
        platforms: [windows]

  build:release:
    desc: "Build all Rust crates in release mode"
    cmds:
      - cmd: cargo build --release --workspace --all-features --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm
        platforms: [linux, darwin]
      - cmd: cargo build --release --workspace --all-features --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm --exclude benchmark-harness
        platforms: [windows]

  # Same commands as build:release, run with debug info enabled through RUSTFLAGS.
  build:ci:
    desc: "Build for CI with debug info enabled"
    env:
      RUSTFLAGS: "-C debuginfo=2"
    cmds:
      - cmd: cargo build --release --workspace --all-features --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm
        platforms: [linux, darwin]
      - cmd: cargo build --release --workspace --all-features --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm --exclude benchmark-harness
        platforms: [windows]

  build:profiling:
    desc: "Build all Rust crates with profiling features (for flamegraph generation)"
    env:
      ENABLE_PROFILING: "true"
      RUSTFLAGS: "-g"
    cmds:
      - cargo build --workspace --release --features full,profiling,api,mcp,otel --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm
      - cargo build --manifest-path tools/benchmark-harness/Cargo.toml --release --features profiling

  # ---------------------------------------------------------------------- CLI
  # --target is appended only when TARGET is set.
  cli:build:
    desc: "Build CLI binary"
    cmds:
      - cargo build --release --package kreuzberg-cli {{if .TARGET}}--target {{.TARGET}}{{end}}

  cli:build:dev:
    desc: "Build CLI binary in debug mode"
    cmds:
      - cargo build --package kreuzberg-cli

  cli:install:
    desc: "Install CLI binary"
    cmds:
      - cargo install --path crates/kreuzberg-cli

  # ---------------------------------------------------------------------- FFI
  # CARGO_ARGS (empty by default) is appended to each FFI build command.
  ffi:build:
    desc: "Build FFI library for language bindings"
    cmds:
      - cargo build --release --package kreuzberg-ffi {{.CARGO_ARGS | default ""}}

  ffi:build:dev:
    desc: "Build FFI library in debug mode"
    cmds:
      - cargo build --package kreuzberg-ffi {{.CARGO_ARGS | default ""}}

  ffi:build:ci:
    desc: "Build FFI library in CI mode (with debug info)"
    env:
      RUSTFLAGS: "-C debuginfo=2"
    cmds:
      - cargo build --release --package kreuzberg-ffi {{.CARGO_ARGS | default ""}}

  # --------------------------------------------------------------------- WASM
  wasm:build:all:
    desc: "Build all WASM targets (web/bundler/nodejs/deno) plus the TypeScript wrapper"
    dir: crates/kreuzberg-wasm
    cmds:
      - pnpm run build:all

  # -------------------------------------------------------------------- tests
  # The only command is restricted to linux and darwin.
  test:
    desc: "Run all Rust tests"
    cmds:
      - cmd: bash scripts/ci/rust/run-unit-tests.sh
        platforms: [linux, darwin]

  test:ci:
    desc: "Run Rust tests in CI mode with tessdata setup"
    env:
      RUST_BACKTRACE: "{{.RUST_BACKTRACE}}"
      CARGO_TERM_COLOR: "{{.CARGO_TERM_COLOR}}"
    cmds:
      - cmd: bash scripts/ci/rust/run-unit-tests.sh
        platforms: [linux, darwin]

  test:quick:
    desc: "Run fast Rust tests (unit tests only)"
    cmds:
      - cargo test --lib --workspace --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm

  # ------------------------------------------------------- linting/formatting
  lint:
    desc: "Lint Rust code WITH auto-fix (cargo fmt + cargo clippy --fix)"
    cmds:
      - cargo fmt --all
      - cargo clippy --workspace --fix --allow-dirty --allow-staged --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm

  lint:check:
    desc: "Lint Rust code WITHOUT auto-fix (check-only)"
    cmds:
      - cargo fmt --all -- --check
      - cargo clippy --workspace --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm -- -D warnings

  format:
    desc: "Format Rust code (with modifications)"
    cmds:
      - cargo fmt --all

  format:check:
    desc: "Check Rust formatting without modifications"
    cmds:
      - cargo fmt --all -- --check

  licenses:
    desc: "Check Rust dependency licenses with cargo-deny"
    cmds:
      - cargo deny check licenses

  # --------------------------------------------------------------------- docs
  doc:
    desc: "Generate Rust documentation and open in browser"
    cmds:
      - cargo doc --no-deps --open

  doc:build:
    desc: "Build Rust documentation (without opening)"
    cmds:
      - cargo doc --no-deps

  # ------------------------------------------------------------------ cleanup
  clean:
    desc: "Clean Rust build artifacts"
    cmds:
      - cargo clean

  # Best effort: every removal step both discards its own errors and sets ignore_error.
  cache:cleanup:
    desc: "Clean up large build artifacts to reduce cache size (for CI)"
    cmds:
      - echo "Cleaning up large build artifacts to reduce cache size..."
      - cmd: find target -type f -name "*.rlib" -size +10M -exec rm -f {} \; 2>/dev/null || true
        ignore_error: true
      - cmd: find target -type f -name "*.so" -size +10M -exec rm -f {} \; 2>/dev/null || true
        ignore_error: true
      - cmd: find target -type f -name "*.dylib" -size +10M -exec rm -f {} \; 2>/dev/null || true
        ignore_error: true
      - cmd: find target -type f -name "*.dll" -size +10M -exec rm -f {} \; 2>/dev/null || true
        ignore_error: true
      - cmd: rm -rf target/*/incremental 2>/dev/null || true
        ignore_error: true
      - echo "Cleanup completed successfully"
      - cmd: du -sh target 2>/dev/null || echo "No target directory found"
        ignore_error: true

  # ------------------------------------------------------------- dependencies
  # The Ruby native extension manifest is handled separately and its failures are ignored.
  update:
    desc: "Update Rust dependencies (compatible only)"
    cmds:
      - cargo update
      - cmd: cargo update --manifest-path packages/ruby/ext/kreuzberg_rb/native/Cargo.toml
        ignore_error: true

  upgrade:
    desc: "Upgrade Rust dependencies aggressively (cargo upgrade --incompatible)"
    cmds:
      - cargo upgrade --incompatible
      - cargo update
      - cmd: cargo upgrade --incompatible --manifest-path packages/ruby/ext/kreuzberg_rb/native/Cargo.toml
        ignore_error: true
      - cmd: cargo update --manifest-path packages/ruby/ext/kreuzberg_rb/native/Cargo.toml
        ignore_error: true

  # ---------------------------------------------------------------------- E2E
  e2e:format:
    desc: "Format generated Rust E2E sources"
    cmds:
      - cargo fmt --manifest-path e2e/rust/Cargo.toml

  e2e:lint:
    desc: "Lint Rust E2E crate with clippy"
    cmds:
      - cargo clippy --manifest-path e2e/rust/Cargo.toml -- -D warnings

  e2e:test:
    desc: "Run Rust E2E tests"
    env:
      RUST_TEST_THREADS: "1"
    cmds:
      - cargo test --manifest-path e2e/rust/Cargo.toml --release

  # ------------------------------------------------------------ mobile checks
  check:android:
    desc: "cargo check kreuzberg-dart + kreuzberg-ffi for Android ABIs (requires ANDROID_NDK_HOME and cargo-ndk)"
    preconditions:
      - sh: command -v cargo-ndk
        msg: "cargo-ndk not installed. Run: cargo install cargo-ndk --locked"
      - sh: 'test -n "${ANDROID_NDK_HOME:-}"'
        msg: "ANDROID_NDK_HOME is not set. Install Android NDK and export ANDROID_NDK_HOME=<path-to-ndk>"
    cmds:
      - cargo ndk --target arm64-v8a --platform 21 -- check -p kreuzberg-dart
      - cargo ndk --target x86_64 --platform 21 -- check -p kreuzberg-dart
      - cargo ndk --target arm64-v8a --platform 21 -- check -p kreuzberg-ffi
      - cargo ndk --target x86_64 --platform 21 -- check -p kreuzberg-ffi

  check:ios:
    desc: "cargo check kreuzberg-dart + kreuzberg-swift for iOS targets (macOS only)"
    platforms: [darwin]
    cmds:
      - rustup target add aarch64-apple-ios aarch64-apple-ios-sim
      - cargo check -p kreuzberg-dart --target aarch64-apple-ios
      - cargo check -p kreuzberg-dart --target aarch64-apple-ios-sim
      - cargo check -p kreuzberg-swift --target aarch64-apple-ios
      - cargo check -p kreuzberg-swift --target aarch64-apple-ios-sim

  check:mobile:
    desc: "cargo check Android + iOS mobile targets"
    cmds:
      - task: check:android
      - task: check:ios
|
||||
20
.task/languages/swift.yml
Normal file
20
.task/languages/swift.yml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Swift dependency maintenance tasks (Swift Package Manager).
# Both tasks run the same "swift package update"; only upgrade skips when swift is absent.
version: "3"
internal: true

tasks:
  update:
    desc: "Update Swift dependencies (swift package update)"
    silent: false
    cmds:
      - cmd: cd packages/swift && swift package update
        ignore_error: false

  upgrade:
    desc: "Upgrade Swift dependencies (swift package update)"
    silent: false
    cmds:
      - cmd: |
          # A missing swift toolchain is treated as a successful no-op.
          if ! command -v swift >/dev/null 2>&1; then
            echo "Swift not found, skipping Swift upgrade"
            exit 0
          fi
          cd packages/swift && swift package update
        ignore_error: false
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user