This commit is contained in:
304
.ai-rulez/.generated-manifest.json
Normal file
304
.ai-rulez/.generated-manifest.json
Normal file
@@ -0,0 +1,304 @@
|
||||
{
|
||||
"version": "1",
|
||||
"files": [
|
||||
".agents/agents/c-ffi-specialist.md",
|
||||
".agents/agents/code-reviewer.md",
|
||||
".agents/agents/csharp-specialist.md",
|
||||
".agents/agents/dart-specialist.md",
|
||||
".agents/agents/devops-engineer.md",
|
||||
".agents/agents/docs-writer.md",
|
||||
".agents/agents/e2e-generator-engineer.md",
|
||||
".agents/agents/elixir-specialist.md",
|
||||
".agents/agents/extraction-engineer.md",
|
||||
".agents/agents/ffi-engineer.md",
|
||||
".agents/agents/go-specialist.md",
|
||||
".agents/agents/java-specialist.md",
|
||||
".agents/agents/jni-specialist.md",
|
||||
".agents/agents/kotlin-android-specialist.md",
|
||||
".agents/agents/kreuzberg-developer.md",
|
||||
".agents/agents/ocr-engineer.md",
|
||||
".agents/agents/performance-engineer.md",
|
||||
".agents/agents/php-specialist.md",
|
||||
".agents/agents/plugin-engineer.md",
|
||||
".agents/agents/polyglot-architect.md",
|
||||
".agents/agents/python-specialist.md",
|
||||
".agents/agents/r-specialist.md",
|
||||
".agents/agents/release-engineer.md",
|
||||
".agents/agents/ruby-specialist.md",
|
||||
".agents/agents/rust-core-engineer.md",
|
||||
".agents/agents/security-auditor.md",
|
||||
".agents/agents/swift-specialist.md",
|
||||
".agents/agents/test-writer.md",
|
||||
".agents/agents/typescript-specialist.md",
|
||||
".agents/agents/wasm-specialist.md",
|
||||
".agents/agents/zig-specialist.md",
|
||||
".agents/settings.json",
|
||||
".agents/skills/add-language-generator/SKILL.md",
|
||||
".agents/skills/alef/SKILL.md",
|
||||
".agents/skills/alef/references/adapters.md",
|
||||
".agents/skills/alef/references/backends.md",
|
||||
".agents/skills/alef/references/cli-reference.md",
|
||||
".agents/skills/alef/references/configuration.md",
|
||||
".agents/skills/alef/references/designing-alef-toml.md",
|
||||
".agents/skills/alef/references/e2e-testing.md",
|
||||
".agents/skills/alef/references/troubleshooting.md",
|
||||
".agents/skills/api-server-mcp/SKILL.md",
|
||||
".agents/skills/chunking-embeddings/SKILL.md",
|
||||
".agents/skills/common-task-commands/SKILL.md",
|
||||
".agents/skills/create-e2e-fixture/SKILL.md",
|
||||
".agents/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".agents/skills/format-specific-extraction/SKILL.md",
|
||||
".agents/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".agents/skills/quick-start/SKILL.md",
|
||||
".claude/agents/c-ffi-specialist.md",
|
||||
".claude/agents/code-reviewer.md",
|
||||
".claude/agents/csharp-specialist.md",
|
||||
".claude/agents/dart-specialist.md",
|
||||
".claude/agents/devops-engineer.md",
|
||||
".claude/agents/docs-writer.md",
|
||||
".claude/agents/e2e-generator-engineer.md",
|
||||
".claude/agents/elixir-specialist.md",
|
||||
".claude/agents/extraction-engineer.md",
|
||||
".claude/agents/ffi-engineer.md",
|
||||
".claude/agents/go-specialist.md",
|
||||
".claude/agents/java-specialist.md",
|
||||
".claude/agents/jni-specialist.md",
|
||||
".claude/agents/kotlin-android-specialist.md",
|
||||
".claude/agents/kreuzberg-developer.md",
|
||||
".claude/agents/ocr-engineer.md",
|
||||
".claude/agents/performance-engineer.md",
|
||||
".claude/agents/php-specialist.md",
|
||||
".claude/agents/plugin-engineer.md",
|
||||
".claude/agents/polyglot-architect.md",
|
||||
".claude/agents/python-specialist.md",
|
||||
".claude/agents/r-specialist.md",
|
||||
".claude/agents/release-engineer.md",
|
||||
".claude/agents/ruby-specialist.md",
|
||||
".claude/agents/rust-core-engineer.md",
|
||||
".claude/agents/security-auditor.md",
|
||||
".claude/agents/swift-specialist.md",
|
||||
".claude/agents/test-writer.md",
|
||||
".claude/agents/typescript-specialist.md",
|
||||
".claude/agents/wasm-specialist.md",
|
||||
".claude/agents/zig-specialist.md",
|
||||
".claude/settings.json",
|
||||
".claude/skills/add-language-generator/SKILL.md",
|
||||
".claude/skills/alef/SKILL.md",
|
||||
".claude/skills/alef/references/adapters.md",
|
||||
".claude/skills/alef/references/backends.md",
|
||||
".claude/skills/alef/references/cli-reference.md",
|
||||
".claude/skills/alef/references/configuration.md",
|
||||
".claude/skills/alef/references/designing-alef-toml.md",
|
||||
".claude/skills/alef/references/e2e-testing.md",
|
||||
".claude/skills/alef/references/troubleshooting.md",
|
||||
".claude/skills/api-server-mcp/SKILL.md",
|
||||
".claude/skills/chunking-embeddings/SKILL.md",
|
||||
".claude/skills/common-task-commands/SKILL.md",
|
||||
".claude/skills/create-e2e-fixture/SKILL.md",
|
||||
".claude/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".claude/skills/format-specific-extraction/SKILL.md",
|
||||
".claude/skills/iterate/SKILL.md",
|
||||
".claude/skills/parallelize/SKILL.md",
|
||||
".claude/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".claude/skills/quick-start/SKILL.md",
|
||||
".codex/agents/c-ffi-specialist.toml",
|
||||
".codex/agents/code-reviewer.toml",
|
||||
".codex/agents/csharp-specialist.toml",
|
||||
".codex/agents/dart-specialist.toml",
|
||||
".codex/agents/devops-engineer.toml",
|
||||
".codex/agents/docs-writer.toml",
|
||||
".codex/agents/e2e-generator-engineer.toml",
|
||||
".codex/agents/elixir-specialist.toml",
|
||||
".codex/agents/extraction-engineer.toml",
|
||||
".codex/agents/ffi-engineer.toml",
|
||||
".codex/agents/go-specialist.toml",
|
||||
".codex/agents/java-specialist.toml",
|
||||
".codex/agents/jni-specialist.toml",
|
||||
".codex/agents/kotlin-android-specialist.toml",
|
||||
".codex/agents/kreuzberg-developer.toml",
|
||||
".codex/agents/ocr-engineer.toml",
|
||||
".codex/agents/performance-engineer.toml",
|
||||
".codex/agents/php-specialist.toml",
|
||||
".codex/agents/plugin-engineer.toml",
|
||||
".codex/agents/polyglot-architect.toml",
|
||||
".codex/agents/python-specialist.toml",
|
||||
".codex/agents/r-specialist.toml",
|
||||
".codex/agents/release-engineer.toml",
|
||||
".codex/agents/ruby-specialist.toml",
|
||||
".codex/agents/rust-core-engineer.toml",
|
||||
".codex/agents/security-auditor.toml",
|
||||
".codex/agents/swift-specialist.toml",
|
||||
".codex/agents/test-writer.toml",
|
||||
".codex/agents/typescript-specialist.toml",
|
||||
".codex/agents/wasm-specialist.toml",
|
||||
".codex/agents/zig-specialist.toml",
|
||||
".codex/commands/iterate.md",
|
||||
".codex/commands/parallelize.md",
|
||||
".codex/skills/add-language-generator/SKILL.md",
|
||||
".codex/skills/alef/SKILL.md",
|
||||
".codex/skills/alef/references/adapters.md",
|
||||
".codex/skills/alef/references/backends.md",
|
||||
".codex/skills/alef/references/cli-reference.md",
|
||||
".codex/skills/alef/references/configuration.md",
|
||||
".codex/skills/alef/references/designing-alef-toml.md",
|
||||
".codex/skills/alef/references/e2e-testing.md",
|
||||
".codex/skills/alef/references/troubleshooting.md",
|
||||
".codex/skills/api-server-mcp/SKILL.md",
|
||||
".codex/skills/chunking-embeddings/SKILL.md",
|
||||
".codex/skills/common-task-commands/SKILL.md",
|
||||
".codex/skills/create-e2e-fixture/SKILL.md",
|
||||
".codex/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".codex/skills/format-specific-extraction/SKILL.md",
|
||||
".codex/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".codex/skills/quick-start/SKILL.md",
|
||||
".cursor/commands/iterate.md",
|
||||
".cursor/commands/parallelize.md",
|
||||
".cursor/rules/agent-workflow.mdc",
|
||||
".cursor/rules/alef-generated-bindings.mdc",
|
||||
".cursor/rules/alef-workflow.mdc",
|
||||
".cursor/rules/anti-patterns.mdc",
|
||||
".cursor/rules/api-compatibility.mdc",
|
||||
".cursor/rules/async-and-concurrency.mdc",
|
||||
".cursor/rules/atomic-commits.mdc",
|
||||
".cursor/rules/avoid-duplication.mdc",
|
||||
".cursor/rules/batch-operations.mdc",
|
||||
".cursor/rules/bindings.mdc",
|
||||
".cursor/rules/branch-hygiene.mdc",
|
||||
".cursor/rules/cache-and-performance.mdc",
|
||||
".cursor/rules/cgo-bindings.mdc",
|
||||
".cursor/rules/cicd-pipeline-standards.mdc",
|
||||
".cursor/rules/commit-messages.mdc",
|
||||
".cursor/rules/communication-style.mdc",
|
||||
".cursor/rules/complexity-limits.mdc",
|
||||
".cursor/rules/containerization-docker.mdc",
|
||||
".cursor/rules/context-config-loading-precedence.mdc",
|
||||
".cursor/rules/context-crate-structure.mdc",
|
||||
".cursor/rules/context-kreuzberg-brand-and-docs.mdc",
|
||||
".cursor/rules/context-mime-detection-routing.mdc",
|
||||
".cursor/rules/context-owasp-quick-reference.mdc",
|
||||
".cursor/rules/context-polyrepo-structure.mdc",
|
||||
".cursor/rules/context-pre-commit-tooling.mdc",
|
||||
".cursor/rules/context-prek.mdc",
|
||||
".cursor/rules/context-preservation.mdc",
|
||||
".cursor/rules/context-taskfile-structure.mdc",
|
||||
".cursor/rules/context-wasm-constraints.mdc",
|
||||
".cursor/rules/csharp-conventions.mdc",
|
||||
".cursor/rules/dead-code.mdc",
|
||||
".cursor/rules/dependency-awareness.mdc",
|
||||
".cursor/rules/e2e-generator-conventions.mdc",
|
||||
".cursor/rules/elixir-conventions.mdc",
|
||||
".cursor/rules/error-handling.mdc",
|
||||
".cursor/rules/explain-reasoning.mdc",
|
||||
".cursor/rules/ext-php-rs-bindings.mdc",
|
||||
".cursor/rules/extendr-bindings.mdc",
|
||||
".cursor/rules/extraction-quality.mdc",
|
||||
".cursor/rules/extraction-safety.mdc",
|
||||
".cursor/rules/feature-flag-policy.mdc",
|
||||
".cursor/rules/ffi-and-language-interop.mdc",
|
||||
".cursor/rules/fixture-schema-design.mdc",
|
||||
".cursor/rules/gcloud-conventions.mdc",
|
||||
".cursor/rules/generated-code-policy.mdc",
|
||||
".cursor/rules/gh-workflows.mdc",
|
||||
".cursor/rules/go-conventions.mdc",
|
||||
".cursor/rules/incremental-approach.mdc",
|
||||
".cursor/rules/input-validation.mdc",
|
||||
".cursor/rules/java-conventions.mdc",
|
||||
".cursor/rules/least-privilege.mdc",
|
||||
".cursor/rules/magnus-bindings.mdc",
|
||||
".cursor/rules/meaningful-assertions.mdc",
|
||||
".cursor/rules/minimal-changes.mdc",
|
||||
".cursor/rules/monitoring-observability.mdc",
|
||||
".cursor/rules/napi-rs-bindings.mdc",
|
||||
".cursor/rules/no-ai-signatures.mdc",
|
||||
".cursor/rules/ocr-backend-standards.mdc",
|
||||
".cursor/rules/ocr-language-and-config.mdc",
|
||||
".cursor/rules/ocr-performance.mdc",
|
||||
".cursor/rules/ocr-quality.mdc",
|
||||
".cursor/rules/ocr-table-and-hocr.mdc",
|
||||
".cursor/rules/output-awareness.mdc",
|
||||
".cursor/rules/php-conventions.mdc",
|
||||
".cursor/rules/plugin-extensibility.mdc",
|
||||
".cursor/rules/plugin-interface-contract.mdc",
|
||||
".cursor/rules/plugin-registry-and-selection.mdc",
|
||||
".cursor/rules/plugin-testing.mdc",
|
||||
".cursor/rules/pyo3-bindings.mdc",
|
||||
".cursor/rules/python-conventions.mdc",
|
||||
".cursor/rules/python-ffi-plugins.mdc",
|
||||
".cursor/rules/r-conventions.mdc",
|
||||
".cursor/rules/read-before-write.mdc",
|
||||
".cursor/rules/readability-first.mdc",
|
||||
".cursor/rules/ruby-conventions.mdc",
|
||||
".cursor/rules/rust-conventions.mdc",
|
||||
".cursor/rules/rust-polyglot-conventions.mdc",
|
||||
".cursor/rules/rustler-bindings.mdc",
|
||||
".cursor/rules/safe-git-operations.mdc",
|
||||
".cursor/rules/secrets-handling.mdc",
|
||||
".cursor/rules/systematic-debugging.mdc",
|
||||
".cursor/rules/task-automation-build.mdc",
|
||||
".cursor/rules/task-runner.mdc",
|
||||
".cursor/rules/tdd-workflow.mdc",
|
||||
".cursor/rules/test-alongside-code.mdc",
|
||||
".cursor/rules/test-independence.mdc",
|
||||
".cursor/rules/test-naming.mdc",
|
||||
".cursor/rules/testing-anti-patterns.mdc",
|
||||
".cursor/rules/typescript-conventions.mdc",
|
||||
".cursor/rules/verification-before-completion.mdc",
|
||||
".cursor/rules/verify-before-acting.mdc",
|
||||
".cursor/rules/wasm-bindings.mdc",
|
||||
".github/agents/c-ffi-specialist.agent.md",
|
||||
".github/agents/code-reviewer.agent.md",
|
||||
".github/agents/csharp-specialist.agent.md",
|
||||
".github/agents/dart-specialist.agent.md",
|
||||
".github/agents/devops-engineer.agent.md",
|
||||
".github/agents/docs-writer.agent.md",
|
||||
".github/agents/e2e-generator-engineer.agent.md",
|
||||
".github/agents/elixir-specialist.agent.md",
|
||||
".github/agents/extraction-engineer.agent.md",
|
||||
".github/agents/ffi-engineer.agent.md",
|
||||
".github/agents/go-specialist.agent.md",
|
||||
".github/agents/java-specialist.agent.md",
|
||||
".github/agents/jni-specialist.agent.md",
|
||||
".github/agents/kotlin-android-specialist.agent.md",
|
||||
".github/agents/kreuzberg-developer.agent.md",
|
||||
".github/agents/ocr-engineer.agent.md",
|
||||
".github/agents/performance-engineer.agent.md",
|
||||
".github/agents/php-specialist.agent.md",
|
||||
".github/agents/plugin-engineer.agent.md",
|
||||
".github/agents/polyglot-architect.agent.md",
|
||||
".github/agents/python-specialist.agent.md",
|
||||
".github/agents/r-specialist.agent.md",
|
||||
".github/agents/release-engineer.agent.md",
|
||||
".github/agents/ruby-specialist.agent.md",
|
||||
".github/agents/rust-core-engineer.agent.md",
|
||||
".github/agents/security-auditor.agent.md",
|
||||
".github/agents/swift-specialist.agent.md",
|
||||
".github/agents/test-writer.agent.md",
|
||||
".github/agents/typescript-specialist.agent.md",
|
||||
".github/agents/wasm-specialist.agent.md",
|
||||
".github/agents/zig-specialist.agent.md",
|
||||
".github/commands/iterate.md",
|
||||
".github/commands/parallelize.md",
|
||||
".github/copilot-instructions.md",
|
||||
".github/skills/add-language-generator/SKILL.md",
|
||||
".github/skills/alef/SKILL.md",
|
||||
".github/skills/alef/references/adapters.md",
|
||||
".github/skills/alef/references/backends.md",
|
||||
".github/skills/alef/references/cli-reference.md",
|
||||
".github/skills/alef/references/configuration.md",
|
||||
".github/skills/alef/references/designing-alef-toml.md",
|
||||
".github/skills/alef/references/e2e-testing.md",
|
||||
".github/skills/alef/references/troubleshooting.md",
|
||||
".github/skills/api-server-mcp/SKILL.md",
|
||||
".github/skills/chunking-embeddings/SKILL.md",
|
||||
".github/skills/common-task-commands/SKILL.md",
|
||||
".github/skills/create-e2e-fixture/SKILL.md",
|
||||
".github/skills/extraction-pipeline-patterns/SKILL.md",
|
||||
".github/skills/format-specific-extraction/SKILL.md",
|
||||
".github/skills/plugin-architecture-patterns/SKILL.md",
|
||||
".github/skills/quick-start/SKILL.md",
|
||||
".mcp.json",
|
||||
"AGENTS.md",
|
||||
"CLAUDE.md",
|
||||
"GEMINI.md"
|
||||
]
|
||||
}
|
||||
17
.ai-rulez/agents/kreuzberg-developer.md
Normal file
17
.ai-rulez/agents/kreuzberg-developer.md
Normal file
@@ -0,0 +1,17 @@
|
||||
---
|
||||
name: kreuzberg-developer
|
||||
description: General kreuzberg development guidance and cross-cutting concerns
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on kreuzberg:
|
||||
|
||||
1. Rust core is the single source of truth — all business logic in crates/kreuzberg/src/
|
||||
2. Bindings (Python, TypeScript, Ruby, PHP, etc.) are thin wrappers — never duplicate core logic
|
||||
3. Use `task` commands for all operations: `task build` is core-only; use `task build:bindings` or `task build:all` explicitly when bindings are needed
|
||||
4. Build FFI layer first if needed: `task build:bindings`
|
||||
5. For ONNX features: ensure ORT_LIB_LOCATION is set or use download-binaries feature
|
||||
6. All unsafe blocks require SAFETY comments. No .unwrap() in production code.
|
||||
7. Coverage targets: 95% for Rust core, 80% for bindings
|
||||
8. WASM builds are sync-only — implement SyncExtractor for WASM-compatible extractors
|
||||
9. Version in root Cargo.toml is the single source of truth for all binding packages
|
||||
71
.ai-rulez/config.toml
Normal file
71
.ai-rulez/config.toml
Normal file
@@ -0,0 +1,71 @@
|
||||
# AI-Rulez Configuration (migrated to V4 TOML format)
|
||||
# Documentation: https://github.com/Goldziher/ai-rulez
|
||||
|
||||
version = '4.0'
|
||||
name = 'Kreuzberg'
|
||||
description = 'Rust document intelligence library with active Python, TypeScript/Node, Ruby, PHP, Go, Java, C#, Elixir, R, WebAssembly, Dart, Kotlin Android, Swift, Zig, and C FFI bindings'
|
||||
gitignore = true
|
||||
presets = ['claude', 'copilot', 'cursor', 'antigravity', 'codex']
|
||||
builtins = [
|
||||
'rust',
|
||||
'python',
|
||||
'go',
|
||||
'java',
|
||||
'ruby',
|
||||
'php',
|
||||
'csharp',
|
||||
'elixir',
|
||||
'r',
|
||||
'wasm',
|
||||
'pyo3',
|
||||
'napi-rs',
|
||||
'magnus',
|
||||
'ext-php-rs',
|
||||
'rustler',
|
||||
'cgo',
|
||||
'extendr',
|
||||
'default-commands',
|
||||
]
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-core'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/core'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-languages'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/languages'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-cicd'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/cicd'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-infrastructure'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/infrastructure'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[includes]]
|
||||
name = 'kreuzberg-e2e-generator'
|
||||
source = 'https://github.com/kreuzberg-dev/ai-rulez.git'
|
||||
path = 'modules/e2e-generator'
|
||||
merge_strategy = 'local-override'
|
||||
|
||||
[[installed_skills]]
|
||||
name = 'alef'
|
||||
source = 'https://github.com/kreuzberg-dev/alef.git'
|
||||
|
||||
[[mcp_servers]]
|
||||
name = 'playwright'
|
||||
description = 'Playwright browser automation for E2E testing and docs verification'
|
||||
command = 'npx'
|
||||
args = ['-y', '@playwright/mcp@latest']
|
||||
|
||||
[defaults]
|
||||
effort = 'medium'
|
||||
65
.ai-rulez/context/config-loading-precedence.md
Normal file
65
.ai-rulez/context/config-loading-precedence.md
Normal file
@@ -0,0 +1,65 @@
|
||||
---
|
||||
summary: Configuration loading precedence for CLI and server modes
|
||||
---
|
||||
|
||||
# Configuration Loading & Precedence
|
||||
|
||||
## CLI Mode Precedence (highest to lowest)
|
||||
|
||||
1. Individual CLI flags (`--ocr`, `--output-format`, `--chunk`)
|
||||
2. Inline JSON config (`--config-json` or `--config-json-base64`)
|
||||
3. Config file (`--config path.toml`)
|
||||
4. Auto-discovered config (`kreuzberg.{toml,yaml,json}` in cwd/parents)
|
||||
5. Default values
|
||||
|
||||
## Server/MCP Mode Precedence
|
||||
|
||||
1. CLI arguments (`--host`, `--port`)
|
||||
2. Environment variables (`KREUZBERG_HOST`, `KREUZBERG_PORT`)
|
||||
3. Config file `[server]` section
|
||||
4. Defaults (`127.0.0.1:8000`)
|
||||
|
||||
## Config File Discovery
|
||||
|
||||
Searches the current directory and then each parent directory for `kreuzberg.toml`, `kreuzberg.yaml`, or `kreuzberg.json`, and stops at the first match.
|
||||
|
||||
## Inline JSON Config
|
||||
|
||||
Field-level merge (not whole-object replacement):
|
||||
|
||||
```rust
|
||||
fn merge_json_into_config(base: &ExtractionConfig, json: Value) -> Result<ExtractionConfig> {
|
||||
let mut config_json = serde_json::to_value(base)?;
|
||||
// Merge fields from json into config_json
|
||||
Ok(serde_json::from_value(config_json)?)
|
||||
}
|
||||
```
|
||||
|
||||
Use `--config-json-base64` to pass the JSON base64-encoded and avoid shell-escaping problems.
|
||||
|
||||
## Config File Formats
|
||||
|
||||
**TOML** (`kreuzberg.toml`):
|
||||
|
||||
```toml
|
||||
use_cache = true
|
||||
[ocr]
|
||||
backend = "tesseract"
|
||||
languages = ["eng", "deu"]
|
||||
[security_limits]
|
||||
max_archive_size = 524288000
|
||||
```
|
||||
|
||||
**YAML** and **JSON** follow equivalent structure.
|
||||
|
||||
## CLI Flag Overrides
|
||||
|
||||
In `commands.rs`: `apply_extraction_overrides()` applies individual flags on top of merged config.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. CLI flags always win over config file
|
||||
2. JSON merge is field-level, not whole-object
|
||||
3. Auto-discovery stops at first config file found
|
||||
4. `--config-json-base64` for shell-safe JSON passing
|
||||
5. Server config uses `[server]` section + extraction config
|
||||
36
.ai-rulez/context/crate-structure.md
Normal file
36
.ai-rulez/context/crate-structure.md
Normal file
@@ -0,0 +1,36 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
# Crate Structure
|
||||
|
||||
Version source of truth: root `Cargo.toml` `[workspace.package] version`.
|
||||
|
||||
## Workspace crates (`crates/`)
|
||||
|
||||
- `kreuzberg` — core library: extraction engine, MIME detection, plugin system, OCR, chunking, embeddings, API/MCP server
|
||||
- `kreuzberg-cli` — CLI binary; thin wrapper over core with `cli` feature set
|
||||
- `kreuzberg-ffi` — C FFI layer (`#[no_mangle] extern "C"`); opaque handles, cbindgen headers; used by Go, Java, C# bindings
|
||||
- `kreuzberg-node` — NAPI-RS Node.js/TypeScript bindings
|
||||
- `kreuzberg-py` — PyO3 Python bindings
|
||||
- `kreuzberg-php` — ext-php-rs PHP bindings
|
||||
- `kreuzberg-wasm` — wasm-bindgen WASM bindings; uses `wasm-target` feature set
|
||||
- `kreuzberg-paddle-ocr` — PaddleOCR via ONNX Runtime; not available on WASM or Windows
|
||||
- `kreuzberg-tesseract` — Rust bindings for Tesseract OCR
|
||||
|
||||
## Out-of-workspace bindings (`packages/`)
|
||||
|
||||
- `packages/python/` — PyPI (maturin + PyO3)
|
||||
- `packages/typescript/` — npm type declarations
|
||||
- `packages/ruby/` — RubyGems (Magnus); native ext compiled by `rake`
|
||||
- `packages/php/` — Composer (ext-php-rs)
|
||||
- `packages/go/v5/` — Go module; cgo over kreuzberg-ffi
|
||||
- `packages/java/` — Maven; Foreign Function & Memory API over kreuzberg-ffi
|
||||
- `packages/csharp/` — NuGet; P/Invoke over kreuzberg-ffi
|
||||
- `packages/elixir/` — Hex; Rustler NIF (workspace member at `packages/elixir/native/kreuzberg_rustler`)
|
||||
- `packages/r/` — CRAN; extendr (excluded from workspace)
|
||||
|
||||
## Tools (`tools/`)
|
||||
|
||||
- `tools/e2e-generator` — reads JSON fixtures, generates runnable test suites per language into `e2e/`
|
||||
- `tools/benchmark-harness` — criterion-based benchmark runner
|
||||
56
.ai-rulez/context/mime-detection-routing.md
Normal file
56
.ai-rulez/context/mime-detection-routing.md
Normal file
@@ -0,0 +1,56 @@
|
||||
---
|
||||
summary: MIME type detection and extractor routing logic
|
||||
---
|
||||
|
||||
# MIME Detection & Routing
|
||||
|
||||
## Detection Flow
|
||||
|
||||
```text
|
||||
Extension -> EXT_TO_MIME map -> validate -> Registry lookup -> Extractor
|
||||
```
|
||||
|
||||
## Key Functions
|
||||
|
||||
| Function | Location | Purpose |
|
||||
| ------------------------------------ | -------------- | --------------------------------------- |
|
||||
| `detect_mime_type(path, inspect)` | `core/mime.rs` | Extension + optional content inspection |
|
||||
| `detect_mime_type_from_bytes(bytes)` | `core/mime.rs` | Magic number detection (infer crate) |
|
||||
| `validate_mime_type(mime)` | `core/mime.rs` | Check if any extractor supports it |
|
||||
|
||||
## Extension Mapping
|
||||
|
||||
118+ extensions mapped in `EXT_TO_MIME` (`core/mime.rs`). Case-insensitive.
|
||||
|
||||
Key mappings: `.pdf` -> `application/pdf`, `.docx` -> `application/vnd.openxmlformats-officedocument.wordprocessingml.document`, `.xlsx` -> `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`, `.png` -> `image/png`, `.jpg` -> `image/jpeg`
|
||||
|
||||
## Registry Selection
|
||||
|
||||
```rust
|
||||
// In core/extractor/bytes.rs
|
||||
fn select_extractor_for_mime(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
||||
let registry = get_document_extractor_registry();
|
||||
let registry_guard = registry.read()?;
|
||||
registry_guard.get_for_mime_type(mime_type)
|
||||
.ok_or_else(|| KreuzbergError::UnsupportedFormat(mime_type.into()))
|
||||
}
|
||||
```
|
||||
|
||||
Selects highest-priority extractor registered for that MIME type.
|
||||
|
||||
## Adding New MIME Types
|
||||
|
||||
1. Add extension mapping: `m.insert("ext", "application/x-new");` in `core/mime.rs`
|
||||
2. Implement `DocumentExtractor` with `supported_mime_types()` returning the new MIME type
|
||||
3. Register in `register_default_extractors()`
|
||||
|
||||
## Wildcard Support
|
||||
|
||||
Extractors can register for MIME type families: `"image/*"` matches `image/png`, `image/jpeg`, etc.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. Always call `validate_mime_type()` before extraction
|
||||
2. Extension mapping is case-insensitive
|
||||
3. Content inspection (`infer` crate) is the fallback for files without an extension
|
||||
4. Registry validation is final authority on supported types
|
||||
78
.ai-rulez/context/wasm-constraints.md
Normal file
78
.ai-rulez/context/wasm-constraints.md
Normal file
@@ -0,0 +1,78 @@
|
||||
---
|
||||
summary: WASM build constraints and patterns for kreuzberg-wasm crate
|
||||
---
|
||||
|
||||
# WASM Build Constraints
|
||||
|
||||
## Overview
|
||||
|
||||
WASM target in `crates/kreuzberg-wasm/`. Uses wasm-bindgen with sync-only internal APIs.
|
||||
|
||||
## Feature Flags
|
||||
|
||||
```toml
|
||||
[features]
|
||||
wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
|
||||
wasm-threads = ["dep:wasm-bindgen-rayon"] # Optional
|
||||
```
|
||||
|
||||
## Critical Constraints
|
||||
|
||||
### 1. No Tokio Runtime
|
||||
|
||||
All operations must be synchronous internally. Use `#[cfg(not(feature = "tokio-runtime"))]` paths.
|
||||
|
||||
### 2. SyncExtractor Required
|
||||
|
||||
Every WASM-compatible extractor MUST implement `SyncExtractor`:
|
||||
|
||||
```rust
|
||||
impl SyncExtractor for MyExtractor {
|
||||
fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
||||
-> Result<ExtractionResult> { /* sync implementation */ }
|
||||
}
|
||||
|
||||
impl DocumentExtractor for MyExtractor {
|
||||
fn as_sync_extractor(&self) -> Option<&dyn SyncExtractor> {
|
||||
Some(self) // MUST return Some for WASM
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. HTML Size Limit
|
||||
|
||||
```rust
|
||||
const MAX_HTML_SIZE: usize = 2 * 1024 * 1024; // 2MB - stack constraint
|
||||
```
|
||||
|
||||
## Build Config
|
||||
|
||||
```toml
|
||||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
[profile.release.package.kreuzberg-wasm]
|
||||
opt-level = "z" # Size optimization
|
||||
codegen-units = 1
|
||||
```
|
||||
|
||||
## API Pattern
|
||||
|
||||
```rust
|
||||
#[wasm_bindgen]
|
||||
pub async fn extract_from_bytes(content: Vec<u8>, config: JsValue) -> Result<JsValue, JsValue> {
|
||||
let config: ExtractionConfig = serde_wasm_bindgen::from_value(config)?;
|
||||
let result = extract_bytes_sync(&content, mime_type, &config)?;
|
||||
Ok(serde_wasm_bindgen::to_value(&result)?)
|
||||
}
|
||||
```
|
||||
|
||||
Functions can be `async` for JS compatibility, but internal extraction is sync.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **No tokio** -- all operations synchronous
|
||||
2. **Implement SyncExtractor** for all WASM-compatible extractors
|
||||
3. **HTML limited to 2MB** due to stack constraints
|
||||
4. **Size optimization** via `opt-level = "z"`
|
||||
5. **Gate WASM-only code** with `#[cfg(target_arch = "wasm32")]` (a target condition, not a Cargo feature)
|
||||
12
.ai-rulez/domains/document-extraction/DOMAIN.md
Normal file
12
.ai-rulez/domains/document-extraction/DOMAIN.md
Normal file
@@ -0,0 +1,12 @@
|
||||
---
|
||||
description: Document extraction pipeline architecture
|
||||
---
|
||||
|
||||
- Pipeline: file input → MIME detection (magic bytes + extension) → extractor routing → extraction → post-processing → ExtractionResult
|
||||
- Extractors are plugins implementing the `DocumentExtractor` trait: extract(&self, source: &ExtractionSource) → ExtractionResult
|
||||
- Fallback chains: if primary extractor fails, try next in priority order (e.g., native PDF → Tesseract OCR → error)
|
||||
- Cache-first: check extraction cache before running extractors, cache results keyed by content hash
|
||||
- ExtractionResult contains: text content, metadata (page count, language, confidence), optional structured data (tables, images)
|
||||
- Async-first: all native extraction paths are async, use spawn_blocking for CPU-bound work (OCR, image processing); WASM builds are the exception and run sync-only via SyncExtractor
|
||||
- Memory limits: streaming for large files, configurable max file size, depth limits for nested archives
|
||||
- Format coverage: 91+ formats — PDF, DOCX, XLSX, PPTX, HTML, images, email (EML/MSG), archives, plain text
|
||||
@@ -0,0 +1,16 @@
|
||||
---
|
||||
name: extraction-engineer
|
||||
description: Document extraction pipeline development and maintenance
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on document extraction code:
|
||||
|
||||
1. Key source paths: crates/kreuzberg/src/core/ (extractor.rs, mime.rs, config.rs), crates/kreuzberg/src/extraction/
|
||||
2. The extraction pipeline: Input -> Cache Check -> MIME Detection -> Format Conversion -> Extractor Selection (priority-based) -> Extraction -> Fallback Chain -> Post-Processing -> Caching -> Output
|
||||
3. For MIME detection: use EXT_TO_MIME map + magic bytes fallback via infer crate. Always validate_mime_type() before extraction.
|
||||
4. For caching: keys based on content hash, invalidate on config changes
|
||||
5. For errors: implement fallback chains (try next-priority extractor), preserve partial results, return structured error info
|
||||
6. For new formats: add to EXT_TO_MIME, implement DocumentExtractor trait, register in register_default_extractors()
|
||||
7. Always use SecurityLimits validators for user content (ZipBombValidator, DepthValidator, StringGrowthValidator)
|
||||
8. Run `task test` after changes. Target 95% coverage on core extraction code.
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Follow semantic versioning — breaking changes require major version bump
|
||||
- Document all public API changes in CHANGELOG.md
|
||||
- Maintain backward compatibility for at least one minor version before removing deprecated APIs
|
||||
- All public types must be FFI-friendly or have FFI-compatible equivalents
|
||||
- Version in the root Cargo.toml (`[workspace.package] version`) is the single source of truth for all binding packages
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- All native extraction paths must be fully async using tokio (WASM builds are the exception: no tokio runtime, sync-only via SyncExtractor)
|
||||
- Never block the async runtime — use spawn_blocking for CPU-intensive work
|
||||
- All public types must be Send + Sync
|
||||
- Use tokio::select! for timeout handling on extraction operations
|
||||
- Cross-platform: test on Linux (amd64, arm64) and macOS at minimum
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Cache keys: content-hash based (hash of file bytes + config), not path-based
|
||||
- Invalidate cache when extraction config changes (output format, OCR settings, etc.)
|
||||
- Check cache before any extraction — cache hits should skip all processing
|
||||
- Concurrent batch processing: use configurable worker pool, default to CPU count
|
||||
- Stream large files instead of loading into memory — use AsyncRead where possible
|
||||
- Monitor cache hit rates — target >80% for repeated extractions
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- 95% test coverage on core extraction code, 80% on bindings
|
||||
- Test all format categories: text, office, PDF, images, archives, markup
|
||||
- Test corrupted/malformed documents — extraction must fail gracefully, never panic
|
||||
- Benchmark extraction speeds per format — track regressions in CI
|
||||
- Test both success and error paths for every extractor
|
||||
- Use property-based testing for parsers with wide input ranges
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- Always use `SecurityLimits` to cap archive size, compression ratio, file count, and nesting depth for user content. Use `ZipBombValidator` for archive extraction.
|
||||
- Validate MIME type before extraction — never trust file extensions alone
|
||||
- Implement fallback chains: if primary extractor fails, try next-priority extractor
|
||||
- Preserve partial results on failure — return what was extracted with error context
|
||||
- All errors must include: operation name, input description, root cause, and suggestion
|
||||
- Never expose internal file paths or system details in error messages returned to users
|
||||
13
.ai-rulez/domains/ocr-integration/DOMAIN.md
Normal file
13
.ai-rulez/domains/ocr-integration/DOMAIN.md
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
description: OCR backend integration and image processing
|
||||
---
|
||||
|
||||
- Multiple backends: Tesseract (C FFI via leptonica/tesseract-sys), PaddleOCR (ONNX Runtime), Python backends (EasyOCR, Surya) via FFI
|
||||
- Backend selection: priority-based with fallback — Tesseract default, PaddleOCR for CJK, Python backends as fallback
|
||||
- Image preprocessing: deskew, binarization, noise removal, contrast enhancement — applied before OCR
|
||||
- PSM modes: configurable page segmentation (single block, single line, sparse text) per use case
|
||||
- Table detection: identify table regions → cell extraction → row/column reconstruction → Markdown table output
|
||||
- hOCR: parse Tesseract hOCR output for word-level bounding boxes, confidence scores, reading order
|
||||
- Language management: auto-detect document language, load appropriate Tesseract traineddata, support multi-language documents
|
||||
- Caching: cache OCR results by image hash + backend + language + PSM mode
|
||||
- Confidence tracking: per-word and per-page confidence scores, flag low-confidence regions for review
|
||||
16
.ai-rulez/domains/ocr-integration/agents/ocr-engineer.md
Normal file
16
.ai-rulez/domains/ocr-integration/agents/ocr-engineer.md
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
name: ocr-engineer
|
||||
description: OCR pipeline development, backend integration, and table reconstruction
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on OCR code:
|
||||
|
||||
1. Key source paths: crates/kreuzberg/src/ocr/ (processor.rs, tesseract_backend.rs, hocr.rs, cache.rs, language_registry.rs, table/)
|
||||
2. The OCR pipeline: Image Detection -> Preprocessing (denoise, deskew, binarize) -> Backend Selection -> OCR Execution -> hOCR Parsing -> Table Reconstruction -> Caching -> Return
|
||||
3. Backends: Tesseract (default, native C FFI via leptess), PaddleOCR (ONNX via ort), EasyOCR (Python via PyO3)
|
||||
4. For Python backends: use tokio::task::spawn_blocking, minimize GIL hold time with py.allow_threads(), cache Python data in Rust fields
|
||||
5. For table detection: detect via line/cell boundary detection, validate grid structure, OCR each cell, output as markdown
|
||||
6. For language management: validate against LanguageRegistry, check tessdata availability
|
||||
7. Cache OCR results with key = hash(image_bytes + language + config)
|
||||
8. hOCR parsing: use the hocr module to extract word-level bounding boxes and confidence scores
|
||||
@@ -0,0 +1,11 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- Pluggable backend architecture: all backends implement the OcrBackend trait
|
||||
- Backend independence: switching backends must not require API changes
|
||||
- Tesseract is the default backend (native C FFI via leptess)
|
||||
- Python backends (EasyOCR, Surya): use tokio::task::spawn_blocking, release GIL for Rust work; PaddleOCR runs natively via ONNX Runtime and is not a Python backend
|
||||
- Graceful degradation: if preferred backend unavailable, fall back to next available
|
||||
- All backends must return structured results with confidence scores
|
||||
- Document installation requirements and troubleshooting for each backend
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
priority: medium
|
||||
---
|
||||
|
||||
- Validate language packs exist before OCR execution — fail fast with helpful message
|
||||
- Support ISO 639 language codes, map to backend-specific formats
|
||||
- Configuration cascade: CLI args > environment > config file > defaults
|
||||
- Provide troubleshooting guides for common issues (missing tessdata, backend not found)
|
||||
- Language pack installation: document per-platform instructions
|
||||
10
.ai-rulez/domains/ocr-integration/rules/ocr-performance.md
Normal file
10
.ai-rulez/domains/ocr-integration/rules/ocr-performance.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Cache OCR results: key = hash(image_bytes + language + config)
|
||||
- Invalidate cache when OCR config changes (backend, language, PSM mode)
|
||||
- Batch processing: process multiple images concurrently with configurable parallelism
|
||||
- Resource management: limit concurrent OCR operations to avoid memory exhaustion
|
||||
- Performance targets: <2s for single page, <10s for 10-page document
|
||||
- Monitor and log OCR processing times for regression detection
|
||||
10
.ai-rulez/domains/ocr-integration/rules/ocr-quality.md
Normal file
10
.ai-rulez/domains/ocr-integration/rules/ocr-quality.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Track confidence scores on all OCR results — expose in API
|
||||
- Image preprocessing (denoise, deskew, binarize) should improve accuracy by 10-30%
|
||||
- PSM mode selection: auto-detect layout, allow user override (single block, single line, sparse text, etc.)
|
||||
- Language detection: validate requested languages are available, provide install hints if not
|
||||
- Multi-language support: allow multiple languages per OCR request
|
||||
- Test OCR accuracy against ground-truth documents in CI
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- hOCR parsing: extract word-level bounding boxes, confidence scores, and text content
|
||||
- Preserve spatial relationships from hOCR output for layout reconstruction
|
||||
- Table detection: use cell boundary detection (line detection + intersection analysis)
|
||||
- Validate grid structure before treating detected regions as tables
|
||||
- OCR each cell individually for better accuracy
|
||||
- Convert tables to markdown format with proper column alignment
|
||||
13
.ai-rulez/domains/plugin-system/DOMAIN.md
Normal file
13
.ai-rulez/domains/plugin-system/DOMAIN.md
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
description: Plugin trait system and Python FFI integration
|
||||
---
|
||||
|
||||
- Core traits: Extractor, PostProcessor, MetadataExtractor — each with async extract/process methods returning Result
|
||||
- Discovery: static registration (Rust plugins compiled in) + dynamic discovery (Python plugins via PyO3 FFI)
|
||||
- Priority selection: plugins declare priority per MIME type, registry selects highest-priority match, fallback to next
|
||||
- Registry: PluginRegistry holds all discovered plugins, provides lookup by MIME type, supports hot-reload for Python plugins
|
||||
- Python FFI: Python plugins implement a Python class matching the trait interface, called via PyO3 with GIL management
|
||||
- GIL management: acquire GIL only for Python calls, release immediately after, use py.allow_threads() for Rust-side work
|
||||
- Plugin lifecycle: init → register → validate → ready. Plugins validate their dependencies (e.g., Tesseract binary, Python packages) at startup
|
||||
- Error handling: plugin errors are wrapped in PluginError with source plugin name, converted to ExtractionError at boundary
|
||||
- Testing: test plugins with real files (not mocks), test fallback chains, test Python plugin loading/unloading
|
||||
16
.ai-rulez/domains/plugin-system/agents/plugin-engineer.md
Normal file
16
.ai-rulez/domains/plugin-system/agents/plugin-engineer.md
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
name: plugin-engineer
|
||||
description: Plugin system architecture, registry management, and Python FFI
|
||||
model: haiku
|
||||
---
|
||||
|
||||
When working on the plugin system:
|
||||
|
||||
1. Key source paths: crates/kreuzberg/src/plugins/ (mod.rs, extractor.rs, ocr.rs, postprocessor.rs, validator.rs, registry.rs), crates/kreuzberg-py/src/plugins.rs
|
||||
2. Plugin types: DocumentExtractor, OcrBackend, PostProcessor, Validator — all extend base Plugin trait (Send + Sync required)
|
||||
3. Priority system: 0-255, default 50, custom override > 50, fallback < 50. Registry selects highest priority for MIME type.
|
||||
4. Registries use Arc<RwLock<>> with MIME type indexing for O(log n) lookup
|
||||
5. Python plugins: validate protocol compliance, use py.allow_threads() for expensive Rust ops, tokio::task::spawn_blocking for async calls
|
||||
6. For new plugin types: define trait extending Plugin, create typed registry, add registration functions, implement priority-based selection
|
||||
7. GIL optimization: cache frequently-accessed Python data in Rust fields, measure GIL overhead
|
||||
8. All plugins must handle errors gracefully — return Result, never panic
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: medium
|
||||
---
|
||||
|
||||
- API stability: plugin interfaces are versioned, breaking changes require major version bump
|
||||
- Plugin discovery: support both static (compile-time) and dynamic (runtime) registration
|
||||
- Plugin validation: check capabilities, supported formats, and version compatibility before registration
|
||||
- Plugin chaining: post-processors can be composed in sequence
|
||||
- Configuration: plugins accept typed configuration, validated at registration time
|
||||
- Documentation: every plugin type must have a development guide with examples
|
||||
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- All plugins must implement the base Plugin trait: Send + Sync + 'static required
|
||||
- Plugin types: DocumentExtractor, OcrBackend, PostProcessor, Validator
|
||||
- Async execution: use async trait methods for non-blocking operations
|
||||
- Lifecycle: init() -> process() -> cleanup(). Init must validate all requirements.
|
||||
- Never panic in plugin code — all errors must be returned as Result
|
||||
- Consistent result format: all extractors return ExtractionResult with text, metadata, and confidence
|
||||
@@ -0,0 +1,12 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
- Separate typed registry per plugin type (ExtractorRegistry, OcrRegistry, etc.)
|
||||
- Thread safety: Arc<RwLock<>> for all registries
|
||||
- Priority system: 0-255, default 50, custom > 50, fallback < 50
|
||||
- Selection: highest priority plugin matching the MIME type wins
|
||||
- MIME type indexing for O(log n) lookup
|
||||
- Conflict resolution: if equal priority, prefer Rust-native over FFI plugins
|
||||
- Dynamic registration: plugins can be added/removed at runtime
|
||||
- Validate plugin before registration (check trait compliance, supported formats)
|
||||
10
.ai-rulez/domains/plugin-system/rules/plugin-testing.md
Normal file
10
.ai-rulez/domains/plugin-system/rules/plugin-testing.md
Normal file
@@ -0,0 +1,10 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- Mock plugin testing: create test doubles for unit tests
|
||||
- Real plugin testing: integration tests with actual backends
|
||||
- Thread safety tests: run concurrent plugin operations to detect race conditions
|
||||
- Performance baselines: measure and track plugin overhead vs direct calls
|
||||
- Test all error paths: invalid input, backend failure, timeout, resource exhaustion
|
||||
- Test plugin lifecycle: register, use, unregister, verify cleanup
|
||||
11
.ai-rulez/domains/plugin-system/rules/python-ffi-plugins.md
Normal file
11
.ai-rulez/domains/plugin-system/rules/python-ffi-plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- GIL management: use py.allow_threads() for expensive Rust operations
|
||||
- Cache frequently-accessed Python data in Rust fields to minimize GIL acquisitions
|
||||
- Use tokio::task::spawn_blocking for async calls to Python backends
|
||||
- Python exception translation: convert Python exceptions to Rust errors with full context
|
||||
- Data type mapping: Python str <-> Rust String, Python bytes <-> Rust Vec<u8>, Python dict <-> Rust HashMap
|
||||
- Validate Python plugin protocol compliance on registration
|
||||
- Target GIL overhead: 5-55us per acquisition
|
||||
45
.ai-rulez/ground-truth-generation.md
Normal file
45
.ai-rulez/ground-truth-generation.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Ground Truth Generation
|
||||
|
||||
## Pandoc Commands
|
||||
|
||||
```bash
|
||||
pandoc <source_file> -t gfm --wrap=none -o <gt_file.md>
|
||||
pandoc <source_file> -t plain --wrap=none -o <gt_file.txt>
|
||||
```
|
||||
|
||||
## Artifact Removal
|
||||
|
||||
```bash
|
||||
sed -i '' 's/ {#[^}]*}//g' "$file" # Remove {#id} attributes
|
||||
sed -i '' 's/ {[^}]*}//g' "$file" # Remove {.class} attributes
|
||||
sed -i '' '/^:::/d' "$file" # Remove fenced div markers
|
||||
sed -i '' 's/\\\$/$/g' "$file" # Unescape dollar signs
|
||||
sed -i '' "s/\\\\'/'/g" "$file" # Unescape quotes
|
||||
```
|
||||
|
||||
## Cleanup Rules
|
||||
|
||||
1. Convert ALL HTML to markdown equivalents where possible
|
||||
2. For colspan/rowspan, put content in first cell, leave others empty
|
||||
3. Remove `<!-- -->` comments
|
||||
4. Strip `<u>`, `<sup>`, `<sub>` tags (keep text content)
|
||||
5. Convert `<img>` to `![alt](src)`
|
||||
6. Collapse 3+ consecutive blank lines to 2
|
||||
7. Never use our own extractor output as GT
|
||||
|
||||
## Fixture JSON Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"document": "relative/path/to/source.ext",
|
||||
"file_type": "docx",
|
||||
"file_size": 12345,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": { "description": "...", "source": "pandoc-generated" },
|
||||
"ground_truth": {
|
||||
"text_file": "relative/path/to/gt.txt",
|
||||
"markdown_file": "relative/path/to/gt.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
```
|
||||
34
.ai-rulez/rules/alef-generated-bindings.md
Normal file
34
.ai-rulez/rules/alef-generated-bindings.md
Normal file
@@ -0,0 +1,34 @@
|
||||
---
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Alef-Generated Bindings
|
||||
|
||||
Files in `packages/*/` and binding crates are generated by Alef — DO NOT manually edit.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Check `alef.toml` before editing anything in `packages/*/` or binding crates
|
||||
2. Modify Rust source files or `alef.toml` itself
|
||||
3. Run `task alef:generate` to regenerate all Alef-managed output without formatting (`alef all --clean --format=false`)
|
||||
4. Run `task alef:format` explicitly if Alef post-generation formatting is needed
|
||||
5. Run `task e2e:test` or `task e2e:all` to verify
|
||||
6. Commit Rust source + `alef.toml` + regenerated bindings atomically
|
||||
|
||||
## Rename Mappings (from `alef.toml`)
|
||||
|
||||
- **Go**: `DocumentExtractor` → `Extractor`
|
||||
- All other languages: no renames
|
||||
|
||||
## Freshness Check
|
||||
|
||||
`task alef:generate && git diff --exit-code packages/ crates/kreuzberg-node/ crates/kreuzberg-wasm/ crates/kreuzberg-ffi/`
|
||||
|
||||
## Key `alef.toml` Sections
|
||||
|
||||
- `[crate]` — Rust source files parsed for type/function extraction
|
||||
- `[languages.*]` — per-language output path, module name, rename mappings
|
||||
- `[e2e]` — e2e test generation: fixtures dir, output dir, per-language call overrides
|
||||
- `[readme]` — README template generation per language package
|
||||
|
||||
Canonical e2e tasks are `task e2e:generate`, `task e2e:build`, `task e2e:test`, and `task e2e:all`. Do not add legacy aliases.
|
||||
65
.ai-rulez/rules/feature-flag-policy.md
Normal file
65
.ai-rulez/rules/feature-flag-policy.md
Normal file
@@ -0,0 +1,65 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
# Feature Flag Policy
|
||||
|
||||
All features are defined in `crates/kreuzberg/Cargo.toml`.
|
||||
|
||||
## ORT-Incompatible Targets (WASM, Android x86_64 emulator)
|
||||
|
||||
Only ORT-dependent paths are incompatible. The same paths block both WASM (no native ORT linkage at all) and the `x86_64-linux-android` emulator triple (no pyke prebuilt; `aarch64-linux-android` does ship a prebuilt and gets full ORT):
|
||||
|
||||
- `paddle-ocr` — ONNX Runtime + native C++ deps: not WASM-safe; no Android x86_64 prebuilt
|
||||
- `layout-detection` — depends on ONNX Runtime layout models: not WASM-safe; no Android x86_64 prebuilt
|
||||
- `embeddings` — depends on ONNX Runtime sentence-transformer models: not WASM-safe; no Android x86_64 prebuilt
|
||||
- `auto-rotate` — depends on ONNX Runtime orientation classifier: not WASM-safe; no Android x86_64 prebuilt
|
||||
|
||||
Pure-Rust **type-only** companion features expose the public config/result types for the above without pulling in ORT:
|
||||
|
||||
- `layout-types` — `LayoutDetectionConfig`, `TableModel`, `BBox`, `DetectionResult`, `LayoutClass`, `LayoutDetection`, `RecognizedTable`. `layout-detection` implies `layout-types`.
|
||||
- `auto-rotate-types` — `OrientationResult`. `auto-rotate` implies `auto-rotate-types`.
|
||||
- `embedding-presets` — `EmbeddingPreset` (already existed; pure-Rust preset metadata).
|
||||
|
||||
WASM/Android-safe variants:
|
||||
|
||||
- `ocr` (native) → `ocr-wasm` (uses `tesseract-wasm` + safe image deps) — Android keeps native `ocr`
|
||||
- `excel` (native) → `excel-wasm` (drops `tokio-runtime`) — Android keeps native `excel`
|
||||
- `tree-sitter` (native dlopen) → `tree-sitter-wasm` (statically-linked grammar pack) — Android keeps native `tree-sitter`
|
||||
- `liter-llm` — works on WASM via the upstream `wasm-http` feature; included in `no-ort-target`
|
||||
- `stopwords` — pure-Rust, included in `no-ort-target`
|
||||
- `keywords` — pure-Rust YAKE/RAKE, included in `no-ort-target`
|
||||
|
||||
The `no-ort-target` aggregate is the shared no-ORT base used by both `wasm-target` and `android-target`. `wasm-target = no-ort-target + excel-wasm + tree-sitter-wasm + ocr-wasm`. `android-target = no-ort-target + excel + tree-sitter + ocr + api + mcp`.
|
||||
|
||||
## Experimental (NOT in `full`)
|
||||
|
||||
- `pdf-oxide` — pure-Rust PDF text extraction; opt-in only, excluded from both `full` and `formats`
|
||||
|
||||
## ORT Variants (Mutually Exclusive)
|
||||
|
||||
- `ort-bundled` — downloads official Microsoft ORT binaries; default when OCR/ML features active
|
||||
- `ort-dynamic` — load ORT from system; only when system ORT is guaranteed present
|
||||
|
||||
## Platform-Conditional
|
||||
|
||||
- `kreuzberg-paddle-ocr`, `hf-hub`, `pprof` — excluded on `wasm32`
|
||||
- `ureq`: `rustls` on non-Windows; `native-tls` on Windows
|
||||
- `kreuzberg-ffi` and `kreuzberg-dart` cargo dependencies are target-conditional: `cfg(all(target_os = "android", target_arch = "x86_64"))` selects `android-target`; all other targets (including arm64 Android phones) get the full ORT-enabled feature set.
|
||||
|
||||
## Aggregate Sets
|
||||
|
||||
| Feature | Description |
|
||||
| ---------------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `formats` | All document formats + api/mcp/otel/chunking; no OCR, no ML |
|
||||
| `full` | `formats` + ocr + paddle-ocr + layout + embeddings + tree-sitter + liter-llm; excludes `pdf-oxide` |
|
||||
| `no-ort-target` | Pure-Rust base: every capability that does not depend on ONNX Runtime |
|
||||
| `wasm-target` | `no-ort-target` + excel-wasm + tree-sitter-wasm + ocr-wasm |
|
||||
| `android-target` | `no-ort-target` + excel + tree-sitter + ocr + api + mcp (for x86_64-linux-android emulator) |
|
||||
|
||||
## Build Profiles
|
||||
|
||||
- `release` — LTO thin, codegen-units=1, strip
|
||||
- `profiling` — inherits release, retains debug info
|
||||
- `kreuzberg-wasm` override: `opt-level="z"` (size-optimized)
|
||||
- `sevenz-rust2`, `zip` override: `opt-level=2` (prevents SIGBUS on macOS ARM64)
|
||||
15
.ai-rulez/rules/typescript-conventions.md
Normal file
15
.ai-rulez/rules/typescript-conventions.md
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
priority: high
|
||||
---
|
||||
|
||||
- `strict: true` + `noUncheckedIndexedAccess` in tsconfig, never `any` — use `unknown` with type guards.
|
||||
- ESM imports only, `const` over `let`, `as const` for literals, `interface` over `type` for objects.
|
||||
- `import type` for type-only imports to avoid runtime overhead. Discriminated unions for type-safe state.
|
||||
- Formatting/linting: `oxfmt` + `oxlint`. Type checking: `tsc --noEmit` in CI.
|
||||
- Testing: `vitest` (80%+ coverage). Runtime validation at system boundaries with `zod`.
|
||||
- Error handling: discriminated unions for expected errors, throw only for unexpected.
|
||||
- Package manager: `pnpm` with `pnpm-lock.yaml` committed, build: `tsup` or `esbuild`.
|
||||
- Monorepo: workspace protocol (`workspace:*`), shared tsconfig base, `pnpm-workspace.yaml`.
|
||||
- Node.js: `node:` prefix for core modules, `fetch` over `axios`.
|
||||
- Security: `pnpm audit` for dependency CVE scanning. Zero tolerance for critical/high vulnerabilities.
|
||||
- Anti-patterns: non-null assertions (`!`), type assertions (`as`), `enum` (use unions), `@ts-ignore`.
|
||||
212
.ai-rulez/skills/api-server-mcp/SKILL.md
Normal file
212
.ai-rulez/skills/api-server-mcp/SKILL.md
Normal file
@@ -0,0 +1,212 @@
|
||||
---
|
||||
description: "REST API server and MCP protocol integration"
|
||||
name: api-server-mcp
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# API Server & MCP Protocol
|
||||
|
||||
**Axum server design for document extraction endpoints, middleware, async processing, and Model Context Protocol integration for AI agents**
|
||||
|
||||
## Kreuzberg API Architecture
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/`, `crates/kreuzberg-cli/`
|
||||
|
||||
Kreuzberg provides a dual REST API + MCP server built with Axum + Tokio.
|
||||
|
||||
```text
|
||||
Request Flow:
|
||||
HTTP Client / AI Agent (Claude)
|
||||
|
|
||||
[Transport Layer]
|
||||
├── REST API (Axum HTTP)
|
||||
└── MCP Protocol (HTTP or Stdio)
|
||||
|
|
||||
[Middleware Layer]
|
||||
├── CORS, Request Logging (TraceLayer)
|
||||
├── Request/Response size limits
|
||||
└── Rate limiting (optional)
|
||||
|
|
||||
[Router]
|
||||
├── REST Endpoints
|
||||
│ ├── POST /extract - File upload extraction
|
||||
│ ├── POST /extract-url - URL-based extraction
|
||||
│ ├── GET /formats - List supported formats
|
||||
│ ├── GET /health - Server health check
|
||||
│ ├── POST /batch - Batch document processing
|
||||
│ ├── GET /cache/stats - Cache statistics
|
||||
│ └── DELETE /cache - Clear extraction cache
|
||||
├── MCP Endpoints
|
||||
│ ├── POST /mcp/tools - List available tools
|
||||
│ ├── POST /mcp/tools/call - Call a tool
|
||||
│ ├── GET /mcp/resources - List resources
|
||||
│ ├── GET /mcp/resources/:uri - Read resource
|
||||
│ ├── GET /mcp/prompts - List prompts
|
||||
│ └── GET /mcp/prompts/:name - Get prompt
|
||||
|
|
||||
[Handler / Tool Layer]
|
||||
├── extract_handler / extract_file tool
|
||||
├── batch_handler / batch_extract tool
|
||||
├── health_handler / get_capabilities tool
|
||||
└── format_handler
|
||||
|
|
||||
[Extraction Core]
|
||||
├── Format detection
|
||||
├── Extraction pipeline
|
||||
├── Post-processing (chunking, embeddings)
|
||||
└── Result formatting
|
||||
|
|
||||
JSON Response / MCP ToolResult
|
||||
```
|
||||
|
||||
## Server Setup & Configuration
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/server.rs`
|
||||
|
||||
Server initialization pattern: Create `ApiState` (holds `ExtractionConfig` + `ExtractionCache`), build Axum `Router` with all REST + MCP routes, apply middleware layers (body limits, CORS, tracing), serve via `tokio::net::TcpListener`.
|
||||
|
||||
Key middleware layers applied in order:
|
||||
|
||||
- `DefaultBodyLimit::max(100MB)` + `RequestBodyLimitLayer` -- configurable via env vars
|
||||
- `CorsLayer::permissive()` -- restrict in production via `CORS_ALLOWED_ORIGINS`
|
||||
- `TraceLayer::new_for_http()` -- request/response logging
|
||||
|
||||
## Core REST Handlers
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/handlers.rs`
|
||||
|
||||
| Handler | Method | Description |
|
||||
| --------------------- | ----------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `extract_handler` | POST /extract | Multipart upload: parse file + optional config JSON, check cache, call `extract_bytes()`, cache result |
|
||||
| `extract_url_handler` | POST /extract-url | Fetch URL via reqwest, extract bytes |
|
||||
| `batch_handler` | POST /batch | Parallel extraction with `Semaphore`-limited concurrency (default: CPU count) |
|
||||
| `health_handler` | GET /health | Report status, version, uptime, feature availability (OCR, embeddings), cache stats |
|
||||
| `formats_handler` | GET /formats | Return supported format categories (office, pdf, images, web, email, archives, academic) |
|
||||
| `cache_stats_handler` | GET /cache/stats | Hit/miss counts and hit rate |
|
||||
| `cache_clear_handler` | DELETE /cache | Clear LRU cache |
|
||||
|
||||
## Caching Strategy
|
||||
|
||||
**Location**: `crates/kreuzberg/src/cache/mod.rs`
|
||||
|
||||
LRU cache keyed by `SHA256(file_content)`, stores `Arc<ExtractionResult>`. Default 1000 entries. Thread-safe via `RwLock`. Tracks hit/miss counters with `AtomicU64` for stats endpoint.
|
||||
|
||||
## Error Handling
|
||||
|
||||
**Location**: `crates/kreuzberg/src/api/error.rs`
|
||||
|
||||
`ApiError` enum maps to HTTP status codes:
|
||||
|
||||
- `MissingFile` -> 400, `FileNotFound` -> 404
|
||||
- `OnnxRuntimeMissing` / `TesseractMissing` -> 503 (with remediation message)
|
||||
- `PayloadTooLarge` -> 413
|
||||
- `ExtractionFailed` / `InvalidConfig` / `UnsupportedFormat` -> 500
|
||||
|
||||
## MCP Server Implementation
|
||||
|
||||
**Location**: `crates/kreuzberg/src/mcp/server.rs`
|
||||
|
||||
The MCP server allows Claude and other AI agents to call Kreuzberg extraction functions through the Model Context Protocol.
|
||||
|
||||
### MCP Tools (Callable Functions)
|
||||
|
||||
Three tools are registered:
|
||||
|
||||
| Tool | Purpose | Required Params |
|
||||
| ------------------ | --------------------------------------------------------- | --------------- |
|
||||
| `extract_file` | Extract text/tables/metadata from documents (75+ formats) | `file_path` |
|
||||
| `batch_extract` | Extract from multiple documents in parallel | `file_paths[]` |
|
||||
| `get_capabilities` | List supported formats, features, backends | (none) |
|
||||
|
||||
**Tool registration pattern** (example: `extract_file`):
|
||||
|
||||
```rust
|
||||
// Define Tool with name, description, JSON Schema inputSchema
|
||||
// Register with server.register_tool(tool, handler_fn)
|
||||
// Handler: parse params -> build ExtractionConfig -> call extract_file() -> return ToolResult as JSON
|
||||
```
|
||||
|
||||
`extract_file` optional params: `format`, `extract_tables`, `extract_images`, `ocr_enabled`, `extract_metadata`, `chunking_preset`, `generate_embeddings`.
|
||||
|
||||
### MCP Resources (Static Knowledge)
|
||||
|
||||
Three resources provide static information to agents:
|
||||
|
||||
- `kreuzberg://formats` -- Supported format list as JSON
|
||||
- `kreuzberg://features` -- Cross-binding feature matrix (from `FEATURE_MATRIX.md`)
|
||||
- `kreuzberg://api-reference` -- Generated API documentation
|
||||
|
||||
### MCP Prompts (Agent Templates)
|
||||
|
||||
Two prompts guide agent extraction workflows:
|
||||
|
||||
- `extract_for_rag` -- Document type-specific RAG extraction guidance (research paper, contract, report). Recommends chunking preset and embedding config.
|
||||
- `batch_document_processing` -- Optimal concurrency, grouping, and error handling for batch workflows.
|
||||
|
||||
### MCP Transport Protocols
|
||||
|
||||
- **HTTP/REST**: MCP routes are mounted alongside the REST API under a separate `/mcp/` prefix
|
||||
- **Stdio**: JSON-RPC 2.0 over stdin/stdout for local CLI integration (e.g., Claude Desktop)
|
||||
|
||||
### Integration with Claude Desktop
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"kreuzberg": {
|
||||
"command": "kreuzberg-mcp",
|
||||
"env": {
|
||||
"KREUZBERG_API_BASE": "http://localhost:8000",
|
||||
"KREUZBERG_MCP_TRANSPORT": "stdio"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### MCP Error Handling
|
||||
|
||||
`ToolError` variants: `FileNotFound`, `UnsupportedFormat`, `ExtractionFailed`, `OnnxRuntimeMissing`, `TesseractMissing`, `Timeout`. Each maps to an MCP `ToolResultError` with descriptive code and message.
|
||||
|
||||
## Environment Configuration
|
||||
|
||||
See `.env.example` for all configurable variables. Key categories:
|
||||
|
||||
- **Server**: `KREUZBERG_HOST`, `KREUZBERG_PORT`
|
||||
- **Size limits**: `KREUZBERG_MAX_REQUEST_BODY_BYTES` (default 100MB), `KREUZBERG_MAX_MULTIPART_FIELD_BYTES`
|
||||
- **Features**: `KREUZBERG_ENABLE_OCR`, `KREUZBERG_ENABLE_EMBEDDINGS`, `KREUZBERG_ENABLE_KEYWORDS`
|
||||
- **Cache**: `KREUZBERG_CACHE_ENABLED`, `KREUZBERG_CACHE_SIZE`
|
||||
- **CORS**: `CORS_ALLOWED_ORIGINS` (comma-separated)
|
||||
- **MCP**: `KREUZBERG_MCP_HOST`, `KREUZBERG_MCP_PORT`, `KREUZBERG_MCP_TRANSPORT` (stdio/http)
|
||||
- **Logging**: `RUST_LOG=kreuzberg=info,tower_http=debug`
|
||||
|
||||
## Critical Rules
|
||||
|
||||
### REST API Rules
|
||||
|
||||
1. **Always validate multipart file uploads** - Check MIME type, size, magic bytes
|
||||
2. **Timeout long-running extractions** - Set per-handler timeout (5 min default)
|
||||
3. **Stream large files** - Never buffer entire multi-GB file in memory
|
||||
4. **Cache aggressively** - Identical files should return from cache in <1ms
|
||||
5. **Parallel extraction is CPU-bound** - Limit workers to CPU count + 1
|
||||
6. **Error responses must be actionable** - Include error code and remediation suggestion
|
||||
7. **Health checks must verify features** - Report missing dependencies (ONNX, Tesseract)
|
||||
8. **Size limits are configurable** - Allow override via env var for large deployments
|
||||
9. **CORS is permissive by default** - Restrict in production via env var
|
||||
10. **Log all requests** - Track extraction metrics for observability
|
||||
|
||||
### MCP Rules
|
||||
|
||||
1. **All tools must have a timeout** - Prevent hanging on large files (default 5 min)
|
||||
2. **Error responses must be detailed** - Include suggestions for missing dependencies
|
||||
3. **Feature gates must be checked** - Return helpful message if feature unavailable (embeddings, OCR)
|
||||
4. **Resources should be static** - Don't query external services in resource handlers
|
||||
5. **Prompts guide agents** - Provide clear examples and best practices
|
||||
6. **Batch tools must support cancellation** - Allow agent to stop long-running batch operations
|
||||
7. **Log all tool calls** - Track usage for analytics and debugging
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **extraction-pipeline-patterns** - Core extraction called by handlers and MCP tools
|
||||
- **chunking-embeddings** - Optional chunking/embedding parameters in extraction
|
||||
- **ocr-backend-management** - OCR engine selection and image preprocessing
|
||||
120
.ai-rulez/skills/chunking-embeddings/SKILL.md
Normal file
120
.ai-rulez/skills/chunking-embeddings/SKILL.md
Normal file
@@ -0,0 +1,120 @@
|
||||
---
|
||||
description: "Chunking, embeddings, and RAG pipeline integration"
|
||||
name: chunking-embeddings
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Chunking & Embeddings
|
||||
|
||||
**Text splitting strategies, embedding generation with FastEmbed, RAG pipeline integration**
|
||||
|
||||
## Chunking Architecture Overview
|
||||
|
||||
**Location**: `crates/kreuzberg/src/chunking/`, `crates/kreuzberg/src/embeddings.rs`
|
||||
|
||||
```text
|
||||
Extracted Text
|
||||
|
|
||||
[1. Normalization] -> Clean whitespace, remove control chars
|
||||
|
|
||||
[2. Chunk Strategy Selection] -> Fixed-size, semantic, syntax-aware, recursive
|
||||
|
|
||||
[3. Overlap Management] -> Control context window overlap
|
||||
|
|
||||
[4. Optional Embedding] -> Generate vectors with FastEmbed
|
||||
|
|
||||
Output: Vec<Chunk> with text, vectors, metadata
|
||||
```
|
||||
|
||||
## Chunking Strategies
|
||||
|
||||
**Location**: `crates/kreuzberg/src/chunking/mod.rs`
|
||||
|
||||
| Strategy | Pattern | Best For |
|
||||
| --------------------------------- | ------------------------------------------------------- | ------------------------------------------------------------------ |
|
||||
| **Fixed-Size** | Sliding window with configurable overlap | Uniform chunks for embedding models with fixed token limits |
|
||||
| **Semantic** | Split by sentences, merge/split by similarity threshold | Smart context preservation for LLM consumption and semantic search |
|
||||
| **Syntax-Aware** | Split by paragraph/section/heading/code-block structure | Preserving document structure (sections, code blocks) in RAG |
|
||||
| **Recursive** (LangChain pattern) | Try separators in order: `\n\n`, `\n`, `,` | Best general-purpose chunking; auto-finds optimal split points |
|
||||
|
||||
Key config fields per strategy (see struct definitions in `chunking/mod.rs`):
|
||||
|
||||
- Fixed-Size: `chunk_size`, `overlap`, `trim_whitespace`
|
||||
- Semantic: `target_chunk_size`, `min/max_chunk_size`, `semantic_threshold`, `use_sentence_boundaries`
|
||||
- Syntax-Aware: `chunk_by` (Paragraph/Section/Heading/Sentence/CodeBlock), `max_chunk_size`, `respect_code_blocks`
|
||||
- Recursive: `separators[]`, `chunk_size`, `overlap`
|
||||
|
||||
## Chunking Configuration Presets
|
||||
|
||||
**Location**: `crates/kreuzberg/src/chunking/mod.rs`
|
||||
|
||||
| Preset | Chunk Size | Overlap | Strategy | Use Case |
|
||||
| ------------ | ----------- | ------- | ---------- | ---------------------- |
|
||||
| **Balanced** | 512 tokens | 50 | Semantic | RAG sweet spot |
|
||||
| **Compact** | 256 tokens | 32 | Fixed-Size | Dense vectors |
|
||||
| **Extended** | 1024 tokens | 100 | Recursive | Full context |
|
||||
| **Minimal** | 128 tokens | 16 | (default) | Lightweight embeddings |
|
||||
|
||||
Usage: set `config.chunking.preset = Some("balanced")` in `ExtractionConfig`.
|
||||
|
||||
## Embedding Generation with FastEmbed
|
||||
|
||||
**Location**: `crates/kreuzberg/src/embeddings.rs`
|
||||
|
||||
### Model Selection
|
||||
|
||||
| Model | Dimensions | Notes |
|
||||
| ----------------------------------- | ---------- | -------------------------------- |
|
||||
| `BAAI/bge-small-en-v1.5` (default) | 384 | Fast, excellent for RAG |
|
||||
| `BAAI/bge-small-zh-v1.5` | 384 | Chinese optimized |
|
||||
| `BAAI/bge-base-en-v1.5` | 768 | Better quality, slower |
|
||||
| `jinaai/jina-embeddings-v2-base-en` | 768 | Long context (up to 8192 tokens) |
|
||||
| `Custom(path)` | varies | Custom ONNX model path |
|
||||
|
||||
### Embedding Pattern
|
||||
|
||||
`TextEmbeddingManager` provides singleton-cached models per config. Pattern:
|
||||
|
||||
1. `get_or_init_model()` -- lazy-loads ONNX model (downloads if needed), caches in `Arc<RwLock<HashMap>>`
|
||||
2. `embed_chunks()` -- collects chunk texts, calls `model.embed(texts, batch_size)`, zips results back to `ChunkWithEmbedding`
|
||||
|
||||
Default config: `batch_size=256`, `device=CPU`, `parallel_requests=4`.
|
||||
|
||||
### ONNX Runtime Requirement
|
||||
|
||||
Embeddings require ONNX Runtime. Feature-gated via:
|
||||
|
||||
```toml
|
||||
[features]
|
||||
embeddings = ["dep:fastembed", "dep:ort"]
|
||||
```
|
||||
|
||||
Install: `brew install onnxruntime` (macOS) / `apt install libonnxruntime libonnxruntime-dev` (Linux). Verify: `echo $ORT_DYLIB_PATH`.
|
||||
|
||||
## RAG Integration Pattern
|
||||
|
||||
The full extraction-to-RAG pipeline:
|
||||
|
||||
1. **Extract**: `extract_file(path, config)` -> `ExtractionResult`
|
||||
2. **Chunk**: Apply preset strategy to `result.content` -> `Vec<Chunk>`
|
||||
3. **Embed**: If embedding config present, `TextEmbeddingManager::embed_chunks()` -> `Vec<ChunkWithEmbedding>`
|
||||
4. **Output**: `RagDocument { file_path, metadata, chunks }` ready for vector DB ingestion
|
||||
|
||||
See `ChunkWithEmbedding` struct in `types.rs`: contains `text`, `embedding: Vec<f32>`, `dimensions`, `norm`, `metadata`.
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **Chunking is preprocessing** - Always apply before embedding so every input fits within the embedding model's token limit (vector dimensions are fixed by the model, not by chunk size)
|
||||
2. **Overlap prevents information loss** - Set overlap to roughly 10-20% of chunk size (the built-in presets use about 10-12%)
|
||||
3. **Embedding models are stateful** - Lazy load and cache to avoid repeated initialization
|
||||
4. **ONNX Runtime is required** - Gracefully degrade if not available (skip embeddings)
|
||||
5. **Batch embedding for performance** - Never embed single chunks; batch 50-1000 chunks
|
||||
6. **Normalize embeddings for search** - Use L2 norm for cosine similarity
|
||||
7. **Cache embedding results** - Don't re-embed identical text chunks
|
||||
8. **Model selection impacts quality** - bge-small (384) for speed, bge-base (768) for quality
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **extraction-pipeline-patterns** - Text extraction preceding chunking
|
||||
- **api-server-mcp** - Endpoint for chunking + embedding operations
|
||||
- **ocr-backend-management** - OCR text quality affects chunking success
|
||||
126
.ai-rulez/skills/extraction-pipeline-patterns/SKILL.md
Normal file
126
.ai-rulez/skills/extraction-pipeline-patterns/SKILL.md
Normal file
@@ -0,0 +1,126 @@
|
||||
---
|
||||
description: "Document extraction pipeline architecture and patterns"
|
||||
name: extraction-pipeline-patterns
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Extraction Pipeline Patterns
|
||||
|
||||
**Kreuzberg's format detection -> extraction -> fallback orchestration for 75+ file formats**
|
||||
|
||||
## Core Pipeline Architecture
|
||||
|
||||
The extraction pipeline (`crates/kreuzberg/src/core/pipeline.rs`, `crates/kreuzberg/src/extraction/`) orchestrates:
|
||||
|
||||
1. **Format Detection** - MIME type inference + extension validation -> select appropriate extractor
|
||||
2. **Intelligent Extraction** - Route to format-specific extractors (PDF, DOCX, Excel, HTML, images, archives, etc.)
|
||||
3. **Fallback Strategies** - Password-protected PDFs, OCR for images, nested archive handling, corrupted file recovery
|
||||
4. **Post-Processing Pipeline** - Validators, quality processing, chunking, custom hooks (see `core/pipeline.rs`)
|
||||
|
||||
## Format Detection Strategy
|
||||
|
||||
**Location**: `crates/kreuzberg/src/core/mime.rs`, `crates/kreuzberg/src/core/formats.rs`
|
||||
|
||||
Pattern: detect via magic bytes, validate extension alignment (prevent spoofing), route to extractor. When multiple extractors support the same format, choose the one with the highest confidence/specificity.
|
||||
|
||||
```rust
|
||||
// Pseudocode: core/mime.rs
|
||||
match (magic_bytes(content), extension) {
|
||||
(Some(fmt), Some(ext)) if aligned -> Ok(fmt),
|
||||
(Some(fmt), Some(ext)) if misaligned -> Err(FormatMismatch),
|
||||
(Some(fmt), None) -> Ok(fmt), // magic bytes only
|
||||
(None, Some(ext)) -> Ok(from_extension(ext)),
|
||||
_ -> Err(UnknownFormat),
|
||||
}
|
||||
```
|
||||
|
||||
## Extraction Modules (75+ Formats)
|
||||
|
||||
| Category | Extractors | Key Modules |
|
||||
| ------------ | ------------------------------------------------ | ---------------------------------------------------- |
|
||||
| **Office** | DOCX, XLSX, XLSM, XLSB, XLS, PPTX, ODP, ODS | `extraction/{docx,excel,pptx}.rs` |
|
||||
| **PDF** | Standard + encrypted, password attempts | `pdf/` subdirectory (13 files) |
|
||||
| **Images** | PNG, JPG, TIFF, WebP, JP2, SVG (OCR-enabled) | `extraction/image.rs` + `ocr/` |
|
||||
| **Web** | HTML, XHTML, XML, SVG (DOM parsing) | `extraction/html.rs` (67KB - complex table handling) |
|
||||
| **Email** | EML, MSG (headers, body, attachments, threading) | `extraction/email.rs` |
|
||||
| **Archives** | ZIP, TAR, GZ, 7Z (recursive extraction) | `extraction/archive.rs` (31KB) |
|
||||
| **Markdown** | MD, TXT, RST, Org Mode, RTF | `extraction/markdown.rs` |
|
||||
| **Academic** | LaTeX, BibTeX, JATS, Jupyter, DocBook | `extraction/{structured,xml}.rs` |
|
||||
|
||||
## Extraction Dispatcher
|
||||
|
||||
```rust
|
||||
// Pseudocode: extraction/mod.rs
|
||||
let format = detect_format(source.bytes, source.extension);
|
||||
let result = match format {
|
||||
Pdf -> extract_pdf(source, config),
|
||||
Docx -> extract_docx(source, config),
|
||||
Image -> extract_image_with_ocr_fallback(source, config),
|
||||
Archive -> extract_archive_recursive(source, config),
|
||||
_ -> extract_with_plugin(format, source, config),
|
||||
};
|
||||
run_pipeline(result, config) // post-processing always runs
|
||||
```
|
||||
|
||||
## Fallback Strategies
|
||||
|
||||
- **Password-Protected PDFs**: Try primary password -> secondary password list -> return `is_encrypted=true` in metadata on failure
|
||||
- **OCR Fallback**: If image text extraction confidence < threshold, trigger OCR backend; return both results with scores
|
||||
- **Nested Archives**: Recursive extraction with configurable depth limit; flatten or preserve hierarchy
|
||||
- **Corrupted File Recovery**: Stream-based parsing, emit content up to error point, include error location in metadata
|
||||
|
||||
## Configuration Integration
|
||||
|
||||
**Location**: `crates/kreuzberg/src/core/config.rs`, `crates/kreuzberg/src/core/config_validation.rs`
|
||||
|
||||
`ExtractionConfig` holds format-specific configs (`pdf`, `image`, `html`, `office`), fallback orchestration (`fallback`), and post-processing (`postprocessor`, `chunking`, `keywords`). See struct definition in `config.rs`.
|
||||
|
||||
## Plugin System Integration
|
||||
|
||||
**Location**: `crates/kreuzberg/src/plugins/`
|
||||
|
||||
- **CustomExtractor**: Override built-in format extractors
|
||||
- **PostProcessor**: Modify results after extraction (Early/Middle/Late stages)
|
||||
- **Validator**: Fail-fast validation (e.g., minimum text length)
|
||||
- **OCRBackend**: Swap OCR engine
|
||||
|
||||
Plugin registry loaded at startup, cached for zero-cost lookup.
|
||||
|
||||
## Feature Flag Strategy
|
||||
|
||||
**Location**: `Cargo.toml` (workspace), `crates/kreuzberg/Cargo.toml`, `FEATURE_MATRIX.md`
|
||||
|
||||
20+ features across 9 language bindings. Key feature groups:
|
||||
|
||||
| Group | Features | Notes |
|
||||
| -------- | ------------------------------------------------------------------------------------ | --------------------------------- |
|
||||
| OCR | `tesseract` (default), `tesseract-static`, `ocr-minimal` | Mutually exclusive recommendation |
|
||||
| Formats | `pdf`, `pdf-minimal`, `office`, `office-minimal` | |
|
||||
| AI/ML | `embeddings` (requires ONNX), `keywords-yake`, `keywords-rake`, `language-detection` | |
|
||||
| Server | `api` (Axum), `mcp`, `tokio-runtime`, `lite-runtime` | |
|
||||
| Bindings | `python-bindings`, `ruby-bindings`, `php-bindings`, `node-bindings`, `wasm` | |
|
||||
|
||||
Conditional compilation: modules are gated with `#[cfg(feature = "...")]`. At runtime, `validate_config()` warns if a requested feature is not compiled in.
|
||||
|
||||
### Feature Flag Critical Rules
|
||||
|
||||
1. **Never mix conflicting features** - e.g., `ocr-minimal` + `tesseract` should error at compile time
|
||||
2. **Always provide feature diagnostics** - Config validation must warn if feature unavailable
|
||||
3. **Default to maximum feature set** - Unless embedded/minimal specifically requested
|
||||
4. **Test all feature combinations** - Matrix testing in CI catches regressions
|
||||
5. **WASM is incompatible** with embeddings, keywords, and OCR
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. **Always use format detection** before routing to extractors (prevent confusion attacks)
|
||||
2. **Stream-based parsing** for PDFs/archives to handle multi-GB files
|
||||
3. **Post-pipeline is mandatory**: All extraction results flow through `run_pipeline()` for validators/hooks
|
||||
4. **Plugin overrides are order-dependent**: Plugins registered first take priority
|
||||
5. **Fallback timeouts**: Set reasonable OCR/archive extraction timeouts (config-driven)
|
||||
6. **Metadata preservation**: Include format detection confidence, extraction method used, any fallbacks applied
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **ocr-backend-management** - OCR engine selection and image preprocessing
|
||||
- **chunking-embeddings** - Post-extraction text splitting with FastEmbed
|
||||
- **api-server-mcp** - Axum endpoint for extraction pipeline exposure and MCP server
|
||||
78
.ai-rulez/skills/format-specific-extraction/SKILL.md
Normal file
78
.ai-rulez/skills/format-specific-extraction/SKILL.md
Normal file
@@ -0,0 +1,78 @@
|
||||
---
|
||||
name: format-specific-extraction
|
||||
description: "Format-specific document extraction workflows"
|
||||
priority: high
|
||||
---
|
||||
|
||||
# Format-Specific Extraction Workflows
|
||||
|
||||
## Office XML (DOCX/PPTX/ODT)
|
||||
|
||||
```text
|
||||
ZIP archive → Security validation → XML parsing → Text + tables + metadata
|
||||
```
|
||||
|
||||
1. `ZipBombValidator::new(limits).validate(&mut archive)?`
|
||||
2. Extract XML files from archive (`word/document.xml`, `ppt/slides/*.xml`, `content.xml`)
|
||||
3. Parse with `quick-xml::Reader` (streaming) + `DepthValidator` + `StringGrowthValidator`
|
||||
4. Extract metadata via `crate::extraction::office_metadata::extract_metadata()`
|
||||
5. See: `extractors/docx.rs`, `extractors/pptx.rs`, `extractors/odt.rs`
|
||||
|
||||
## PDF
|
||||
|
||||
```text
|
||||
Bytes → pdf_oxide → Per-page text + OCR fallback → Tables → Metadata
|
||||
```
|
||||
|
||||
1. `pdf_oxide::PdfDocument::from_bytes(content)?`
|
||||
2. Check if needs OCR: `config.force_ocr || !has_searchable_text()`
|
||||
3. Extract text per page, tables if `config.pages` enabled
|
||||
4. Feature-gated: `#[cfg(feature = "pdf")]`
|
||||
5. See: `extractors/pdf/mod.rs`
|
||||
|
||||
## Archives (ZIP/TAR/7z/GZIP)
|
||||
|
||||
```text
|
||||
Validate → Extract metadata → Extract plaintext files only
|
||||
```
|
||||
|
||||
1. `ZipBombValidator` BEFORE any extraction
|
||||
2. Extract metadata (file list, sizes)
|
||||
3. Extract text content from plaintext files
|
||||
4. Use `build_archive_result()` helper
|
||||
5. See: `extractors/archive.rs`, `extraction/archive/*.rs`
|
||||
|
||||
## Structured Text (JSON/YAML/TOML/XML)
|
||||
|
||||
```text
|
||||
Detect format from MIME → Parse → Pretty-print → Metadata
|
||||
```
|
||||
|
||||
Single `StructuredExtractor` handles multiple MIME types. Parse with format-specific library, pretty-print to text.
|
||||
See: `extractors/structured.rs`
|
||||
|
||||
## Email (EML/MSG)
|
||||
|
||||
```text
|
||||
Parse headers → Extract body (text/html) → Process attachments
|
||||
```
|
||||
|
||||
See: `extraction/email.rs`, `extractors/email.rs`
|
||||
|
||||
## Common Helpers
|
||||
|
||||
| Helper | Location | Purpose |
|
||||
| ------------------------------------- | --------------------------- | ------------------------------ |
|
||||
| `office_metadata::extract_metadata()` | `extraction/office.rs` | Office XML metadata |
|
||||
| `cells_to_markdown()` | `extraction/mod.rs` | Convert cell grid to GFM table |
|
||||
| `build_archive_result()` | `extraction/archive/mod.rs` | Standard archive result |
|
||||
|
||||
## Adding a New Format
|
||||
|
||||
1. Add MIME type to `EXT_TO_MIME` in `core/mime.rs`
|
||||
2. Create extractor implementing `DocumentExtractor` trait
|
||||
3. Set `supported_mime_types()` and `priority()` (default: 50)
|
||||
4. Register in `extractors/mod.rs` → `register_default_extractors()`
|
||||
5. Feature-gate if optional: `#[cfg(feature = "my-format")]`
|
||||
6. Apply security validators for user content
|
||||
7. Add tests with fixture files
|
||||
97
.ai-rulez/skills/plugin-architecture-patterns/SKILL.md
Normal file
97
.ai-rulez/skills/plugin-architecture-patterns/SKILL.md
Normal file
@@ -0,0 +1,97 @@
|
||||
---
|
||||
name: plugin-architecture-patterns
|
||||
description: "Plugin architecture, registration, and trait patterns"
|
||||
priority: critical
|
||||
---
|
||||
|
||||
# Plugin Architecture & Registration
|
||||
|
||||
## Plugin Types
|
||||
|
||||
| Type | Trait | Location |
|
||||
| ------------------ | --------------------------- | ---------------------------- |
|
||||
| Document Extractor | `DocumentExtractor: Plugin` | `plugins/extractor/trait.rs` |
|
||||
| OCR Backend | `OcrBackend: Plugin` | `plugins/ocr/trait.rs` |
|
||||
| Post Processor | `PostProcessor: Plugin` | `plugins/processor/trait.rs` |
|
||||
| Validator | `Validator: Plugin` | `plugins/validator/trait.rs` |
|
||||
|
||||
## DocumentExtractor Implementation
|
||||
|
||||
```rust
|
||||
use crate::plugins::{DocumentExtractor, Plugin};
|
||||
use async_trait::async_trait;
|
||||
|
||||
pub struct MyExtractor;
|
||||
|
||||
impl Plugin for MyExtractor {
|
||||
fn name(&self) -> &str { "my-extractor" }
|
||||
fn version(&self) -> String { env!("CARGO_PKG_VERSION").to_string() }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for MyExtractor {
|
||||
async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
||||
-> Result<ExtractionResult> { /* ... */ }
|
||||
|
||||
fn supported_mime_types(&self) -> &[&str] { &["application/x-custom"] }
|
||||
fn priority(&self) -> i32 { 50 }
|
||||
|
||||
// WASM support (optional)
|
||||
fn as_sync_extractor(&self) -> Option<&dyn SyncExtractor> { None }
|
||||
}
|
||||
```
|
||||
|
||||
## Priority System
|
||||
|
||||
| Range | Use |
|
||||
| ------ | ------------------------- |
|
||||
| 0-25 | Fallback/low-quality |
|
||||
| 26-49 | Alternative extractors |
|
||||
| **50** | **Default (built-in)** |
|
||||
| 51-75 | Premium/enhanced |
|
||||
| 76-100 | Specialized/high-priority |
|
||||
|
||||
Registry selects **highest priority** extractor for each MIME type. Override built-ins with priority > 50.
|
||||
|
||||
## Registration
|
||||
|
||||
```rust
|
||||
// In extractors/mod.rs → register_default_extractors()
|
||||
let registry = get_document_extractor_registry();
|
||||
let mut registry = registry.write()
|
||||
.map_err(|e| KreuzbergError::Other(format!("Registry lock poisoned: {}", e)))?;
|
||||
registry.register(Arc::new(MyExtractor::new()))?;
|
||||
```
|
||||
|
||||
## Feature-Gated Registration
|
||||
|
||||
```rust
|
||||
#[cfg(feature = "office")]
|
||||
{
|
||||
registry.register(Arc::new(DocxExtractor::new()))?;
|
||||
registry.register(Arc::new(PptxExtractor::new()))?;
|
||||
}
|
||||
```
|
||||
|
||||
## PostProcessor Pattern
|
||||
|
||||
```rust
|
||||
impl PostProcessor for MyProcessor {
|
||||
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
|
||||
-> Result<()> {
|
||||
result.content = process_content(&result.content);
|
||||
Ok(())
|
||||
}
|
||||
fn stage(&self) -> ProcessorStage { ProcessorStage::Middle }
|
||||
}
|
||||
```
|
||||
|
||||
Stages: `Early` → `Middle` → `Late`. Failures are isolated (a failing processor doesn't block the others).
|
||||
|
||||
## Critical Rules
|
||||
|
||||
1. All plugins **MUST be `Send + Sync`**
|
||||
2. Feature gate with `#[cfg(feature = "...")]` for optional formats
|
||||
3. Use `#[async_trait]` for `DocumentExtractor`
|
||||
4. Initialization via `ensure_initialized()` (lazy, called before first extraction)
|
||||
5. Plugin names: kebab-case (e.g., `"pdf-extractor"`)
|
||||
Reference in New Issue
Block a user