Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Compile the Ruby native extension with verbose diagnostics.
#
# Flow: resolve the repository root, source the shared helper libraries,
# print toolchain and environment details, run the core-crate vendoring
# script, run `rake compile`, then list the built artifacts and try to load
# the extension. Directory listings are diagnostics only and are written so
# they can never abort the build on their own.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# REPO_ROOT may be provided by the caller; otherwise it is three levels up.
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

source "$REPO_ROOT/scripts/lib/common.sh"
source "$REPO_ROOT/scripts/lib/library-paths.sh"
validate_repo_root "$REPO_ROOT" || exit 1
setup_rust_ffi_paths "$REPO_ROOT"

echo "=== Compiling Ruby native extension (Verbose Debug) ==="
cd "$REPO_ROOT/packages/ruby"

export CARGO_BUILD_JOBS=1
export RUST_BACKTRACE=1
export RB_SYS_VERBOSE=1

echo ""
echo "=== Pre-compilation environment ==="
echo "Ruby version: $(ruby --version)"
echo "Ruby platform: $(ruby -e 'puts RUBY_PLATFORM')"
echo "Rustc version: $(rustc --version)"
echo "Cargo version: $(cargo --version)"
echo "Working directory: $(pwd)"

echo ""
echo "=== Build configuration variables ==="
echo "CARGO_BUILD_JOBS: ${CARGO_BUILD_JOBS}"
echo "RUST_BACKTRACE: ${RUST_BACKTRACE}"
echo "RB_SYS_VERBOSE: ${RB_SYS_VERBOSE}"
echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-<not set>}"
echo "DYLD_LIBRARY_PATH: ${DYLD_LIBRARY_PATH:-<not set>}"

echo ""
echo "=== Pre-vendor directory state ==="
echo "packages/ruby directory contents:"
# -maxdepth is a global option and belongs before the tests; the group makes
# the "file or directory" intent explicit. `|| true` stops a SIGPIPE from
# `head` (surfaced by pipefail) from aborting the build under `set -e`.
find . -maxdepth 1 \( -type f -o -type d \) | head -20 || true

echo ""
echo "=== Vendoring kreuzberg core ==="
python3 "$REPO_ROOT/scripts/ci/ruby/vendor-kreuzberg-core.py"

echo ""
echo "=== Post-vendor directory state ==="
# NOTE(review): the vendoring script appears to write to packages/ruby/vendor;
# confirm that ext/kreuzberg_rb/vendor is the directory meant to be checked.
if [ -d "ext/kreuzberg_rb/vendor" ]; then
  echo "Vendor directory contents:"
  find ext/kreuzberg_rb/vendor -maxdepth 2 -type f | head -10 || true
else
  echo "WARNING: No vendor directory found in ext/kreuzberg_rb"
fi

echo ""
echo "=== Running rake compile with verbose output ==="
bundle exec rake compile --verbose --trace 2>&1 || {
  echo ""
  echo "ERROR: rake compile failed"
  echo "=== Attempting to capture compilation error details ==="
  if [ -f "mkmf.log" ]; then
    echo "=== mkmf.log (last 150 lines) ==="
    tail -150 mkmf.log
  fi
  echo ""
  echo "=== Looking for compiled artifacts ==="
  # Best-effort listing: must not prevent the remaining diagnostics.
  find . -name "*.so" -o -name "*.dll" -o -name "*.dylib" 2>/dev/null | head -20 || true
  echo ""
  echo "=== Checking gem installation ==="
  gem list kreuzberg || echo "Gem not found"
  exit 1
}

echo ""
echo "=== Post-compilation directory state ==="
echo "lib/ contents:"
if [ -d "lib" ]; then
  # `find` exits 0 even when nothing matches, so the fallback message has to
  # be driven by the captured output. The parentheses make `-type f` apply to
  # every pattern. `.bundle` is the extension Ruby uses for native extensions
  # on macOS.
  compiled_artifacts="$(find lib -type f \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" -o -name "*.bundle" \) 2>/dev/null || true)"
  if [ -n "$compiled_artifacts" ]; then
    echo "$compiled_artifacts"
  else
    echo "No compiled extension found"
  fi
else
  echo "ERROR: lib directory not found"
fi

echo ""
echo "=== Verifying extension can be loaded ==="
# A failed load is reported but deliberately does not fail the script.
ruby -e "require_relative 'lib/kreuzberg'; puts 'Extension loaded successfully'" 2>&1 || {
  echo "WARNING: Could not load extension directly"
  echo "This might be expected if gem installation is required"
}

echo ""
echo "=== Compilation complete ==="

View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Install Bundler: try the pinned 4.0.3 release first and fall back to
# whatever version `gem` resolves by default if that install fails, then
# print the version that ended up on PATH.
set -euo pipefail

if ! gem install bundler -v 4.0.3 --no-document; then
  gem install bundler --no-document
fi

bundler --version

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Install the Ruby package's gem dependencies with Bundler.
#
# When GITHUB_ENV is set, BUNDLE_GEMFILE and BUNDLE_PATH are appended to it
# (only if not already set in the environment) so the same Bundler settings
# are visible to whatever reads that file afterwards.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

source "$REPO_ROOT/scripts/lib/common.sh"
validate_repo_root "$REPO_ROOT" || exit 1

echo "=== Installing Ruby dependencies ==="
cd "$REPO_ROOT/packages/ruby"

# A caller-provided BUNDLE_PATH wins; otherwise install under the package.
bundle_path="${BUNDLE_PATH:-$REPO_ROOT/packages/ruby/.bundle/bundle}"

if [[ -n "${GITHUB_ENV:-}" ]]; then
  [[ -n "${BUNDLE_GEMFILE:-}" ]] || echo "BUNDLE_GEMFILE=$REPO_ROOT/packages/ruby/Gemfile" >>"$GITHUB_ENV"
  [[ -n "${BUNDLE_PATH:-}" ]] || echo "BUNDLE_PATH=$bundle_path" >>"$GITHUB_ENV"
fi

bundle config set deployment false
bundle config set path "$bundle_path"
bundle install --jobs 4

echo "Ruby dependencies installed"

View File

@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
Vendor kreuzberg core crate into Ruby package
Used by: ci-ruby.yaml - Vendor kreuzberg core crate step
This script:
1. Reads workspace.dependencies from root Cargo.toml
2. Copies core crates to packages/ruby/vendor/
3. Replaces workspace = true with explicit versions
4. Generates vendor/Cargo.toml with proper workspace setup
"""
import os
import sys
import shutil
import re
from pathlib import Path
try:
import tomllib
except ImportError:
import tomli as tomllib # type: ignore[import-not-found]
def get_repo_root() -> Path:
    """Return the repository root.

    A non-empty ``REPO_ROOT`` environment variable is used as-is. Otherwise
    the root is taken to be three directories above the one containing this
    script, resolved to an absolute path.
    """
    override = os.environ.get("REPO_ROOT")
    if not override:
        here = Path(__file__).parent.absolute()
        return here.joinpath("..", "..", "..").resolve()
    return Path(override)
def read_toml(path: Path) -> dict[str, object]:
"""Read TOML file."""
with open(path, "rb") as f:
return tomllib.load(f)
def get_workspace_deps(repo_root: Path) -> dict[str, object]:
    """Return the ``[workspace.dependencies]`` table of the root Cargo.toml.

    An empty dict is returned when the ``workspace`` table or its
    ``dependencies`` entry is absent.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    workspace_table = manifest.get("workspace", {})
    return workspace_table.get("dependencies", {})
def get_workspace_version(repo_root: Path) -> str:
    """Return ``workspace.package.version`` from the root Cargo.toml.

    Falls back to ``"4.0.0"`` when any level of that lookup is missing.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    package_table = manifest.get("workspace", {}).get("package", {})
    return package_table.get("version", "4.0.0")
def format_dependency(name: str, dep_spec: object) -> str:
    """Render one dependency as a single Cargo.toml line.

    Args:
        name: Dependency key as it should appear in the manifest.
        dep_spec: Either a plain version requirement or a table of
            dependency fields, as parsed from TOML.

    Returns:
        ``name = "<spec>"`` for anything that is not a dict, otherwise
        ``name = { ... }`` containing the recognised fields in a fixed order:
        package, git, branch, tag, rev, path, version, features,
        default-features, optional. Unrecognised keys are not emitted.
    """
    if not isinstance(dep_spec, dict):
        return f'{name} = "{dep_spec}"'

    rendered: list[str] = []

    # String-valued fields: emitted only when present and non-empty.
    for key in ("package", "git", "branch", "tag", "rev", "path", "version"):
        value = dep_spec.get(key)
        if value:
            rendered.append(f'{key} = "{value}"')

    feature_names = dep_spec.get("features", [])
    if feature_names:
        quoted = ", ".join(f'"{feature}"' for feature in feature_names)
        rendered.append(f"features = [{quoted}]")

    # Boolean fields: emitted only for a real True/False, so an absent key
    # stays absent instead of being written out as a default.
    for key in ("default-features", "optional"):
        flag = dep_spec.get(key)
        if flag is True:
            rendered.append(f"{key} = true")
        elif flag is False:
            rendered.append(f"{key} = false")

    body = ", ".join(rendered)
    return f"{name} = {{ {body} }}"
def _split_inline_fields(spec: str) -> dict[str, str]:
    """Split ``key = value, key = [a, b]`` text into an ordered ``{key: value}``.

    Commas nested inside ``[...]`` do not end a field, so array values stay
    intact. Pieces without an ``=`` are ignored. Keys and values are returned
    as stripped source text (values keep their quotes/brackets).
    """
    pieces: list[str] = []
    depth = 0
    start = 0
    for index, char in enumerate(spec):
        if char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
        elif char == "," and depth == 0:
            pieces.append(spec[start:index])
            start = index + 1
    pieces.append(spec[start:])

    fields: dict[str, str] = {}
    for piece in pieces:
        piece = piece.strip()
        if "=" in piece:
            key, value = piece.split("=", 1)
            fields[key.strip()] = value.strip()
    return fields


def _merge_feature_arrays(workspace_value: str, crate_value: str) -> str:
    """Return a TOML array holding the union of two feature arrays.

    Workspace features come first, crate features follow, and duplicates are
    dropped while keeping first-seen order.
    """
    names: list[str] = []
    for array_text in (workspace_value, crate_value):
        for match in re.finditer(r'"([^"]*)"|\'([^\']*)\'', array_text):
            double_quoted, single_quoted = match.groups()
            names.append(double_quoted if double_quoted is not None else single_quoted)
    unique_names = dict.fromkeys(names)
    return "[" + ", ".join(f'"{feature}"' for feature in unique_names) + "]"


def replace_workspace_deps_in_toml(toml_path: Path, workspace_deps: dict[str, object]) -> None:
    """Rewrite ``workspace = true`` dependencies in a Cargo.toml to explicit specs.

    Two line shapes are rewritten for every workspace dependency ``name``:

    * ``name = { workspace = true }`` becomes the formatted workspace spec.
    * ``name = { workspace = true, <fields> }`` becomes the workspace spec
      merged with ``<fields>``. Crate-level fields override workspace fields,
      except ``features``, which are unioned because Cargo treats features on
      an inherited dependency as additive to the workspace's features.

    Lines in any other shape (for example ``name.workspace = true``) are left
    untouched. The file is read and written as UTF-8, the encoding TOML
    mandates, instead of the platform's locale encoding.

    Args:
        toml_path: Manifest to rewrite in place.
        workspace_deps: The root manifest's ``[workspace.dependencies]`` table.
    """
    content = toml_path.read_text(encoding="utf-8")

    for name, dep_spec in workspace_deps.items():
        explicit = format_dependency(name, dep_spec)
        escaped_name = re.escape(name)

        # A callable replacement inserts the text literally; a plain string
        # would have backslashes interpreted as regex group references.
        content = re.sub(
            rf'^{escaped_name} = \{{ workspace = true \}}$',
            lambda _match, text=explicit: text,
            content,
            flags=re.MULTILINE,
        )

        if " = { " in explicit:
            inherited = explicit.split(" = { ", 1)[1].rstrip("} ").rstrip("}")
        else:
            # Simple string dep like `ctor = "0.6"` - treat it as a version field
            version_val = explicit.split(" = ", 1)[1].strip('"')
            inherited = f'version = "{version_val}"'
        workspace_fields = _split_inline_fields(inherited)

        def replace_with_fields(
            match: re.Match[str],
            name: str = name,
            workspace_fields: dict[str, str] = workspace_fields,
        ) -> str:
            crate_fields = _split_inline_fields(match.group(1).strip())
            merged_fields = {**workspace_fields, **crate_fields}

            workspace_features = workspace_fields.get("features", "")
            crate_features = crate_fields.get("features", "")
            if workspace_features.startswith("[") and crate_features.startswith("["):
                merged_fields["features"] = _merge_feature_arrays(workspace_features, crate_features)

            merged_spec = ", ".join(f"{key} = {value}" for key, value in merged_fields.items())
            return f"{name} = {{ {merged_spec} }}"

        # DOTALL lets the extra fields span lines (multi-line feature arrays).
        content = re.sub(
            rf'^{escaped_name} = \{{ workspace = true, (.+?) \}}$',
            replace_with_fields,
            content,
            flags=re.MULTILINE | re.DOTALL,
        )

    toml_path.write_text(content, encoding="utf-8")
def generate_vendor_cargo_toml(repo_root: Path, workspace_deps: dict[str, object], core_version: str, copied_crates: list[str]) -> None:
    """Generate vendor/Cargo.toml with workspace setup.

    The manifest is written to ``<repo_root>/packages/ruby/vendor/Cargo.toml``
    (the directory is created if needed) as UTF-8, the encoding TOML requires,
    rather than the platform's locale encoding.

    Args:
        repo_root: Repository root directory
        workspace_deps: Workspace dependencies from Cargo.toml; emitted sorted
            by name under ``[workspace.dependencies]``
        core_version: Core version string, written as the workspace version
        copied_crates: List of crates that were successfully copied; only
            known crate names found in this list become workspace members
    """
    deps_str = "\n".join(
        format_dependency(name, dep_spec) for name, dep_spec in sorted(workspace_deps.items())
    )

    # Build members list based on actually copied crates, in a fixed order
    known_crates = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "rb-sys"]
    members_str = ", ".join(f'"{member}"' for member in known_crates if member in copied_crates)

    vendor_toml = f'''[workspace]
members = [{members_str}]
[workspace.package]
version = "{core_version}"
edition = "2024"
rust-version = "1.91"
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
license = "MIT"
repository = "https://github.com/kreuzberg-dev/kreuzberg"
homepage = "https://kreuzberg.dev"
[workspace.dependencies]
{deps_str}
'''

    vendor_dir = repo_root / "packages" / "ruby" / "vendor"
    vendor_dir.mkdir(parents=True, exist_ok=True)
    (vendor_dir / "Cargo.toml").write_text(vendor_toml, encoding="utf-8")
def main() -> None:
    """Vendor the core crates into ``packages/ruby/vendor`` and rewire manifests.

    Steps, in order:

    1. Remove previously vendored crate directories and ``vendor/Cargo.toml``
       (other content under ``vendor/`` is left alone).
    2. Copy every source crate that exists; a missing source or a failed copy
       is reported as a warning and that crate is skipped.
    3. Delete build artifact directories and editor temp files from the copies.
    4. Replace workspace-inherited package fields and dependencies in each
       copied ``Cargo.toml`` with explicit values.
    5. Point inter-crate dependencies at their vendored siblings.
    6. Generate ``vendor/Cargo.toml`` and repoint the native extension
       manifest at the vendored crates.

    Manifests are read and written as UTF-8 (the encoding TOML mandates)
    rather than the platform's locale encoding.
    """
    repo_root: Path = get_repo_root()
    print("=== Vendoring kreuzberg core crate ===")
    workspace_deps: dict[str, object] = get_workspace_deps(repo_root)
    core_version: str = get_workspace_version(repo_root)
    print(f"Core version: {core_version}")
    print(f"Workspace dependencies: {len(workspace_deps)}")

    vendor_base: Path = repo_root / "packages" / "ruby" / "vendor"
    # (source path relative to the repo root, directory name under vendor/)
    crates_to_copy: list[tuple[str, str]] = [
        ("crates/kreuzberg", "kreuzberg"),
        ("crates/kreuzberg-ffi", "kreuzberg-ffi"),
        ("crates/kreuzberg-tesseract", "kreuzberg-tesseract"),
        ("crates/kreuzberg-paddle-ocr", "kreuzberg-paddle-ocr"),
        ("vendor/rb-sys", "rb-sys"),
    ]

    # Clean only crate directories, preserving vendor/bundle/ (Bundler gems)
    for _, dest_name in crates_to_copy:
        stale_crate = vendor_base / dest_name
        if stale_crate.exists():
            shutil.rmtree(stale_crate)
    # Also clean the vendor Cargo.toml (will be regenerated)
    vendor_cargo = vendor_base / "Cargo.toml"
    if vendor_cargo.exists():
        vendor_cargo.unlink()
    print("Cleaned vendor crate directories")
    vendor_base.mkdir(parents=True, exist_ok=True)

    copied_crates: list[str] = []
    for src_rel, dest_name in crates_to_copy:
        src: Path = repo_root / src_rel
        dest: Path = vendor_base / dest_name
        if src.exists():
            # Best effort: one crate failing to copy must not stop the others.
            try:
                shutil.copytree(src, dest)
                copied_crates.append(dest_name)
                print(f"Copied {dest_name}")
            except Exception as e:
                print(f"Warning: Failed to copy {dest_name}: {e}", file=sys.stderr)
        else:
            print(f"Warning: Source directory not found: {src_rel}")

    artifact_dirs: list[str] = [".fastembed_cache", "target"]
    temp_patterns: list[str] = ["*.swp", "*.bak", "*.tmp", "*~"]
    for crate_dir in copied_crates:
        vendored_crate = vendor_base / crate_dir
        if not vendored_crate.exists():
            continue
        for artifact_dir in artifact_dirs:
            artifact: Path = vendored_crate / artifact_dir
            if artifact.exists():
                shutil.rmtree(artifact)
        for pattern in temp_patterns:
            for leftover in vendored_crate.rglob(pattern):
                # A directory whose name matches a temp pattern cannot be
                # unlinked; skip it instead of aborting the whole run.
                if leftover.is_file() or leftover.is_symlink():
                    leftover.unlink()
    print("Cleaned build artifacts")

    # Update workspace inheritance in Cargo.toml files
    package_field_rewrites: list[tuple[str, str]] = [
        (r'^version\.workspace = true$', f'version = "{core_version}"'),
        (r'^edition\.workspace = true$', 'edition = "2024"'),
        (r'^rust-version\.workspace = true$', 'rust-version = "1.91"'),
        (r'^authors\.workspace = true$', 'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]'),
        (r'^license\.workspace = true$', 'license = "MIT"'),
    ]
    for crate_dir in copied_crates:
        crate_toml = vendor_base / crate_dir / "Cargo.toml"
        if crate_toml.exists():
            content = crate_toml.read_text(encoding="utf-8")
            for pattern, replacement in package_field_rewrites:
                content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
            crate_toml.write_text(content, encoding="utf-8")
            replace_workspace_deps_in_toml(crate_toml, workspace_deps)
            print(f"Updated {crate_dir}/Cargo.toml")

    # Update path dependencies in kreuzberg-ffi crate
    if "kreuzberg-ffi" in copied_crates and "kreuzberg" in copied_crates:
        ffi_toml = vendor_base / "kreuzberg-ffi" / "Cargo.toml"
        if ffi_toml.exists():
            content = ffi_toml.read_text(encoding="utf-8")
            # Replace kreuzberg workspace references with path dependency
            # Handle cases with path, version, or neither
            content = re.sub(
                r'(kreuzberg = \{) (?:(?:path|version) = "[^"]*", )?',
                r'\1 path = "../kreuzberg", ',
                content
            )
            ffi_toml.write_text(content, encoding="utf-8")

    # Update path dependencies in kreuzberg crate if tesseract was copied
    if "kreuzberg" in copied_crates:
        kreuzberg_toml = vendor_base / "kreuzberg" / "Cargo.toml"
        if kreuzberg_toml.exists():
            content = kreuzberg_toml.read_text(encoding="utf-8")
            # Only update tesseract path if it was actually copied
            if "kreuzberg-tesseract" in copied_crates:
                content = re.sub(
                    r'kreuzberg-tesseract = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
                    'kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }',
                    content
                )
            # Only update paddle-ocr path if it was actually copied
            if "kreuzberg-paddle-ocr" in copied_crates:
                content = re.sub(
                    r'kreuzberg-paddle-ocr = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
                    'kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }',
                    content
                )
            kreuzberg_toml.write_text(content, encoding="utf-8")

    generate_vendor_cargo_toml(repo_root, workspace_deps, core_version, copied_crates)
    print("Generated vendor/Cargo.toml")

    # Update native extension Cargo.toml to use vendored crates
    native_toml = repo_root / "packages" / "ruby" / "ext" / "kreuzberg_rb" / "native" / "Cargo.toml"
    if native_toml.exists():
        content = native_toml.read_text(encoding="utf-8")
        # Replace path dependencies to point to vendored crates
        # From: path = "../../../../../crates/kreuzberg"
        # To: path = "../../../vendor/kreuzberg"
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg"',
            'path = "../../../vendor/kreuzberg"',
            content
        )
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg-ffi"',
            'path = "../../../vendor/kreuzberg-ffi"',
            content
        )
        native_toml.write_text(content, encoding="utf-8")
        print("Updated native extension Cargo.toml to use vendored crates")

    print(f"\nVendoring complete (core version: {core_version})")
    print(f"Copied crates: {', '.join(sorted(copied_crates))}")
    if "kreuzberg" in copied_crates and "kreuzberg-ffi" in copied_crates:
        print("Native extension Cargo.toml uses:")
        print(" - path '../../../vendor/kreuzberg' for kreuzberg crate")
        print(" - path '../../../vendor/kreuzberg-ffi' for kreuzberg-ffi crate")
        if "rb-sys" in copied_crates:
            print(" - path '../../../vendor/rb-sys' for rb-sys crate")
        else:
            print(" - rb-sys from crates.io")
    else:
        print("Warning: Some required crates were not copied. Check for missing source directories.")
# Script entry point: report any failure as a one-line message on stderr and
# exit with status 1 instead of printing a traceback.
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        raise SystemExit(1)