#!/usr/bin/env python3
"""
Vendor the kreuzberg core crates into the Ruby package.

Used by: ci-ruby.yaml - "Vendor kreuzberg core crate" step

This script:
1. Reads workspace.dependencies from the root Cargo.toml
2. Copies the core crates to packages/ruby/vendor/
3. Replaces `workspace = true` with explicit versions
4. Generates vendor/Cargo.toml with a proper workspace setup
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import shutil
|
||
|
|
import re
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
try:
|
||
|
|
import tomllib
|
||
|
|
except ImportError:
|
||
|
|
import tomli as tomllib # type: ignore[import-not-found]
|
||
|
|
|
||
|
|
|
||
|
|
def get_repo_root() -> Path:
    """Locate the repository root directory.

    A non-empty ``REPO_ROOT`` environment variable takes precedence and is
    returned as given (not resolved). Otherwise the root is the directory
    three levels above the directory that contains this script.
    """
    override = os.environ.get("REPO_ROOT")
    if not override:
        here = Path(__file__).parent.absolute()
        return here.joinpath("..", "..", "..").resolve()
    return Path(override)
|
||
|
|
|
||
|
|
|
||
|
|
def read_toml(path: Path) -> dict[str, object]:
|
||
|
|
"""Read TOML file."""
|
||
|
|
with open(path, "rb") as f:
|
||
|
|
return tomllib.load(f)
|
||
|
|
|
||
|
|
|
||
|
|
def get_workspace_deps(repo_root: Path) -> dict[str, object]:
    """Return the ``[workspace.dependencies]`` table of the root Cargo.toml.

    An empty dict is returned when the manifest has no ``workspace`` table or
    no ``dependencies`` entry beneath it.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    workspace_table = manifest.get("workspace", {})
    return workspace_table.get("dependencies", {})
|
||
|
|
|
||
|
|
|
||
|
|
def get_workspace_version(repo_root: Path) -> str:
    """Return ``workspace.package.version`` from the root Cargo.toml.

    Falls back to ``"4.0.0"`` when the manifest does not declare a workspace
    package version.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    package_table = manifest.get("workspace", {}).get("package", {})
    return package_table.get("version", "4.0.0")
|
||
|
|
|
||
|
|
|
||
|
|
def format_dependency(name: str, dep_spec: object) -> str:
    """Render one dependency as a single ``Cargo.toml`` line.

    Args:
        name: Dependency key to put on the left-hand side.
        dep_spec: Either a bare version string or a table of dependency fields.

    Returns:
        ``name = "<spec>"`` when *dep_spec* is not a dict; otherwise an inline
        table ``name = { ... }`` containing, in this order, the non-empty
        ``package``, ``git``, ``branch``, ``tag``, ``rev``, ``path`` and
        ``version`` strings, a non-empty ``features`` list, and the
        ``default-features`` / ``optional`` flags when they are real booleans.
        Any other key of the table is not emitted.
    """
    if not isinstance(dep_spec, dict):
        # Plain version strings (and anything unexpected) become a quoted value.
        return f'{name} = "{dep_spec}"'

    string_keys = ("package", "git", "branch", "tag", "rev", "path", "version")
    fragments = [f'{key} = "{dep_spec[key]}"' for key in string_keys if dep_spec.get(key)]

    feature_names = dep_spec.get("features", [])
    if feature_names:
        quoted = ", ".join(f'"{feature}"' for feature in feature_names)
        fragments.append(f"features = [{quoted}]")

    # Flags are written only when explicitly True or False; a missing flag is
    # left out so Cargo's own default applies.
    for flag_key in ("default-features", "optional"):
        flag = dep_spec.get(flag_key)
        if flag is True or flag is False:
            fragments.append(f"{flag_key} = {'true' if flag else 'false'}")

    return f"{name} = {{ " + ", ".join(fragments) + " }"
|
||
|
|
|
||
|
|
|
||
|
|
def _split_inline_fields(text: str) -> dict[str, str]:
    """Split the body of a TOML inline table into ``{key: raw_value}``.

    A comma separates two fields only when it sits outside array brackets and
    outside a quoted string, so ``features = ["a", "b"]`` and
    ``version = ">=1.2, <2"`` each stay in one piece. Values are returned as
    raw TOML text (quotes and brackets included). Fragments without ``=`` are
    ignored.
    """
    fields: dict[str, str] = {}
    buffer: list[str] = []
    bracket_depth = 0
    quote = ""  # quote character of the string being scanned; "" when outside one
    escaped = False

    def flush() -> None:
        field = "".join(buffer).strip()
        buffer.clear()
        if "=" in field:
            key, value = field.split("=", 1)
            fields[key.strip()] = value.strip()

    for char in text:
        if quote:
            buffer.append(char)
            if escaped:
                escaped = False
            elif char == "\\" and quote == '"':
                # Backslash escapes exist only in basic (double-quoted) strings.
                escaped = True
            elif char == quote:
                quote = ""
            continue
        if char == "," and bracket_depth == 0:
            flush()
            continue
        if char in "\"'":
            quote = char
        elif char == "[":
            bracket_depth += 1
        elif char == "]":
            bracket_depth -= 1
        buffer.append(char)

    flush()
    return fields


def _merge_feature_arrays(workspace_value: str, crate_value: str) -> str:
    """Return a TOML array with the ordered, de-duplicated union of two raw feature arrays."""
    names = re.findall(r"""["']([^"']*)["']""", f"{workspace_value} {crate_value}")
    return "[" + ", ".join(f'"{feature}"' for feature in dict.fromkeys(names)) + "]"


def _expand_inherited_dependency(name: str, base_spec: str, crate_fields_text: str) -> str:
    """Combine a rendered workspace spec with the extra fields a crate set next to ``workspace = true``."""
    if " = { " in base_spec:
        workspace_body = base_spec.split(" = { ", 1)[1].rstrip("} ").rstrip("}")
    else:
        # Simple string dep like `ctor = "0.6"` - treat the string as the version.
        version_val = base_spec.split(" = ", 1)[1].strip('"')
        workspace_body = f'version = "{version_val}"'

    workspace_fields = _split_inline_fields(workspace_body)
    crate_fields = _split_inline_fields(crate_fields_text.strip())

    # Crate-specific fields override workspace fields ...
    merged = {**workspace_fields, **crate_fields}
    # ... except `features`: Cargo adds a member's features to the ones
    # declared in [workspace.dependencies] instead of replacing them.
    if "features" in workspace_fields and "features" in crate_fields:
        merged["features"] = _merge_feature_arrays(
            workspace_fields["features"], crate_fields["features"]
        )

    merged_spec = ", ".join(f"{key} = {value}" for key, value in merged.items())
    return f"{name} = {{ {merged_spec} }}"


def replace_workspace_deps_in_toml(toml_path: Path, workspace_deps: dict[str, object]) -> None:
    """Replace ``workspace = true`` dependencies in a Cargo.toml with explicit specs.

    Two line shapes are rewritten in place for every workspace dependency:

    * ``name = { workspace = true }`` becomes the spec rendered by
      ``format_dependency``.
    * ``name = { workspace = true, <fields> }`` becomes the workspace spec
      merged with ``<fields>``; crate fields win on conflict, and ``features``
      lists are unioned.

    Args:
        toml_path: Cargo.toml to rewrite.
        workspace_deps: Mapping of dependency name to its workspace spec.
    """
    # TOML documents are UTF-8; do not depend on the platform default encoding.
    with open(toml_path, "r", encoding="utf-8") as f:
        content = f.read()

    for name, dep_spec in workspace_deps.items():
        base_spec = format_dependency(name, dep_spec)
        dep_key = re.escape(name)

        # A callable replacement is inserted verbatim, whereas a replacement
        # string would have its backslashes interpreted as regex escapes.
        content = re.sub(
            rf"^{dep_key} = \{{ workspace = true \}}$",
            lambda _match, spec=base_spec: spec,
            content,
            flags=re.MULTILINE,
        )

        # `[^{}]` still permits a multi-line features array but stops the match
        # at the first brace, so it cannot run on into following dependencies.
        content = re.sub(
            rf"^{dep_key} = \{{ workspace = true, ([^{{}}]+?) \}}$",
            lambda match, dep=name, spec=base_spec: _expand_inherited_dependency(
                dep, spec, match.group(1)
            ),
            content,
            flags=re.MULTILINE,
        )

    with open(toml_path, "w", encoding="utf-8") as f:
        f.write(content)
|
||
|
|
|
||
|
|
|
||
|
|
def generate_vendor_cargo_toml(repo_root: Path, workspace_deps: dict[str, object], core_version: str, copied_crates: list[str]) -> None:
    """Generate vendor/Cargo.toml with workspace setup.

    Writes ``<repo_root>/packages/ruby/vendor/Cargo.toml`` (creating the
    directory if needed) with a ``[workspace]`` members list, a fixed
    ``[workspace.package]`` table and the workspace dependencies sorted by name.

    Args:
        repo_root: Repository root directory
        workspace_deps: Workspace dependencies from Cargo.toml
        core_version: Core version string
        copied_crates: List of crates that were successfully copied
    """
    deps_str = "\n".join(
        format_dependency(name, dep_spec) for name, dep_spec in sorted(workspace_deps.items())
    )

    # Members keep a fixed order and only include crates that were actually copied.
    known_crates = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "rb-sys"]
    members = [name for name in known_crates if name in copied_crates]
    members_str = ", ".join(f'"{m}"' for m in members)

    manifest_lines = [
        "[workspace]",
        f"members = [{members_str}]",
        "",
        "[workspace.package]",
        f'version = "{core_version}"',
        'edition = "2024"',
        'rust-version = "1.91"',
        'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]',
        'license = "MIT"',
        'repository = "https://github.com/kreuzberg-dev/kreuzberg"',
        'homepage = "https://kreuzberg.dev"',
        "",
        "[workspace.dependencies]",
        deps_str,
        "",  # the manifest ends with a newline
    ]
    vendor_toml = "\n".join(manifest_lines)

    vendor_dir = repo_root / "packages" / "ruby" / "vendor"
    vendor_dir.mkdir(parents=True, exist_ok=True)

    # TOML documents are UTF-8; do not depend on the platform default encoding.
    with open(vendor_dir / "Cargo.toml", "w", encoding="utf-8") as f:
        f.write(vendor_toml)
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Main vendoring function.

    Steps, in order:
      1. Read the workspace dependencies and version from the root Cargo.toml.
      2. Remove previously vendored crate directories and vendor/Cargo.toml
         (other content of the vendor directory is left alone).
      3. Copy each source crate that exists into packages/ruby/vendor/.
      4. Delete build artifacts and editor temp files from the copies.
      5. Replace workspace-inherited package fields and dependencies in every
         copied Cargo.toml with explicit values.
      6. Point kreuzberg-ffi and kreuzberg at their vendored sibling crates.
      7. Generate vendor/Cargo.toml.
      8. Point the native extension's Cargo.toml at the vendored crates.

    All Cargo.toml files are read and written as UTF-8.
    """
    repo_root: Path = get_repo_root()

    print("=== Vendoring kreuzberg core crate ===")

    workspace_deps: dict[str, object] = get_workspace_deps(repo_root)
    core_version: str = get_workspace_version(repo_root)

    print(f"Core version: {core_version}")
    print(f"Workspace dependencies: {len(workspace_deps)}")

    vendor_base: Path = repo_root / "packages" / "ruby" / "vendor"

    # (source path relative to the repo root, directory name under vendor/)
    crates_to_copy: list[tuple[str, str]] = [
        ("crates/kreuzberg", "kreuzberg"),
        ("crates/kreuzberg-ffi", "kreuzberg-ffi"),
        ("crates/kreuzberg-tesseract", "kreuzberg-tesseract"),
        ("crates/kreuzberg-paddle-ocr", "kreuzberg-paddle-ocr"),
        ("vendor/rb-sys", "rb-sys"),
    ]

    # Clean only crate directories, preserving vendor/bundle/ (Bundler gems)
    for _, dest_name in crates_to_copy:
        stale_dir = vendor_base / dest_name
        if stale_dir.exists():
            shutil.rmtree(stale_dir)
    # Also clean the vendor Cargo.toml (will be regenerated)
    vendor_cargo = vendor_base / "Cargo.toml"
    if vendor_cargo.exists():
        vendor_cargo.unlink()
    print("Cleaned vendor crate directories")

    vendor_base.mkdir(parents=True, exist_ok=True)

    copied_crates: list[str] = []
    for src_rel, dest_name in crates_to_copy:
        src: Path = repo_root / src_rel
        dest: Path = vendor_base / dest_name
        if not src.exists():
            print(f"Warning: Source directory not found: {src_rel}")
            continue
        # Copying is best-effort: a failed crate is reported and left out of
        # `copied_crates`, and the remaining crates are still processed.
        try:
            shutil.copytree(src, dest)
            copied_crates.append(dest_name)
            print(f"Copied {dest_name}")
        except Exception as e:
            print(f"Warning: Failed to copy {dest_name}: {e}", file=sys.stderr)

    artifact_dirs: list[str] = [".fastembed_cache", "target"]
    temp_patterns: list[str] = ["*.swp", "*.bak", "*.tmp", "*~"]

    for crate_dir in copied_crates:
        crate_path: Path = vendor_base / crate_dir
        if crate_path.exists():
            for artifact_dir in artifact_dirs:
                artifact: Path = crate_path / artifact_dir
                if artifact.exists():
                    shutil.rmtree(artifact)

            for pattern in temp_patterns:
                for stray in crate_path.rglob(pattern):
                    stray.unlink()

    print("Cleaned build artifacts")

    # Update workspace inheritance in Cargo.toml files
    package_field_rewrites: list[tuple[str, str]] = [
        (r'^version\.workspace = true$', f'version = "{core_version}"'),
        (r'^edition\.workspace = true$', 'edition = "2024"'),
        (r'^rust-version\.workspace = true$', 'rust-version = "1.91"'),
        (r'^authors\.workspace = true$', 'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]'),
        (r'^license\.workspace = true$', 'license = "MIT"'),
    ]
    for crate_dir in copied_crates:
        crate_toml = vendor_base / crate_dir / "Cargo.toml"
        if crate_toml.exists():
            content = crate_toml.read_text(encoding="utf-8")
            for pattern, replacement in package_field_rewrites:
                content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
            crate_toml.write_text(content, encoding="utf-8")

            # Must run after the write above: it re-reads the file from disk.
            replace_workspace_deps_in_toml(crate_toml, workspace_deps)
            print(f"Updated {crate_dir}/Cargo.toml")

    # Update path dependencies in kreuzberg-ffi crate
    if "kreuzberg-ffi" in copied_crates and "kreuzberg" in copied_crates:
        ffi_toml = vendor_base / "kreuzberg-ffi" / "Cargo.toml"
        if ffi_toml.exists():
            content = ffi_toml.read_text(encoding="utf-8")

            # Replace kreuzberg workspace references with path dependency
            # Handle cases with path, version, or neither
            content = re.sub(
                r'(kreuzberg = \{) (?:(?:path|version) = "[^"]*", )?',
                r'\1 path = "../kreuzberg", ',
                content
            )

            ffi_toml.write_text(content, encoding="utf-8")

    # Update path dependencies in kreuzberg crate for the optional OCR crates
    if "kreuzberg" in copied_crates:
        kreuzberg_toml = vendor_base / "kreuzberg" / "Cargo.toml"
        if kreuzberg_toml.exists():
            content = kreuzberg_toml.read_text(encoding="utf-8")

            # Only update a sibling's path if that sibling was actually copied
            for sibling in ("kreuzberg-tesseract", "kreuzberg-paddle-ocr"):
                if sibling in copied_crates:
                    content = re.sub(
                        rf'{sibling} = \{{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}}',
                        f'{sibling} = {{ path = "../{sibling}", optional = true }}',
                        content
                    )

            kreuzberg_toml.write_text(content, encoding="utf-8")

    generate_vendor_cargo_toml(repo_root, workspace_deps, core_version, copied_crates)
    print("Generated vendor/Cargo.toml")

    # Update native extension Cargo.toml to use vendored crates
    native_toml = repo_root / "packages" / "ruby" / "ext" / "kreuzberg_rb" / "native" / "Cargo.toml"
    if native_toml.exists():
        content = native_toml.read_text(encoding="utf-8")

        # Replace path dependencies to point to vendored crates
        # From: path = "../../../../../crates/kreuzberg"
        # To: path = "../../../vendor/kreuzberg"
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg"',
            'path = "../../../vendor/kreuzberg"',
            content
        )
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg-ffi"',
            'path = "../../../vendor/kreuzberg-ffi"',
            content
        )

        native_toml.write_text(content, encoding="utf-8")

        print("Updated native extension Cargo.toml to use vendored crates")

    print(f"\nVendoring complete (core version: {core_version})")
    print(f"Copied crates: {', '.join(sorted(copied_crates))}")

    if "kreuzberg" in copied_crates and "kreuzberg-ffi" in copied_crates:
        print("Native extension Cargo.toml uses:")
        print(" - path '../../../vendor/kreuzberg' for kreuzberg crate")
        print(" - path '../../../vendor/kreuzberg-ffi' for kreuzberg-ffi crate")
        if "rb-sys" in copied_crates:
            print(" - path '../../../vendor/rb-sys' for rb-sys crate")
        else:
            print(" - rb-sys from crates.io")
    else:
        print("Warning: Some required crates were not copied. Check for missing source directories.")
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: any failure is reported as a one-line message on
    # stderr and turned into exit status 1.
    try:
        main()
    except Exception as error:
        print(f"Error: {error}", file=sys.stderr)
        raise SystemExit(1)
|