Files
fil/crates/kreuzberg/src/model_download.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

73 lines
2.4 KiB
Rust

//! Shared utilities for downloading and verifying ONNX models from HuggingFace Hub.
//!
//! Used by both layout detection and PaddleOCR model managers.
use std::io::{BufReader, Read};
use std::path::{Path, PathBuf};
use sha2::{Digest, Sha256};
/// Download a file from a HuggingFace Hub repository.
///
/// Uses `hf-hub`'s built-in caching so repeated calls for the same file are fast.
pub(crate) fn hf_download(repo_id: &str, remote_filename: &str) -> Result<PathBuf, String> {
tracing::info!(repo = repo_id, filename = remote_filename, "Downloading via hf-hub");
let api = hf_hub::api::sync::ApiBuilder::from_env()
.with_progress(true)
.build()
.map_err(|e| format!("Failed to initialize HuggingFace Hub API: {e}"))?;
let repo = api.model(repo_id.to_string());
let cached_path = repo
.get(remote_filename)
.map_err(|e| format!("Failed to download '{remote_filename}' from {repo_id}: {e}"))?;
Ok(cached_path)
}
/// Verify the SHA256 checksum of a file using streaming reads.
///
/// Streams the file in 64 KiB chunks to avoid loading large model files (100MB+) entirely
/// into memory. Returns `Ok(())` if the checksum matches or is empty (skip verification).
pub(crate) fn verify_sha256(path: &Path, expected: &str, label: &str) -> Result<(), String> {
if expected.is_empty() {
return Ok(());
}
let file = std::fs::File::open(path).map_err(|e| format!("Failed to open file for checksum: {e}"))?;
let mut reader = BufReader::with_capacity(64 * 1024, file);
let mut hasher = Sha256::new();
let mut buf = [0u8; 64 * 1024];
loop {
let n = reader
.read(&mut buf)
.map_err(|e| format!("Failed to read file for checksum: {e}"))?;
if n == 0 {
break;
}
hasher.update(&buf[..n]);
}
let hash_hex = hex::encode(hasher.finalize());
if hash_hex != expected {
return Err(format!(
"Checksum mismatch for {label}: expected {expected}, got {hash_hex}"
));
}
tracing::debug!(label, "Checksum verified");
Ok(())
}
/// Resolve the kreuzberg cache directory for a given module.
///
/// Delegates to [`crate::cache_dir::resolve_cache_dir`] for centralized,
/// platform-aware cache directory resolution.
#[cfg(feature = "layout-detection")]
pub(crate) fn resolve_cache_dir(module: &str) -> PathBuf {
crate::cache_dir::resolve_cache_dir(module)
}