Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,119 @@
//! Core extraction service implementing [`tower::Service`].
use crate::core::config::ExtractionConfig;
use crate::core::extractor::{extract_bytes, extract_file};
use crate::types::ExtractionResult;
use crate::{KreuzbergError, Result};
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use tower::Service;
use super::request::{ExtractionRequest, ExtractionSource};
/// Stateless [`tower::Service`] front door to the kreuzberg core extraction
/// functions.
///
/// The type holds nothing but a private unit marker, so clones are free and a
/// single value can be handed to as many handlers as needed. Timeouts and
/// concurrency limits are not implemented here; they are added by stacking
/// Tower layers around this service (see [`super::ExtractionServiceBuilder`]).
///
/// # Example
///
/// ```rust,ignore
/// use kreuzberg::service::{ExtractionService, ExtractionRequest};
/// use kreuzberg::ExtractionConfig;
/// use tower::Service;
///
/// // `ExtractionService::new` and `ExtractionRequest::file` are crate-internal
/// // (`pub(crate)`); `file` additionally requires the `mcp` feature.
/// let mut svc = ExtractionService::new();
/// let req = ExtractionRequest::file("doc.pdf", ExtractionConfig::default());
/// let result = svc.call(req).await?;
/// ```
#[cfg_attr(alef, alef(skip))]
#[derive(Debug, Clone)]
pub struct ExtractionService {
    // Private marker: keeps construction going through `new` / `Default`.
    _private: (),
}

impl ExtractionService {
    /// Construct the extraction service; there is no state to configure.
    pub(crate) fn new() -> Self {
        ExtractionService { _private: () }
    }
}

impl Default for ExtractionService {
    /// Same as [`ExtractionService::new`].
    fn default() -> Self {
        ExtractionService::new()
    }
}
impl Service<ExtractionRequest> for ExtractionService {
type Response = ExtractionResult;
type Error = KreuzbergError;
type Future = Pin<Box<dyn Future<Output = Result<ExtractionResult>> + Send>>;
fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<Result<()>> {
Poll::Ready(Ok(()))
}
fn call(&mut self, req: ExtractionRequest) -> Self::Future {
let config = resolve_config(req.config, req.file_overrides);
match req.source {
ExtractionSource::File { path, mime_hint } => {
Box::pin(async move { extract_file(&path, mime_hint.as_deref(), &config).await })
}
ExtractionSource::Bytes { data, mime_type } => {
Box::pin(async move { extract_bytes(&data, &mime_type, &config).await })
}
}
}
}
/// Produce the configuration an extraction should actually run with.
///
/// When `overrides` is `None` the base configuration is returned untouched;
/// otherwise the result of `base.with_file_overrides(..)` is returned.
fn resolve_config(
    base: ExtractionConfig,
    overrides: Option<crate::core::config::FileExtractionConfig>,
) -> ExtractionConfig {
    let Some(per_file) = overrides else {
        return base;
    };
    base.with_file_overrides(&per_file)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::task::Poll;
    use tower::Service;

    /// `poll_ready` must report readiness immediately, without a real waker.
    #[test]
    fn poll_ready_returns_ready() {
        let mut svc = ExtractionService::new();
        let waker = std::task::Waker::noop();
        let mut cx = Context::from_waker(waker);
        assert!(matches!(svc.poll_ready(&mut cx), Poll::Ready(Ok(()))));
    }

    /// In-memory plain text goes through the bytes extraction path.
    #[tokio::test]
    async fn extract_plain_text_bytes() {
        let mut svc = ExtractionService::new();
        let req = ExtractionRequest::bytes(b"hello".as_slice(), "text/plain", ExtractionConfig::default());
        let result = svc.call(req).await.expect("extraction should succeed");
        assert!(result.content.contains("hello"));
    }

    /// A file on disk goes through the file extraction path.
    ///
    /// Gated on `mcp` because `ExtractionRequest::file_with_mime` is only
    /// compiled with that feature; without the gate this test module would
    /// not build when the feature is disabled.
    #[cfg(feature = "mcp")]
    #[tokio::test]
    async fn extract_from_tempfile() {
        // Imported locally so it is not flagged as unused when the test is
        // compiled out.
        use std::io::Write;

        let mut svc = ExtractionService::new();
        let mut tmp = tempfile::NamedTempFile::new().expect("failed to create tempfile");
        tmp.write_all(b"tempfile content").expect("failed to write");
        tmp.flush().expect("failed to flush");
        let req = ExtractionRequest::file_with_mime(tmp.path(), "text/plain", ExtractionConfig::default());
        let result = svc.call(req).await.expect("extraction should succeed");
        assert!(result.content.contains("tempfile content"));
    }
}

View File

@@ -0,0 +1,88 @@
//! Metrics layer for the extraction service.
//!
//! Records service-level counters, histograms, and gauges on every
//! extraction request using the kreuzberg OTel metric instruments.
use crate::types::ExtractionResult;
use crate::{KreuzbergError, Result};
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use tower::{Layer, Service};
use crate::service::request::{ExtractionRequest, ExtractionSource};
use crate::telemetry::conventions;
// ---------------------------------------------------------------------------
// Layer
// ---------------------------------------------------------------------------
/// [`tower::Layer`] that wraps a service so that service-level extraction
/// metrics are recorded for every request.
///
/// The layer itself is a unit type with no configuration.
#[cfg_attr(alef, alef(skip))]
#[derive(Debug, Clone, Default)]
pub struct MetricsLayer;

impl MetricsLayer {
    /// Create the metrics layer.
    pub(crate) fn new() -> Self {
        MetricsLayer
    }
}
impl<S> Layer<S> for MetricsLayer {
    type Service = MetricsService<S>;

    /// Wrap `service` in a [`MetricsService`].
    fn layer(&self, service: S) -> Self::Service {
        MetricsService { inner: service }
    }
}
// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------
/// Middleware service that records extraction metrics around an inner
/// extraction service.
///
/// For every request it maintains the `concurrent_extractions` instrument
/// (+1 while the request is running, -1 when it stops) and bumps
/// `extraction_total` with the MIME type and an `ok` / `error` status once
/// the inner service has produced a result.
#[derive(Debug, Clone)]
pub struct MetricsService<S> {
    inner: S,
}

impl<S> Service<ExtractionRequest> for MetricsService<S>
where
    S: Service<ExtractionRequest, Response = ExtractionResult, Error = KreuzbergError> + Clone + Send + 'static,
    S::Future: Send,
{
    type Response = ExtractionResult;
    type Error = KreuzbergError;
    type Future = Pin<Box<dyn Future<Output = Result<ExtractionResult>> + Send>>;

    /// Readiness is entirely delegated to the wrapped service.
    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
        self.inner.poll_ready(cx)
    }

    /// Run the request through the wrapped service while recording metrics.
    fn call(&mut self, req: ExtractionRequest) -> Self::Future {
        let metrics = crate::telemetry::metrics::get_metrics();
        // File requests carry no definitive MIME type at this point, so they
        // are reported as "unknown".
        let mime_type = match &req.source {
            ExtractionSource::File { .. } => "unknown".to_owned(),
            ExtractionSource::Bytes { mime_type, .. } => mime_type.clone(),
        };

        // `poll_ready` was driven on `self.inner`, so that exact instance is
        // the one holding any acquired readiness (e.g. a concurrency-limit
        // permit). Move it into the future and leave a fresh clone behind;
        // calling a never-polled clone instead can panic in readiness-tracking
        // services.
        let replacement = self.inner.clone();
        let mut inner = std::mem::replace(&mut self.inner, replacement);

        Box::pin(async move {
            // Count the request as in flight from its first poll, and undo the
            // count from a drop guard so that a cancelled (dropped) or
            // panicking future cannot leave the instrument permanently
            // inflated.
            metrics.concurrent_extractions.add(1, &[]);
            let _in_flight = RunOnDrop(|| metrics.concurrent_extractions.add(-1, &[]));

            let result = inner.call(req).await;
            let status = if result.is_ok() { "ok" } else { "error" };
            let attrs = [
                opentelemetry::KeyValue::new(conventions::DOCUMENT_MIME_TYPE, mime_type),
                opentelemetry::KeyValue::new("status", status),
            ];
            metrics.extraction_total.add(1, &attrs);
            result
        })
    }
}

/// Runs the wrapped closure when dropped.
///
/// Used to pair the in-flight increment with a decrement on every exit path
/// of the request future, including cancellation.
struct RunOnDrop<F: FnMut()>(F);

impl<F: FnMut()> Drop for RunOnDrop<F> {
    fn drop(&mut self) {
        (self.0)();
    }
}

View File

@@ -0,0 +1,6 @@
//! Tower middleware layers for the extraction service.
pub mod tracing;
#[cfg(feature = "otel")]
pub mod metrics;

View File

@@ -0,0 +1,107 @@
//! Tracing layer for the extraction service.
//!
//! Adds a semantic span to every extraction request using kreuzberg conventions.
use crate::telemetry::conventions;
use crate::types::ExtractionResult;
use crate::{KreuzbergError, Result};
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use tower::{Layer, Service};
use tracing::Instrument;
use crate::service::request::{ExtractionRequest, ExtractionSource};
// ---------------------------------------------------------------------------
// Layer
// ---------------------------------------------------------------------------
/// [`tower::Layer`] that runs every extraction request inside a semantic
/// tracing span.
///
/// The layer itself is a unit type with no configuration.
#[cfg_attr(alef, alef(skip))]
#[derive(Debug, Clone, Default)]
pub struct TracingLayer;

impl TracingLayer {
    /// Create the tracing layer.
    pub(crate) fn new() -> Self {
        TracingLayer
    }
}
impl<S> Layer<S> for TracingLayer {
    type Service = TracingService<S>;

    /// Wrap `service` in a [`TracingService`].
    #[cfg_attr(alef, alef(skip))]
    fn layer(&self, service: S) -> Self::Service {
        TracingService { inner: service }
    }
}
// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------
/// Middleware service that creates one tracing span per extraction request
/// and runs the wrapped service inside it.
#[derive(Debug, Clone)]
pub struct TracingService<S> {
    inner: S,
}

impl<S> Service<ExtractionRequest> for TracingService<S>
where
    S: Service<ExtractionRequest, Response = ExtractionResult, Error = KreuzbergError> + Clone + Send + 'static,
    S::Future: Send,
{
    type Response = ExtractionResult;
    type Error = KreuzbergError;
    type Future = Pin<Box<dyn Future<Output = Result<ExtractionResult>> + Send>>;

    /// Readiness is entirely delegated to the wrapped service.
    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
        self.inner.poll_ready(cx)
    }

    /// Build the request span, then drive the wrapped service inside it.
    fn call(&mut self, req: ExtractionRequest) -> Self::Future {
        let span = make_span(&req);

        // `poll_ready` was driven on `self.inner`, so that exact instance is
        // the one holding any acquired readiness (e.g. a concurrency-limit
        // permit). Move it into the future and leave a fresh clone behind;
        // calling a never-polled clone instead can panic in readiness-tracking
        // services.
        let replacement = self.inner.clone();
        let mut inner = std::mem::replace(&mut self.inner, replacement);

        Box::pin(
            async move {
                let result = inner.call(req).await;
                // With `otel` enabled, mirror the outcome onto the current
                // span (the one attached by `.instrument(span)` below).
                #[cfg(feature = "otel")]
                match &result {
                    Ok(_) => crate::telemetry::spans::record_success_on_current_span(),
                    Err(e) => crate::telemetry::spans::record_error_on_current_span(e),
                }
                result
            }
            .instrument(span),
        )
    }
}
/// Build the `kreuzberg.service` info-level span for a request.
///
/// Byte requests record the operation, MIME type and payload length; file
/// requests record the operation and the file name after it has been passed
/// through `conventions::sanitize_filename`. In both cases the status-code
/// and error fields are declared empty up front (presumably so the outcome
/// can be recorded on the span later; confirm against the telemetry helpers).
fn make_span(req: &ExtractionRequest) -> tracing::Span {
    match &req.source {
        ExtractionSource::Bytes { mime_type, data } => {
            let size_bytes = data.len();
            tracing::info_span!(
                "kreuzberg.service",
                { conventions::OPERATION } = conventions::operations::EXTRACT_BYTES,
                { conventions::DOCUMENT_MIME_TYPE } = %mime_type,
                { conventions::DOCUMENT_SIZE_BYTES } = size_bytes,
                { conventions::OTEL_STATUS_CODE } = tracing::field::Empty,
                { conventions::ERROR_TYPE } = tracing::field::Empty,
                { conventions::ERROR_MESSAGE } = tracing::field::Empty,
            )
        }
        ExtractionSource::File { path, .. } => {
            let safe_name = conventions::sanitize_filename(path);
            tracing::info_span!(
                "kreuzberg.service",
                { conventions::OPERATION } = conventions::operations::EXTRACT_FILE,
                { conventions::DOCUMENT_FILENAME } = safe_name,
                { conventions::OTEL_STATUS_CODE } = tracing::field::Empty,
                { conventions::ERROR_TYPE } = tracing::field::Empty,
                { conventions::ERROR_MESSAGE } = tracing::field::Empty,
            )
        }
    }
}

View File

@@ -0,0 +1,257 @@
//! Tower service layer for kreuzberg extraction.
//!
//! Provides a composable [`tower::Service`] that wraps the core extraction
//! functions with configurable middleware layers (tracing, metrics, timeout,
//! concurrency limits).
//!
//! # Architecture
//!
//! ```text
//! TracingLayer → MetricsLayer → Timeout → ConcurrencyLimit → ExtractionService
//! ```
//!
//! # Example
//!
//! ```rust,ignore
//! use kreuzberg::service::{ExtractionServiceBuilder, ExtractionRequest};
//! use kreuzberg::ExtractionConfig;
//! use tower::Service;
//! use std::time::Duration;
//!
//! let mut svc = ExtractionServiceBuilder::new()
//! .with_timeout(Duration::from_secs(300))
//! .with_concurrency_limit(4)
//! .build();
//!
//! let req = ExtractionRequest::file("doc.pdf", ExtractionConfig::default());
//! let result = svc.call(req).await?;
//! ```
mod extraction;
pub mod layers;
pub mod request;
pub use extraction::ExtractionService;
pub use request::{ExtractionRequest, ExtractionSource};
use crate::KreuzbergError;
use crate::types::ExtractionResult;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;
use tower::util::BoxCloneService;
use tower::{Service, ServiceBuilder, ServiceExt};
/// Builder for composing an extraction service with Tower middleware layers.
///
/// Layers are applied in the order: Tracing → Metrics → Timeout → ConcurrencyLimit → Service.
///
/// Every field is an opt-in switch; a freshly created builder enables no
/// layers at all. The type derives `Debug` and `Clone` like the other public
/// service types in this module.
#[cfg_attr(alef, alef(skip))]
#[derive(Debug, Clone)]
pub struct ExtractionServiceBuilder {
    /// Per-request timeout, if any.
    timeout: Option<Duration>,
    /// Maximum number of in-flight extractions, if limited.
    concurrency_limit: Option<usize>,
    /// Whether to wrap the stack in the tracing layer.
    tracing: bool,
    /// Whether to wrap the stack in the metrics layer (`otel` builds only).
    #[cfg(feature = "otel")]
    metrics: bool,
}
impl Default for ExtractionServiceBuilder {
fn default() -> Self {
Self::new()
}
}
impl ExtractionServiceBuilder {
    /// Start from an empty builder: no timeout, no concurrency limit, no
    /// tracing and no metrics.
    pub(crate) fn new() -> Self {
        Self {
            timeout: None,
            concurrency_limit: None,
            tracing: false,
            #[cfg(feature = "otel")]
            metrics: false,
        }
    }

    /// Enforce a per-request timeout of `duration`.
    #[cfg(test)]
    pub(crate) fn with_timeout(self, duration: Duration) -> Self {
        Self {
            timeout: Some(duration),
            ..self
        }
    }

    /// Allow at most `max` extractions to be in flight at once.
    #[cfg(test)]
    pub(crate) fn with_concurrency_limit(self, max: usize) -> Self {
        Self {
            concurrency_limit: Some(max),
            ..self
        }
    }

    /// Wrap every extraction request in a tracing span.
    pub(crate) fn with_tracing(self) -> Self {
        Self { tracing: true, ..self }
    }

    /// Record metrics for every extraction request.
    ///
    /// Only has an effect when the `otel` feature is enabled; otherwise the
    /// builder is returned unchanged.
    // Without `otel` the assignment below is compiled out and `mut` is unused.
    #[allow(unused_mut)]
    pub(crate) fn with_metrics(mut self) -> Self {
        #[cfg(feature = "otel")]
        {
            self.metrics = true;
        }
        self
    }

    /// Assemble the configured stack into a type-erased, cloneable service.
    ///
    /// From the outside in, the layers are:
    /// `Tracing → Metrics → Timeout → ConcurrencyLimit → ExtractionService`
    pub(crate) fn build(self) -> BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError> {
        let core = ExtractionService::new();

        // Innermost optional layer: the concurrency limit.
        let stack = if let Some(max_in_flight) = self.concurrency_limit {
            ServiceBuilder::new()
                .concurrency_limit(max_in_flight)
                .service(core)
                .boxed_clone()
        } else {
            core.boxed_clone()
        };

        // Timeout. A local wrapper is used instead of Tower's own Timeout
        // layer because that layer changes the error type to BoxError, and
        // the stack must keep KreuzbergError end to end.
        let stack: BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError> =
            if let Some(duration) = self.timeout {
                TimeoutService { inner: stack, duration }.boxed_clone()
            } else {
                stack
            };

        // Metrics (only present in `otel` builds).
        #[cfg(feature = "otel")]
        let stack = if self.metrics {
            ServiceBuilder::new()
                .layer(layers::metrics::MetricsLayer::new())
                .service(stack)
                .boxed_clone()
        } else {
            stack
        };

        // Tracing goes on last so that it is the outermost layer.
        if !self.tracing {
            return stack;
        }
        ServiceBuilder::new()
            .layer(layers::tracing::TracingLayer::new())
            .service(stack)
            .boxed_clone()
    }
}
// ---------------------------------------------------------------------------
// Timeout wrapper that preserves KreuzbergError
// ---------------------------------------------------------------------------
/// A simple timeout wrapper that converts elapsed timeouts to
/// [`KreuzbergError::Timeout`] instead of a `BoxError`.
#[derive(Clone)]
struct TimeoutService {
    /// The already type-erased service being guarded.
    inner: BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError>,
    /// Maximum time a single request may take.
    duration: Duration,
}

impl TimeoutService {
    /// Whole milliseconds in `d`, clamped to `u64::MAX`.
    ///
    /// `Duration::as_millis` yields a `u128`; a plain `as u64` cast would
    /// silently wrap for very large durations, so saturate instead.
    fn millis_saturating(d: Duration) -> u64 {
        u64::try_from(d.as_millis()).unwrap_or(u64::MAX)
    }
}

impl Service<ExtractionRequest> for TimeoutService {
    type Response = ExtractionResult;
    type Error = KreuzbergError;
    type Future = Pin<Box<dyn Future<Output = crate::Result<ExtractionResult>> + Send>>;

    /// Readiness is entirely delegated to the wrapped service.
    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<crate::Result<()>> {
        self.inner.poll_ready(cx)
    }

    /// Run the request, failing with [`KreuzbergError::Timeout`] if the inner
    /// future has not completed within `duration`.
    fn call(&mut self, req: ExtractionRequest) -> Self::Future {
        // Called directly on `self.inner`, the instance `poll_ready` was
        // driven on.
        let fut = self.inner.call(req);
        let duration = self.duration;
        let start = std::time::Instant::now();
        Box::pin(async move {
            match tokio::time::timeout(duration, fut).await {
                Ok(result) => result,
                Err(_elapsed) => Err(KreuzbergError::Timeout {
                    elapsed_ms: Self::millis_saturating(start.elapsed()),
                    limit_ms: Self::millis_saturating(duration),
                }),
            }
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::ExtractionConfig;

    /// Build a `text/plain` bytes request with the default configuration.
    fn plain_text(body: &'static [u8]) -> ExtractionRequest {
        ExtractionRequest::bytes(body, "text/plain", ExtractionConfig::default())
    }

    #[test]
    fn builder_new_builds_service() {
        // Building an empty stack must not panic.
        let _svc = ExtractionServiceBuilder::new().build();
    }

    #[test]
    fn builder_with_timeout_does_not_panic() {
        let builder = ExtractionServiceBuilder::new().with_timeout(Duration::from_secs(30));
        let _svc = builder.build();
    }

    #[test]
    fn builder_with_concurrency_limit_does_not_panic() {
        let builder = ExtractionServiceBuilder::new().with_concurrency_limit(4);
        let _svc = builder.build();
    }

    #[tokio::test]
    async fn builder_service_extracts_text() {
        let mut svc = ExtractionServiceBuilder::new().build();
        let outcome = svc.call(plain_text(b"hello from builder")).await;
        let extracted = outcome.expect("extraction should succeed");
        assert!(extracted.content.contains("hello from builder"));
    }

    #[tokio::test]
    async fn builder_with_timeout_extracts_text() {
        let mut svc = ExtractionServiceBuilder::new()
            .with_timeout(Duration::from_secs(10))
            .build();
        let outcome = svc.call(plain_text(b"timeout test")).await;
        let extracted = outcome.expect("extraction should succeed within timeout");
        assert!(extracted.content.contains("timeout test"));
    }

    #[tokio::test]
    async fn timeout_fires_on_zero_duration() {
        let mut svc = ExtractionServiceBuilder::new()
            .with_timeout(Duration::from_nanos(1))
            .build();
        // A 1ns budget is a race: extraction may finish before the timer is
        // first checked (success) or the timer may win (Timeout). Either
        // outcome is fine; what matters is that nothing panics and no other
        // error kind is produced.
        match svc.call(plain_text(b"hello")).await {
            Err(KreuzbergError::Timeout { .. }) => { /* expected timeout */ }
            Ok(r) => assert!(r.content.contains("hello")),
            Err(other) => panic!("expected Ok or Timeout, got: {:?}", other),
        }
    }
}

View File

@@ -0,0 +1,127 @@
//! Request and response types for the extraction service.
use crate::core::config::{ExtractionConfig, FileExtractionConfig};
use bytes::Bytes;
use std::path::PathBuf;
#[cfg_attr(alef, alef(skip))]
/// The source of a document to extract.
///
/// The extraction service dispatches on this enum: `File` requests are routed
/// to the file extraction function and `Bytes` requests to the bytes
/// extraction function.
#[derive(Debug, Clone)]
pub enum ExtractionSource {
    /// Extract from a filesystem path with an optional MIME type hint.
    File {
        /// Path of the document on disk.
        path: PathBuf,
        /// Optional MIME type hint forwarded to the extractor; `None` when
        /// the caller supplied no hint.
        mime_hint: Option<String>,
    },
    /// Extract from in-memory bytes with a known MIME type.
    Bytes {
        /// Raw document contents.
        data: Bytes,
        /// MIME type of `data`, forwarded to the extractor.
        mime_type: String,
    },
}
#[cfg_attr(alef, alef(skip))]
/// A request to extract content from a single document.
///
/// This is the request type accepted by the extraction service and by every
/// middleware layer stacked on top of it.
#[derive(Debug, Clone)]
pub struct ExtractionRequest {
    /// Where to read the document from.
    pub source: ExtractionSource,
    /// Base extraction configuration.
    pub config: ExtractionConfig,
    /// Optional per-file overrides (merged on top of `config`).
    ///
    /// `None` means `config` is used as-is.
    pub file_overrides: Option<FileExtractionConfig>,
}
impl ExtractionRequest {
    /// Build a request for the file at `path`, without a MIME type hint and
    /// without per-file overrides.
    #[cfg(feature = "mcp")]
    pub(crate) fn file(path: impl Into<PathBuf>, config: ExtractionConfig) -> Self {
        let source = ExtractionSource::File {
            path: path.into(),
            mime_hint: None,
        };
        Self {
            source,
            config,
            file_overrides: None,
        }
    }

    /// Build a request for the file at `path`, carrying `mime_hint` as the
    /// MIME type hint and no per-file overrides.
    #[cfg(feature = "mcp")]
    pub(crate) fn file_with_mime(
        path: impl Into<PathBuf>,
        mime_hint: impl Into<String>,
        config: ExtractionConfig,
    ) -> Self {
        let source = ExtractionSource::File {
            path: path.into(),
            mime_hint: Some(mime_hint.into()),
        };
        Self {
            source,
            config,
            file_overrides: None,
        }
    }

    /// Build a request for in-memory `data` of the given `mime_type`, with no
    /// per-file overrides.
    pub(crate) fn bytes(data: impl Into<Bytes>, mime_type: impl Into<String>, config: ExtractionConfig) -> Self {
        let source = ExtractionSource::Bytes {
            data: data.into(),
            mime_type: mime_type.into(),
        };
        Self {
            source,
            config,
            file_overrides: None,
        }
    }

    /// Attach per-file overrides, replacing any that were already set.
    #[cfg(test)]
    pub(crate) fn with_overrides(self, overrides: FileExtractionConfig) -> Self {
        Self {
            file_overrides: Some(overrides),
            ..self
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `file` yields a `File` source with no hint and no overrides.
    #[cfg(feature = "mcp")]
    #[test]
    fn file_creates_file_source() {
        let req = ExtractionRequest::file("/tmp/doc.pdf", ExtractionConfig::default());
        let ExtractionSource::File { path, mime_hint } = &req.source else {
            panic!("expected File source");
        };
        assert_eq!(path, &PathBuf::from("/tmp/doc.pdf"));
        assert!(mime_hint.is_none());
        assert!(req.file_overrides.is_none());
    }

    /// `bytes` yields a `Bytes` source carrying the data and MIME type.
    #[test]
    fn bytes_creates_bytes_source() {
        let req = ExtractionRequest::bytes(b"hello".as_slice(), "text/plain", ExtractionConfig::default());
        let ExtractionSource::Bytes { data, mime_type } = &req.source else {
            panic!("expected Bytes source");
        };
        assert_eq!(data.as_ref(), b"hello");
        assert_eq!(mime_type, "text/plain");
    }

    /// `file_with_mime` stores the supplied hint.
    #[cfg(feature = "mcp")]
    #[test]
    fn file_with_mime_sets_hint() {
        let req = ExtractionRequest::file_with_mime("/tmp/doc.pdf", "application/pdf", ExtractionConfig::default());
        let ExtractionSource::File { mime_hint, .. } = &req.source else {
            panic!("expected File source");
        };
        assert_eq!(mime_hint.as_deref(), Some("application/pdf"));
    }

    /// `with_overrides` populates `file_overrides`.
    #[cfg(feature = "mcp")]
    #[test]
    fn with_overrides_sets_file_overrides() {
        let base = ExtractionRequest::file("/tmp/doc.pdf", ExtractionConfig::default());
        let req = base.with_overrides(FileExtractionConfig::default());
        assert!(req.file_overrides.is_some());
    }
}