This commit is contained in:
119
crates/kreuzberg/src/service/extraction.rs
Normal file
119
crates/kreuzberg/src/service/extraction.rs
Normal file
@@ -0,0 +1,119 @@
|
||||
//! Core extraction service implementing [`tower::Service`].
|
||||
|
||||
use crate::core::config::ExtractionConfig;
|
||||
use crate::core::extractor::{extract_bytes, extract_file};
|
||||
use crate::types::ExtractionResult;
|
||||
use crate::{KreuzbergError, Result};
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use tower::Service;
|
||||
|
||||
use super::request::{ExtractionRequest, ExtractionSource};
|
||||
|
||||
/// Innermost [`tower::Service`] of the extraction stack: every request is
/// forwarded straight to the kreuzberg core extraction functions.
///
/// The type holds no state, so clones are free and a single instance can be
/// handed to any number of handlers. It applies no timeout and no concurrency
/// limit by itself; those are obtained by wrapping it in Tower layers
/// (see [`super::ExtractionServiceBuilder`]).
///
/// # Example
///
/// ```rust,ignore
/// use kreuzberg::service::{ExtractionService, ExtractionRequest};
/// use kreuzberg::ExtractionConfig;
/// use tower::Service;
///
/// let mut svc = ExtractionService::new();
/// let req = ExtractionRequest::file("doc.pdf", ExtractionConfig::default());
/// let result = svc.call(req).await?;
/// ```
#[cfg_attr(alef, alef(skip))]
#[derive(Clone, Debug)]
pub struct ExtractionService {
    // Zero-sized private field: keeps the struct from being built with a
    // struct literal outside this module.
    _private: (),
}
|
||||
|
||||
impl ExtractionService {
|
||||
/// Create a new extraction service.
|
||||
pub(crate) fn new() -> Self {
|
||||
Self { _private: () }
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ExtractionService {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Service<ExtractionRequest> for ExtractionService {
|
||||
type Response = ExtractionResult;
|
||||
type Error = KreuzbergError;
|
||||
type Future = Pin<Box<dyn Future<Output = Result<ExtractionResult>> + Send>>;
|
||||
|
||||
fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<Result<()>> {
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
fn call(&mut self, req: ExtractionRequest) -> Self::Future {
|
||||
let config = resolve_config(req.config, req.file_overrides);
|
||||
|
||||
match req.source {
|
||||
ExtractionSource::File { path, mime_hint } => {
|
||||
Box::pin(async move { extract_file(&path, mime_hint.as_deref(), &config).await })
|
||||
}
|
||||
ExtractionSource::Bytes { data, mime_type } => {
|
||||
Box::pin(async move { extract_bytes(&data, &mime_type, &config).await })
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge optional per-file overrides into the base config.
|
||||
fn resolve_config(
|
||||
base: ExtractionConfig,
|
||||
overrides: Option<crate::core::config::FileExtractionConfig>,
|
||||
) -> ExtractionConfig {
|
||||
match overrides {
|
||||
Some(file_overrides) => base.with_file_overrides(&file_overrides),
|
||||
None => base,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use std::task::Poll;
    use tower::Service;

    /// Readiness must be reported immediately.
    #[test]
    fn poll_ready_returns_ready() {
        let mut cx = Context::from_waker(std::task::Waker::noop());
        let mut service = ExtractionService::new();
        let readiness = service.poll_ready(&mut cx);
        assert!(matches!(readiness, Poll::Ready(Ok(()))));
    }

    /// In-memory plain text round-trips through the service.
    #[tokio::test]
    async fn extract_plain_text_bytes() {
        let request = ExtractionRequest::bytes(b"hello".as_slice(), "text/plain", ExtractionConfig::default());
        let extracted = ExtractionService::new()
            .call(request)
            .await
            .expect("extraction should succeed");
        assert!(extracted.content.contains("hello"));
    }

    /// A file on disk, with an explicit MIME hint, is read and extracted.
    #[tokio::test]
    async fn extract_from_tempfile() {
        let mut file = tempfile::NamedTempFile::new().expect("failed to create tempfile");
        file.write_all(b"tempfile content").expect("failed to write");
        file.flush().expect("failed to flush");

        let request = ExtractionRequest::file_with_mime(file.path(), "text/plain", ExtractionConfig::default());
        let mut service = ExtractionService::new();
        let extracted = service.call(request).await.expect("extraction should succeed");
        assert!(extracted.content.contains("tempfile content"));
    }
}
|
||||
88
crates/kreuzberg/src/service/layers/metrics.rs
Normal file
88
crates/kreuzberg/src/service/layers/metrics.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
//! Metrics layer for the extraction service.
|
||||
//!
|
||||
//! Records service-level counters, histograms, and gauges on every
|
||||
//! extraction request using the kreuzberg OTel metric instruments.
|
||||
|
||||
use crate::types::ExtractionResult;
|
||||
use crate::{KreuzbergError, Result};
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use tower::{Layer, Service};
|
||||
|
||||
use crate::service::request::{ExtractionRequest, ExtractionSource};
|
||||
use crate::telemetry::conventions;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Layer
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// [`tower::Layer`] that wraps a service in [`MetricsService`], which records
/// service-level extraction metrics.
#[cfg_attr(alef, alef(skip))]
#[derive(Clone, Debug, Default)]
pub struct MetricsLayer;
|
||||
|
||||
impl MetricsLayer {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> Layer<S> for MetricsLayer {
|
||||
type Service = MetricsService<S>;
|
||||
|
||||
fn layer(&self, inner: S) -> Self::Service {
|
||||
MetricsService { inner }
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Service
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Middleware produced by [`MetricsLayer`]: delegates to `inner` and records
/// extraction metrics around each call.
#[derive(Clone, Debug)]
pub struct MetricsService<S> {
    /// The wrapped service that performs the actual extraction.
    inner: S,
}
|
||||
|
||||
impl<S> Service<ExtractionRequest> for MetricsService<S>
|
||||
where
|
||||
S: Service<ExtractionRequest, Response = ExtractionResult, Error = KreuzbergError> + Clone + Send + 'static,
|
||||
S::Future: Send,
|
||||
{
|
||||
type Response = ExtractionResult;
|
||||
type Error = KreuzbergError;
|
||||
type Future = Pin<Box<dyn Future<Output = Result<ExtractionResult>> + Send>>;
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
|
||||
self.inner.poll_ready(cx)
|
||||
}
|
||||
|
||||
fn call(&mut self, req: ExtractionRequest) -> Self::Future {
|
||||
let metrics = crate::telemetry::metrics::get_metrics();
|
||||
let mime_type = match &req.source {
|
||||
ExtractionSource::File { .. } => "unknown".to_owned(),
|
||||
ExtractionSource::Bytes { mime_type, .. } => mime_type.clone(),
|
||||
};
|
||||
|
||||
metrics.concurrent_extractions.add(1, &[]);
|
||||
|
||||
let mut inner = self.inner.clone();
|
||||
|
||||
Box::pin(async move {
|
||||
let result = inner.call(req).await;
|
||||
|
||||
let status = if result.is_ok() { "ok" } else { "error" };
|
||||
let attrs = [
|
||||
opentelemetry::KeyValue::new(conventions::DOCUMENT_MIME_TYPE, mime_type),
|
||||
opentelemetry::KeyValue::new("status", status),
|
||||
];
|
||||
|
||||
metrics.extraction_total.add(1, &attrs);
|
||||
metrics.concurrent_extractions.add(-1, &[]);
|
||||
|
||||
result
|
||||
})
|
||||
}
|
||||
}
|
||||
6
crates/kreuzberg/src/service/layers/mod.rs
Normal file
6
crates/kreuzberg/src/service/layers/mod.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
//! Tower middleware layers for the extraction service.
|
||||
|
||||
pub mod tracing;
|
||||
|
||||
#[cfg(feature = "otel")]
|
||||
pub mod metrics;
|
||||
107
crates/kreuzberg/src/service/layers/tracing.rs
Normal file
107
crates/kreuzberg/src/service/layers/tracing.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
//! Tracing layer for the extraction service.
|
||||
//!
|
||||
//! Adds a semantic span to every extraction request using kreuzberg conventions.
|
||||
|
||||
use crate::telemetry::conventions;
|
||||
use crate::types::ExtractionResult;
|
||||
use crate::{KreuzbergError, Result};
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use tower::{Layer, Service};
|
||||
use tracing::Instrument;
|
||||
|
||||
use crate::service::request::{ExtractionRequest, ExtractionSource};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Layer
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// [`tower::Layer`] that wraps a service in [`TracingService`], giving every
/// extraction request its own semantic tracing span.
#[cfg_attr(alef, alef(skip))]
#[derive(Clone, Debug, Default)]
pub struct TracingLayer;
|
||||
|
||||
impl TracingLayer {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> Layer<S> for TracingLayer {
|
||||
type Service = TracingService<S>;
|
||||
|
||||
#[cfg_attr(alef, alef(skip))]
|
||||
fn layer(&self, inner: S) -> Self::Service {
|
||||
TracingService { inner }
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Service
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Middleware produced by [`TracingLayer`]: opens one span per extraction
/// request and runs the wrapped service inside it.
#[derive(Clone, Debug)]
pub struct TracingService<S> {
    /// The wrapped service that performs the actual extraction.
    inner: S,
}
|
||||
|
||||
impl<S> Service<ExtractionRequest> for TracingService<S>
|
||||
where
|
||||
S: Service<ExtractionRequest, Response = ExtractionResult, Error = KreuzbergError> + Clone + Send + 'static,
|
||||
S::Future: Send,
|
||||
{
|
||||
type Response = ExtractionResult;
|
||||
type Error = KreuzbergError;
|
||||
type Future = Pin<Box<dyn Future<Output = Result<ExtractionResult>> + Send>>;
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
|
||||
self.inner.poll_ready(cx)
|
||||
}
|
||||
|
||||
fn call(&mut self, req: ExtractionRequest) -> Self::Future {
|
||||
let span = make_span(&req);
|
||||
let mut inner = self.inner.clone();
|
||||
|
||||
Box::pin(
|
||||
async move {
|
||||
let result = inner.call(req).await;
|
||||
|
||||
#[cfg(feature = "otel")]
|
||||
match &result {
|
||||
Ok(_) => crate::telemetry::spans::record_success_on_current_span(),
|
||||
Err(e) => crate::telemetry::spans::record_error_on_current_span(e),
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
.instrument(span),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn make_span(req: &ExtractionRequest) -> tracing::Span {
|
||||
match &req.source {
|
||||
ExtractionSource::File { path, .. } => {
|
||||
let filename = conventions::sanitize_filename(path);
|
||||
tracing::info_span!(
|
||||
"kreuzberg.service",
|
||||
{ conventions::OPERATION } = conventions::operations::EXTRACT_FILE,
|
||||
{ conventions::DOCUMENT_FILENAME } = filename,
|
||||
{ conventions::OTEL_STATUS_CODE } = tracing::field::Empty,
|
||||
{ conventions::ERROR_TYPE } = tracing::field::Empty,
|
||||
{ conventions::ERROR_MESSAGE } = tracing::field::Empty,
|
||||
)
|
||||
}
|
||||
ExtractionSource::Bytes { mime_type, data } => tracing::info_span!(
|
||||
"kreuzberg.service",
|
||||
{ conventions::OPERATION } = conventions::operations::EXTRACT_BYTES,
|
||||
{ conventions::DOCUMENT_MIME_TYPE } = %mime_type,
|
||||
{ conventions::DOCUMENT_SIZE_BYTES } = data.len(),
|
||||
{ conventions::OTEL_STATUS_CODE } = tracing::field::Empty,
|
||||
{ conventions::ERROR_TYPE } = tracing::field::Empty,
|
||||
{ conventions::ERROR_MESSAGE } = tracing::field::Empty,
|
||||
),
|
||||
}
|
||||
}
|
||||
257
crates/kreuzberg/src/service/mod.rs
Normal file
257
crates/kreuzberg/src/service/mod.rs
Normal file
@@ -0,0 +1,257 @@
|
||||
//! Tower service layer for kreuzberg extraction.
|
||||
//!
|
||||
//! Provides a composable [`tower::Service`] that wraps the core extraction
|
||||
//! functions with configurable middleware layers (tracing, metrics, timeout,
|
||||
//! concurrency limits).
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! ```text
|
||||
//! TracingLayer → MetricsLayer → Timeout → ConcurrencyLimit → ExtractionService
|
||||
//! ```
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```rust,ignore
|
||||
//! use kreuzberg::service::{ExtractionServiceBuilder, ExtractionRequest};
|
||||
//! use kreuzberg::ExtractionConfig;
|
||||
//! use tower::Service;
|
||||
//! use std::time::Duration;
|
||||
//!
|
||||
//! let mut svc = ExtractionServiceBuilder::new()
|
||||
//! .with_timeout(Duration::from_secs(300))
|
||||
//! .with_concurrency_limit(4)
|
||||
//! .build();
|
||||
//!
|
||||
//! let req = ExtractionRequest::file("doc.pdf", ExtractionConfig::default());
|
||||
//! let result = svc.call(req).await?;
|
||||
//! ```
|
||||
|
||||
mod extraction;
|
||||
pub mod layers;
|
||||
pub mod request;
|
||||
|
||||
pub use extraction::ExtractionService;
|
||||
pub use request::{ExtractionRequest, ExtractionSource};
|
||||
|
||||
use crate::KreuzbergError;
|
||||
use crate::types::ExtractionResult;
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Duration;
|
||||
use tower::util::BoxCloneService;
|
||||
use tower::{Service, ServiceBuilder, ServiceExt};
|
||||
|
||||
/// Assembles an extraction service out of optional Tower middleware.
///
/// From outermost to innermost the resulting stack is:
/// Tracing → Metrics → Timeout → ConcurrencyLimit → Service.
#[cfg_attr(alef, alef(skip))]
pub struct ExtractionServiceBuilder {
    /// Per-request time budget; `None` disables the timeout wrapper.
    timeout: Option<Duration>,
    /// Maximum number of in-flight requests; `None` means unlimited.
    concurrency_limit: Option<usize>,
    /// Whether to wrap the stack in the tracing layer.
    tracing: bool,
    /// Whether to wrap the stack in the metrics layer (`otel` builds only).
    #[cfg(feature = "otel")]
    metrics: bool,
}
|
||||
|
||||
impl Default for ExtractionServiceBuilder {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ExtractionServiceBuilder {
|
||||
/// Create a new builder with no layers configured.
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
timeout: None,
|
||||
concurrency_limit: None,
|
||||
tracing: false,
|
||||
#[cfg(feature = "otel")]
|
||||
metrics: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a per-request timeout.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn with_timeout(mut self, duration: Duration) -> Self {
|
||||
self.timeout = Some(duration);
|
||||
self
|
||||
}
|
||||
|
||||
/// Limit concurrent in-flight extractions.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn with_concurrency_limit(mut self, max: usize) -> Self {
|
||||
self.concurrency_limit = Some(max);
|
||||
self
|
||||
}
|
||||
|
||||
/// Add a tracing span to each extraction request.
|
||||
pub(crate) fn with_tracing(mut self) -> Self {
|
||||
self.tracing = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Add metrics recording to each extraction request.
|
||||
///
|
||||
/// Requires the `otel` feature. This is a no-op when `otel` is not enabled.
|
||||
#[allow(unused_mut)]
|
||||
pub(crate) fn with_metrics(mut self) -> Self {
|
||||
#[cfg(feature = "otel")]
|
||||
{
|
||||
self.metrics = true;
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the service stack, returning a type-erased cloneable service.
|
||||
///
|
||||
/// Layer order (outermost to innermost):
|
||||
/// `Tracing → Metrics → Timeout → ConcurrencyLimit → ExtractionService`
|
||||
pub(crate) fn build(self) -> BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError> {
|
||||
let svc = ExtractionService::new();
|
||||
|
||||
// Apply concurrency limit (innermost optional layer).
|
||||
let svc = match self.concurrency_limit {
|
||||
Some(limit) => ServiceBuilder::new()
|
||||
.concurrency_limit(limit)
|
||||
.service(svc)
|
||||
.boxed_clone(),
|
||||
None => svc.boxed_clone(),
|
||||
};
|
||||
|
||||
// Apply timeout. We wrap inline rather than using Tower's Timeout layer
|
||||
// because Timeout changes the error type to BoxError — we need to keep
|
||||
// KreuzbergError throughout the stack.
|
||||
let svc: BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError> = match self.timeout {
|
||||
Some(duration) => {
|
||||
let timeout_svc = TimeoutService { inner: svc, duration };
|
||||
timeout_svc.boxed_clone()
|
||||
}
|
||||
None => svc,
|
||||
};
|
||||
|
||||
// Apply metrics layer (otel only).
|
||||
#[cfg(feature = "otel")]
|
||||
let svc = if self.metrics {
|
||||
ServiceBuilder::new()
|
||||
.layer(layers::metrics::MetricsLayer::new())
|
||||
.service(svc)
|
||||
.boxed_clone()
|
||||
} else {
|
||||
svc
|
||||
};
|
||||
|
||||
// Apply tracing layer (outermost).
|
||||
if self.tracing {
|
||||
ServiceBuilder::new()
|
||||
.layer(layers::tracing::TracingLayer::new())
|
||||
.service(svc)
|
||||
.boxed_clone()
|
||||
} else {
|
||||
svc
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Timeout wrapper that preserves KreuzbergError
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// A simple timeout wrapper that converts elapsed timeouts to
|
||||
/// [`KreuzbergError::Timeout`] instead of a `BoxError`.
|
||||
#[derive(Clone)]
|
||||
struct TimeoutService {
|
||||
inner: BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError>,
|
||||
duration: Duration,
|
||||
}
|
||||
|
||||
impl Service<ExtractionRequest> for TimeoutService {
|
||||
type Response = ExtractionResult;
|
||||
type Error = KreuzbergError;
|
||||
type Future = Pin<Box<dyn Future<Output = crate::Result<ExtractionResult>> + Send>>;
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<crate::Result<()>> {
|
||||
self.inner.poll_ready(cx)
|
||||
}
|
||||
|
||||
fn call(&mut self, req: ExtractionRequest) -> Self::Future {
|
||||
let fut = self.inner.call(req);
|
||||
let duration = self.duration;
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
Box::pin(async move {
|
||||
match tokio::time::timeout(duration, fut).await {
|
||||
Ok(result) => result,
|
||||
Err(_elapsed) => Err(KreuzbergError::Timeout {
|
||||
elapsed_ms: start.elapsed().as_millis() as u64,
|
||||
limit_ms: duration.as_millis() as u64,
|
||||
}),
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::ExtractionConfig;

    /// Shorthand for a plain-text, in-memory request with default config.
    fn text_request(payload: &'static [u8]) -> ExtractionRequest {
        ExtractionRequest::bytes(payload, "text/plain", ExtractionConfig::default())
    }

    #[test]
    fn builder_new_builds_service() {
        // Building an unconfigured stack must not panic.
        let builder = ExtractionServiceBuilder::new();
        let _svc = builder.build();
    }

    #[test]
    fn builder_with_timeout_does_not_panic() {
        let builder = ExtractionServiceBuilder::new().with_timeout(Duration::from_secs(30));
        let _svc = builder.build();
    }

    #[test]
    fn builder_with_concurrency_limit_does_not_panic() {
        let builder = ExtractionServiceBuilder::new().with_concurrency_limit(4);
        let _svc = builder.build();
    }

    #[tokio::test]
    async fn builder_service_extracts_text() {
        let mut service = ExtractionServiceBuilder::new().build();
        let extracted = service
            .call(text_request(b"hello from builder"))
            .await
            .expect("extraction should succeed");
        assert!(extracted.content.contains("hello from builder"));
    }

    #[tokio::test]
    async fn builder_with_timeout_extracts_text() {
        let mut service = ExtractionServiceBuilder::new()
            .with_timeout(Duration::from_secs(10))
            .build();
        let extracted = service
            .call(text_request(b"timeout test"))
            .await
            .expect("extraction should succeed within timeout");
        assert!(extracted.content.contains("timeout test"));
    }

    #[tokio::test]
    async fn timeout_fires_on_zero_duration() {
        let mut service = ExtractionServiceBuilder::new()
            .with_timeout(Duration::from_nanos(1))
            .build();

        // A 1ns budget may or may not expire before the extraction finishes,
        // so both a successful result and a Timeout error are accepted. What
        // the test pins down is that no other error and no panic occurs.
        match service.call(text_request(b"hello")).await {
            Err(KreuzbergError::Timeout { .. }) => { /* expected timeout */ }
            Err(other) => panic!("expected Ok or Timeout, got: {:?}", other),
            Ok(extracted) => assert!(extracted.content.contains("hello")),
        }
    }
}
|
||||
127
crates/kreuzberg/src/service/request.rs
Normal file
127
crates/kreuzberg/src/service/request.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
//! Request and response types for the extraction service.
|
||||
|
||||
use crate::core::config::{ExtractionConfig, FileExtractionConfig};
|
||||
use bytes::Bytes;
|
||||
use std::path::PathBuf;
|
||||
#[cfg_attr(alef, alef(skip))]
|
||||
/// The source of a document to extract.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum ExtractionSource {
|
||||
/// Extract from a filesystem path with an optional MIME type hint.
|
||||
File { path: PathBuf, mime_hint: Option<String> },
|
||||
/// Extract from in-memory bytes with a known MIME type.
|
||||
Bytes { data: Bytes, mime_type: String },
|
||||
}
|
||||
#[cfg_attr(alef, alef(skip))]
|
||||
/// A request to extract content from a single document.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExtractionRequest {
|
||||
/// Where to read the document from.
|
||||
pub source: ExtractionSource,
|
||||
/// Base extraction configuration.
|
||||
pub config: ExtractionConfig,
|
||||
/// Optional per-file overrides (merged on top of `config`).
|
||||
pub file_overrides: Option<FileExtractionConfig>,
|
||||
}
|
||||
|
||||
impl ExtractionRequest {
|
||||
/// Create a file-based extraction request.
|
||||
#[cfg(feature = "mcp")]
|
||||
pub(crate) fn file(path: impl Into<PathBuf>, config: ExtractionConfig) -> Self {
|
||||
Self {
|
||||
source: ExtractionSource::File {
|
||||
path: path.into(),
|
||||
mime_hint: None,
|
||||
},
|
||||
config,
|
||||
file_overrides: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a file-based extraction request with a MIME type hint.
|
||||
#[cfg(feature = "mcp")]
|
||||
pub(crate) fn file_with_mime(
|
||||
path: impl Into<PathBuf>,
|
||||
mime_hint: impl Into<String>,
|
||||
config: ExtractionConfig,
|
||||
) -> Self {
|
||||
Self {
|
||||
source: ExtractionSource::File {
|
||||
path: path.into(),
|
||||
mime_hint: Some(mime_hint.into()),
|
||||
},
|
||||
config,
|
||||
file_overrides: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a bytes-based extraction request.
|
||||
pub(crate) fn bytes(data: impl Into<Bytes>, mime_type: impl Into<String>, config: ExtractionConfig) -> Self {
|
||||
Self {
|
||||
source: ExtractionSource::Bytes {
|
||||
data: data.into(),
|
||||
mime_type: mime_type.into(),
|
||||
},
|
||||
config,
|
||||
file_overrides: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set per-file overrides on this request.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn with_overrides(mut self, overrides: FileExtractionConfig) -> Self {
|
||||
self.file_overrides = Some(overrides);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `file` stores the path and leaves both the hint and overrides unset.
    #[cfg(feature = "mcp")]
    #[test]
    fn file_creates_file_source() {
        let req = ExtractionRequest::file("/tmp/doc.pdf", ExtractionConfig::default());
        match &req.source {
            ExtractionSource::File { path, mime_hint } => {
                assert_eq!(path, &PathBuf::from("/tmp/doc.pdf"));
                assert!(mime_hint.is_none());
            }
            _ => panic!("expected File source"),
        }
        assert!(req.file_overrides.is_none());
    }

    /// `bytes` stores the payload and MIME type verbatim.
    #[test]
    fn bytes_creates_bytes_source() {
        let req = ExtractionRequest::bytes(b"hello".as_slice(), "text/plain", ExtractionConfig::default());
        match &req.source {
            ExtractionSource::Bytes { data, mime_type } => {
                assert_eq!(data.as_ref(), b"hello");
                assert_eq!(mime_type, "text/plain");
            }
            _ => panic!("expected Bytes source"),
        }
    }

    /// `file_with_mime` records the supplied hint.
    #[cfg(feature = "mcp")]
    #[test]
    fn file_with_mime_sets_hint() {
        let req = ExtractionRequest::file_with_mime("/tmp/doc.pdf", "application/pdf", ExtractionConfig::default());
        match &req.source {
            ExtractionSource::File { mime_hint, .. } => {
                assert_eq!(mime_hint.as_deref(), Some("application/pdf"));
            }
            _ => panic!("expected File source"),
        }
    }

    /// `with_overrides` populates `file_overrides`.
    ///
    /// Built from a bytes request so the test does not depend on the `mcp`
    /// feature and `with_overrides` is exercised in every test build.
    #[test]
    fn with_overrides_sets_file_overrides() {
        let overrides = FileExtractionConfig::default();
        let req = ExtractionRequest::bytes(b"hello".as_slice(), "text/plain", ExtractionConfig::default())
            .with_overrides(overrides);
        assert!(req.file_overrides.is_some());
    }
}
|
||||
Reference in New Issue
Block a user