Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
```r title="R"
library(kreuzberg)
# Reset the four plugin registries one after another, printing a short
# confirmation line after each call.
# NOTE(review): whether built-in plugins survive these calls is not visible
# in this snippet - confirm against the package documentation.
# Clear all custom OCR backends
clear_ocr_backends()
cat("OCR backends cleared\n")
# Clear all custom validators
clear_validators()
cat("Validators cleared\n")
# Clear all custom post-processors
clear_post_processors()
cat("Post-processors cleared\n")
# Clear all custom document extractors
clear_document_extractors()
cat("Document extractors cleared\n")
```

View File

@@ -0,0 +1,15 @@
```r
library(kreuzberg)
# Show the names of the OCR backends that are currently available.
ocr_names <- list_ocr_backends()
cat("Available backends:", paste(ocr_names, collapse = ", "), "\n")

# Show the names of the registered post-processors.
post_processor_names <- list_post_processors()
cat("Post-processors:", paste(post_processor_names, collapse = ", "), "\n")

# Clear the custom post-processors and validators. OCR backends and document
# extractors are not cleared by this snippet.
clear_post_processors()
clear_validators()
```

View File

@@ -0,0 +1,29 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Describe an embedder that is already loaded in the host process (for
# example an ONNX session) as a plain list, so that kreuzberg can call back
# into it during chunking and standalone embed requests.
my_embedder <- list(
  name = "my-embedder",
  version = "1.0.0",
  dimensions = 768L,
  embed = function(texts) {
    # Placeholder body: one all-zero vector of length 768 per input text.
    # A real backend would delegate to the already-loaded host model here.
    lapply(texts, function(text) numeric(768))
  }
)
register_embedding_backend(my_embedder)

# Select the registered backend by name through the "plugin" model type.
embedding_config <- list(
  embedding = list(
    model = list(type = "plugin", name = "my-embedder"),
    max_embed_duration_secs = 30L
  )
)

input_texts <- c("Hello, world!", "Second text")
embeddings <- embed_texts(input_texts, embedding_config)
cat(sprintf("Generated %d embedding vectors\n", length(embeddings)))
```

View File

@@ -0,0 +1,20 @@
```r title="R"
library(kreuzberg)
# Example document extractor.
#
# path:      path of the document to extract.
# mime_type: MIME type reported for the document.
#
# Returns a list with `content` (a description built from both arguments),
# `mime_type` (passed through unchanged) and `pages` (always 1L).
custom_extractor <- function(path, mime_type) {
  list(
    content = sprintf("Extracted from %s (%s)", path, mime_type),
    mime_type = mime_type,
    pages = 1L
  )
}
# Register the extractor under the name "custom_format".
register_document_extractor("custom_format", custom_extractor)

# NOTE(review): the other snippets in this documentation set decode the value
# returned by extract_file_sync() with jsonlite::fromJSON() before reading
# fields from it; this snippet now does the same. Confirm the return type.
json <- extract_file_sync("custom_document.xyz", "application/custom", NULL)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat("Custom extractor result:\n")
cat(sprintf("Content: %s\n", result$content))
cat(sprintf("Mime type: %s\n", result$mime_type))
```

View File

@@ -0,0 +1,15 @@
```r title="R"
library(kreuzberg)
# Print the names held by each plugin registry, one registry per line,
# with the names joined by ", ".
ocr_names <- list_ocr_backends()
cat(sprintf("OCR backends: %s\n", paste(ocr_names, collapse = ", ")))

validator_names <- list_validators()
cat(sprintf("Validators: %s\n", paste(validator_names, collapse = ", ")))

post_processor_names <- list_post_processors()
cat(sprintf(
  "Post-processors: %s\n",
  paste(post_processor_names, collapse = ", ")
))

extractor_names <- list_document_extractors()
cat(sprintf(
  "Document extractors: %s\n",
  paste(extractor_names, collapse = ", ")
))
```

View File

@@ -0,0 +1,27 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Validator that requires at least 50 characters of extracted content.
#
# result: extraction result; only `result$content` is read.
#
# Returns a list with `valid` (TRUE/FALSE) and a human-readable `message`.
# Missing (NULL) or NA content is counted as zero characters, so it fails
# validation instead of raising an error in the `if` condition.
min_length_validator <- function(result) {
  min_length <- 50L
  # sum(..., na.rm = TRUE) collapses NULL / NA content to 0L and keeps the
  # value an integer, which the "%d" format below requires.
  content_length <- sum(nchar(result$content), na.rm = TRUE)
  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d characters",
        content_length, min_length
      )
    ))
  }
  list(valid = TRUE, message = "Content length validation passed")
}
# Register the validator under the name "min_length".
register_validator("min_length", min_length_validator)

# Extract with the default configuration, then decode the JSON result.
extraction_config <- ExtractionConfig$default()
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", extraction_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

content_chars <- nchar(extraction$content)
cat(sprintf("Content length: %d characters\n", content_chars))
```

View File

@@ -0,0 +1,22 @@
```r title="R"
library(kreuzberg)
# Post-processor that prints the metadata of an extraction result.
#
# result: extraction result; only `result$metadata` is read.
#
# Prints a "PDF Metadata:" header followed by one " key: value" line per
# named metadata entry. Returns `result` unchanged.
extract_pdf_metadata <- function(result) {
  metadata <- result$metadata
  if (!is.null(metadata)) {
    cat("PDF Metadata:\n")
    for (key in names(metadata)) {
      # Metadata values may be lists or multi-element vectors (for example
      # after jsonlite::fromJSON(simplifyVector = FALSE)). sprintf("%s") cannot
      # format a list, so flatten the value into a single string first.
      value <- paste(unlist(metadata[[key]]), collapse = ", ")
      cat(sprintf(" %s: %s\n", key, value))
    }
  }
  result
}
# Register the post-processor under the name "pdf_metadata".
register_post_processor("pdf_metadata", extract_pdf_metadata)

# Extract with post-processing enabled, then decode the JSON result.
pipeline_config <- list(postprocessor = list(enabled = TRUE))
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", pipeline_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

content_chars <- nchar(extraction$content)
cat(sprintf("Extraction complete: %d characters\n", content_chars))
```

View File

@@ -0,0 +1,21 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Post-processor skeleton that distinguishes PDF results from all others.
#
# result: extraction result; only `result$mime_type` is read.
#
# Returns `result` unchanged on both paths; the PDF branch is where
# PDF-specific processing would be added.
pdf_only_processor <- function(result) {
  mime <- result$mime_type
  is_pdf <- !is.null(mime) && mime == "application/pdf"
  if (is_pdf) {
    # PDF documents: processing would happen here.
    return(result)
  }
  # Anything else (including a missing MIME type) passes straight through.
  result
}
# Register the post-processor under the name "pdf_only".
register_post_processor("pdf_only", pdf_only_processor)

# Extract with post-processing enabled, then decode the JSON result.
pipeline_config <- list(postprocessor = list(enabled = TRUE))
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", pipeline_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

content_chars <- nchar(extraction$content)
cat(sprintf("Processed PDF: %d characters\n", content_chars))
```

View File

@@ -0,0 +1,25 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Example extractor that turns a JSON file into newline-separated text.
#
# path:      path of the JSON file to read.
# mime_type: accepted for the extractor signature; not used by the body.
#
# Returns a list with `content` (every leaf value of the parsed JSON, one per
# line), a fixed `mime_type` of "application/json", `pages` (always 1L) and
# `metadata` naming this extractor.
custom_json_extractor <- function(path, mime_type) {
  json_text <- paste(readLines(path, warn = FALSE), collapse = "\n")
  # unlist() flattens the parsed structure into a vector of its leaf values.
  leaf_values <- unlist(jsonlite::fromJSON(json_text))
  list(
    content = paste(leaf_values, collapse = "\n"),
    mime_type = "application/json",
    pages = 1L,
    metadata = list(extractor = "custom-json-extractor")
  )
}
# Register the extractor under the name "custom-json-extractor".
register_document_extractor("custom-json-extractor", custom_json_extractor)

# NOTE(review): the other snippets in this documentation set decode the value
# returned by extract_file_sync() with jsonlite::fromJSON() before reading
# fields from it; this snippet now does the same. Confirm the return type.
json <- extract_file_sync("data.json", "application/json", NULL)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted %d characters from JSON\n", nchar(result$content)))
```

View File

@@ -0,0 +1,30 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Post-processor that logs one line per processed result via message().
#
# result: extraction result; `result$mime_type` and `result$content` are read.
#
# Returns `result` unchanged.
logging_processor <- function(result) {
  # Explicit NULL check instead of `%||%`, which base R only provides from
  # version 4.4 onwards.
  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type
  # With NULL content nchar() returns integer(0), which would make sprintf()
  # return character(0) and drop the log line. Summing yields 0L instead.
  content_chars <- sum(nchar(result$content), na.rm = TRUE)
  message(sprintf(
    "[plugin] processing mime=%s content_chars=%d",
    mime, content_chars
  ))
  result
}
# Validator that logs the MIME type via message() and always passes.
#
# result: extraction result; only `result$mime_type` is read.
#
# Returns list(valid = TRUE, message = "ok").
logging_validator <- function(result) {
  # Explicit NULL check instead of `%||%`, which base R only provides from
  # version 4.4 onwards.
  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type
  message(sprintf("[plugin] validating mime=%s", mime))
  list(valid = TRUE, message = "ok")
}
# Register both logging plugins under their own names.
register_post_processor("logging_processor", logging_processor)
register_validator("logging_validator", logging_validator)

# Extract with post-processing enabled, then decode the JSON result.
pipeline_config <- list(postprocessor = list(enabled = TRUE))
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", pipeline_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

content_chars <- nchar(extraction$content)
cat(sprintf("Done: %d characters\n", content_chars))
```

View File

@@ -0,0 +1,34 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
library(testthat)
# Post-processor that upper-cases the extracted text.
#
# result: extraction result; `result$content` is read and replaced.
#
# Returns `result` with `content` converted by toupper(); every other field
# is left as it was.
uppercase_processor <- function(result) {
  shouted <- toupper(result$content)
  result$content <- shouted
  result
}
test_that("uppercase processor uppercases content", {
  # Call the processor directly on a hand-built result; no extraction runs.
  input <- list(
    content = "hello world",
    mime_type = "text/plain",
    metadata = list()
  )
  output <- uppercase_processor(input)
  expect_equal(output$content, "HELLO WORLD")
})
test_that("post processor registers and runs", {
  register_post_processor("uppercase", uppercase_processor)
  # Schedule removal of the registration for when this block exits.
  on.exit(unregister_post_processor("uppercase"), add = TRUE)

  pipeline_config <- list(postprocessor = list(enabled = TRUE))
  payload <- charToRaw("hello world")
  raw_json <- extract_bytes_sync(payload, "text/plain", pipeline_config)
  extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

  expect_match(extraction$content, "HELLO WORLD", fixed = TRUE)
})
```

View File

@@ -0,0 +1,23 @@
```r title="R"
library(kreuzberg)
# Validator that requires at least 100 characters of extracted content.
#
# result: extraction result; only `result$content` is read.
#
# Returns a list with `valid` (TRUE/FALSE) and a human-readable `message`.
# Missing (NULL) or NA content is counted as zero characters, so it fails
# validation instead of raising an error in the `if` condition.
min_content_validator <- function(result) {
  min_length <- 100L
  # sum(..., na.rm = TRUE) collapses NULL / NA content to 0L and keeps the
  # value an integer, which the "%d" format below requires.
  content_length <- sum(nchar(result$content), na.rm = TRUE)
  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d",
        content_length, min_length
      )
    ))
  }
  list(valid = TRUE, message = "Content validation passed")
}
# Register the validator under the name "min_content".
register_validator("min_content", min_content_validator)

# Extract with the default configuration, then decode the JSON result.
extraction_config <- ExtractionConfig$default()
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", extraction_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

content_chars <- nchar(extraction$content)
cat(sprintf("Content length: %d characters\n", content_chars))
```

View File

@@ -0,0 +1,29 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Validator that requires a metadata quality score of at least 0.5.
#
# result: extraction result; only `result$metadata$quality_score` is read.
#
# Returns a list with `valid` (TRUE/FALSE) and a human-readable `message`.
# A missing score counts as 0. A score that cannot be read as a single number
# (non-numeric text, NA, several values) also counts as 0, so it fails
# validation instead of raising an error in the `if` condition.
quality_score_validator <- function(result) {
  min_score <- 0.5
  raw_score <- result$metadata$quality_score
  # Explicit NULL check instead of `%||%`, which base R only provides from
  # version 4.4 onwards. The coercion warning is suppressed because the NA it
  # produces is handled on the next line.
  score <- if (is.null(raw_score)) {
    0
  } else {
    suppressWarnings(as.numeric(raw_score))
  }
  if (length(score) != 1L || is.na(score)) {
    score <- 0
  }
  if (score < min_score) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Quality score too low: %.2f < %.2f",
        score, min_score
      )
    ))
  }
  list(valid = TRUE, message = "Quality score validation passed")
}
# Register the validator under the name "quality_score".
register_validator("quality_score", quality_score_validator)

# Extract with the default configuration, then decode the JSON result.
extraction_config <- ExtractionConfig$default()
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", extraction_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

content_chars <- nchar(extraction$content)
cat(sprintf("Validated extraction: %d characters\n", content_chars))
```

View File

@@ -0,0 +1,27 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Build a plugin that counts how many results it has processed.
#
# The counter lives in a private environment, which both returned functions
# share, so updates made by `process` are visible to `count` across calls.
#
# Returns a list with two functions:
#   process(result) - increments the counter and returns `result` unchanged.
#   count()         - returns the current counter value (starts at 0L).
make_stateful_plugin <- function() {
  counter <- new.env(parent = emptyenv())
  counter$count <- 0L
  list(
    process = function(result) {
      counter$count <- counter$count + 1L
      result
    },
    count = function() counter$count
  )
}
# Create one plugin instance and register its `process` function under the
# name "stateful_counter".
counter_plugin <- make_stateful_plugin()
register_post_processor("stateful_counter", counter_plugin$process)

# Extract with post-processing enabled, then report the counter value.
pipeline_config <- list(postprocessor = list(enabled = TRUE))
extract_file_sync("document.pdf", "application/pdf", pipeline_config)
cat(sprintf("Processed: %d\n", counter_plugin$count()))
```

View File

@@ -0,0 +1,11 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Remove plugins by their registered name.
# Each call passes the name string a plugin was registered under, one call
# per plugin kind (post-processor, validator, OCR backend, document extractor).
# NOTE(review): what happens when a name is not currently registered is not
# visible in this snippet - confirm against the package documentation.
unregister_post_processor("metadata_enrichment")
unregister_validator("min_length")
unregister_ocr_backend("custom_ocr_backend")
unregister_document_extractor("custom_format")
```

View File

@@ -0,0 +1,20 @@
<!-- snippet:syntax-only -->
```r title="R"
library(kreuzberg)
# Post-processor that stores the number of words in the extracted text.
#
# result: extraction result; `result$content` is read and `result$metadata`
#         is extended.
#
# Returns `result` with `metadata$word_count` set to an integer count of
# whitespace-separated words. Missing (NULL) or NA content counts as 0 words.
word_count_processor <- function(result) {
  content <- result$content
  if (length(content) == 0L || is.na(content[[1]])) {
    content <- ""
  }
  tokens <- strsplit(content[[1]], "\\s+")[[1]]
  # strsplit() yields an empty first token when the text starts with
  # whitespace; counting only non-empty tokens keeps that from inflating the
  # word count.
  word_count <- sum(nzchar(tokens))

  metadata <- result$metadata
  if (is.null(metadata)) {
    metadata <- list()
  }
  # Assigning by name replaces an existing word_count entry instead of
  # appending a second one.
  metadata[["word_count"]] <- word_count
  result$metadata <- metadata
  result
}
# Register the post-processor under the name "word_count".
register_post_processor("word_count", word_count_processor)

# Extract with post-processing enabled, then decode the JSON result.
pipeline_config <- list(postprocessor = list(enabled = TRUE))
raw_json <- extract_file_sync(
  "document.pdf", "application/pdf", pipeline_config
)
extraction <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

reported_words <- extraction$metadata$word_count
cat(sprintf("Word count: %d\n", reported_words))
```