This commit is contained in:
19
docs/snippets/r/plugins/clear_plugins.md
Normal file
19
docs/snippets/r/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Clear all custom OCR backends
|
||||
clear_ocr_backends()
|
||||
cat("OCR backends cleared\n")
|
||||
|
||||
# Clear all custom validators
|
||||
clear_validators()
|
||||
cat("Validators cleared\n")
|
||||
|
||||
# Clear all custom post-processors
|
||||
clear_post_processors()
|
||||
cat("Post-processors cleared\n")
|
||||
|
||||
# Clear all custom document extractors
|
||||
clear_document_extractors()
|
||||
cat("Document extractors cleared\n")
|
||||
```
|
||||
15
docs/snippets/r/plugins/custom_ocr_backend.md
Normal file
15
docs/snippets/r/plugins/custom_ocr_backend.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# List available OCR backends
|
||||
backends <- list_ocr_backends()
|
||||
cat("Available backends:", paste(backends, collapse = ", "), "\n")
|
||||
|
||||
# List registered post-processors
|
||||
processors <- list_post_processors()
|
||||
cat("Post-processors:", paste(processors, collapse = ", "), "\n")
|
||||
|
||||
# Clear custom post-processors and validators
|
||||
clear_post_processors()
|
||||
clear_validators()
|
||||
```
|
||||
29
docs/snippets/r/plugins/embedding_backend.md
Normal file
29
docs/snippets/r/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Wrap an already-loaded embedder (e.g. an ONNX session) so kreuzberg can
|
||||
# call back into it during chunking and standalone embed requests.
|
||||
# Plugin description: identifying fields plus an `embed` callback that maps a
# character vector of texts to a list with one numeric vector per text.
my_embedder <- list(
  name = "my-embedder",
  version = "1.0.0",
  dimensions = 768L,
  embed = function(texts) {
    # Placeholder: a real backend would call the already-loaded host model
    # here. This stub returns one all-zero vector of length 768 per text.
    lapply(texts, function(text) numeric(768))
  }
)
|
||||
|
||||
register_embedding_backend(my_embedder)
|
||||
|
||||
config <- list(
|
||||
embedding = list(
|
||||
model = list(type = "plugin", name = "my-embedder"),
|
||||
max_embed_duration_secs = 30L
|
||||
)
|
||||
)
|
||||
|
||||
vectors <- embed_texts(c("Hello, world!", "Second text"), config)
|
||||
cat(sprintf("Generated %d embedding vectors\n", length(vectors)))
|
||||
```
|
||||
20
docs/snippets/r/plugins/extractor_registration.md
Normal file
20
docs/snippets/r/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Example extractor: builds a result list describing the input instead of
# reading the file.
#
# path      Path of the document being extracted.
# mime_type MIME type reported for the document; echoed back in the result.
#
# Returns a list with `content`, `mime_type` and `pages` (always 1L).
custom_extractor <- function(path, mime_type) {
  list(
    content = sprintf("Extracted from %s (%s)", path, mime_type),
    mime_type = mime_type,
    pages = 1L
  )
}
|
||||
|
||||
register_document_extractor("custom_format", custom_extractor)
|
||||
|
||||
# extract_file_sync() returns JSON text; decode it before reading fields.
json <- extract_file_sync("custom_document.xyz", "application/custom", NULL)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat("Custom extractor result:\n")
cat(sprintf("Content: %s\n", result$content))
cat(sprintf("Mime type: %s\n", result$mime_type))
|
||||
```
|
||||
15
docs/snippets/r/plugins/list_plugins.md
Normal file
15
docs/snippets/r/plugins/list_plugins.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
ocr_backends <- list_ocr_backends()
|
||||
cat(sprintf("OCR backends: %s\n", paste(ocr_backends, collapse=", ")))
|
||||
|
||||
validators <- list_validators()
|
||||
cat(sprintf("Validators: %s\n", paste(validators, collapse=", ")))
|
||||
|
||||
post_processors <- list_post_processors()
|
||||
cat(sprintf("Post-processors: %s\n", paste(post_processors, collapse=", ")))
|
||||
|
||||
extractors <- list_document_extractors()
|
||||
cat(sprintf("Document extractors: %s\n", paste(extractors, collapse=", ")))
|
||||
```
|
||||
27
docs/snippets/r/plugins/min_length_validator.md
Normal file
27
docs/snippets/r/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,27 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Validator that rejects results whose content has fewer than 50 characters.
#
# result  List-like extraction result; only its `content` field is read.
#
# Returns a list with `valid` (logical) and `message` (character).
min_length_validator <- function(result) {
  min_length <- 50L
  # Treat missing content as empty: nchar(NULL) is integer(0), which would
  # make the comparison below fail with "argument is of length zero".
  content_length <- if (is.null(result$content)) 0L else nchar(result$content)

  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d characters",
        content_length, min_length
      )
    ))
  }
  list(valid = TRUE, message = "Content length validation passed")
}
|
||||
|
||||
register_validator("min_length", min_length_validator)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
22
docs/snippets/r/plugins/pdf_metadata_extractor.md
Normal file
22
docs/snippets/r/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor that prints every metadata entry of a result and returns the
# result unchanged.
#
# result  List-like extraction result; only its `metadata` field is read.
#
# Returns `result` as received.
extract_pdf_metadata <- function(result) {
  metadata <- result$metadata
  if (!is.null(metadata)) {
    cat("PDF Metadata:\n")
    for (key in names(metadata)) {
      # Values can be vectors or nested lists; sprintf("%s") errors on lists
      # and recycles over vectors, so flatten each value to a single string.
      value <- toString(unlist(metadata[[key]]))
      cat(sprintf(" %s: %s\n", key, value))
    }
  }
  result
}
|
||||
|
||||
register_post_processor("pdf_metadata", extract_pdf_metadata)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extraction complete: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
21
docs/snippets/r/plugins/pdf_only_processor.md
Normal file
21
docs/snippets/r/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor skeleton showing how to restrict work to PDF results.
#
# result  List-like extraction result; only its `mime_type` field is read.
#
# Returns `result` unchanged in every case.
pdf_only_processor <- function(result) {
  is_pdf <- !is.null(result$mime_type) &&
    result$mime_type == "application/pdf"

  # Anything that is not a PDF is handed back immediately.
  if (!is_pdf) {
    return(result)
  }

  # The PDF branch currently also passes the result through untouched.
  result
}
|
||||
|
||||
register_post_processor("pdf_only", pdf_only_processor)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Processed PDF: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
25
docs/snippets/r/plugins/plugin_extractor.md
Normal file
25
docs/snippets/r/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,25 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Example extractor for JSON files: parses the file and joins every leaf value
# into newline-separated text.
#
# path      Path of the JSON file to read.
# mime_type Accepted for the extractor signature but not used; the result
#           always reports "application/json".
#
# Returns a list with `content`, `mime_type`, `pages` and `metadata`.
custom_json_extractor <- function(path, mime_type) {
  json_text <- paste(readLines(path, warn = FALSE), collapse = "\n")
  leaf_values <- unlist(jsonlite::fromJSON(json_text))

  list(
    content = paste(leaf_values, collapse = "\n"),
    mime_type = "application/json",
    pages = 1L,
    metadata = list(extractor = "custom-json-extractor")
  )
}
|
||||
|
||||
register_document_extractor("custom-json-extractor", custom_json_extractor)
|
||||
|
||||
# extract_file_sync() returns JSON text; decode it before reading fields.
json <- extract_file_sync("data.json", "application/json", NULL)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted %d characters from JSON\n", nchar(result$content)))
|
||||
```
|
||||
30
docs/snippets/r/plugins/plugin_logging.md
Normal file
30
docs/snippets/r/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,30 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor that logs the MIME type and content length of a result via
# message() and returns the result unchanged.
#
# result  List-like extraction result; `mime_type` and `content` are read.
#
# Returns `result` as received.
logging_processor <- function(result) {
  # Explicit NULL checks instead of `%||%`, which base R only ships from
  # 4.4.0, so the snippet also runs on older R versions.
  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type
  # Missing content is logged as 0 characters rather than an empty log line.
  n_chars <- if (is.null(result$content)) 0L else nchar(result$content)

  message(sprintf(
    "[plugin] processing mime=%s content_chars=%d",
    mime, n_chars
  ))
  result
}
|
||||
|
||||
# Validator that logs the MIME type of a result via message() and always
# reports the result as valid.
#
# result  List-like extraction result; only its `mime_type` field is read.
#
# Returns list(valid = TRUE, message = "ok").
logging_validator <- function(result) {
  # Explicit NULL check instead of `%||%`, which base R only ships from
  # 4.4.0, so the snippet also runs on older R versions.
  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type

  message(sprintf("[plugin] validating mime=%s", mime))
  list(valid = TRUE, message = "ok")
}
|
||||
|
||||
register_post_processor("logging_processor", logging_processor)
|
||||
register_validator("logging_validator", logging_validator)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Done: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
34
docs/snippets/r/plugins/plugin_testing.md
Normal file
34
docs/snippets/r/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,34 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
library(testthat)
|
||||
|
||||
# Post-processor that upper-cases the `content` field of a result.
#
# result  List-like extraction result with a `content` field.
#
# Returns the result with `content` replaced by its upper-case form.
uppercase_processor <- function(result) {
  shouted <- toupper(result$content)
  result$content <- shouted
  result
}
|
||||
|
||||
# Unit test: call the processor directly on a hand-built result, without
# registering it.
test_that("uppercase processor uppercases content", {
  input <- list(
    content = "hello world",
    mime_type = "text/plain",
    metadata = list()
  )

  output <- uppercase_processor(input)

  expect_equal(output$content, "HELLO WORLD")
})
|
||||
|
||||
# Integration test: register the processor, run an extraction over in-memory
# bytes, and check the decoded content.
test_that("post processor registers and runs", {
  register_post_processor("uppercase", uppercase_processor)
  # Unregister on exit so the registration does not outlive this test.
  on.exit(unregister_post_processor("uppercase"), add = TRUE)

  config <- list(postprocessor = list(enabled = TRUE))
  payload <- charToRaw("hello world")
  json <- extract_bytes_sync(payload, "text/plain", config)
  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

  expect_match(result$content, "HELLO WORLD", fixed = TRUE)
})
|
||||
```
|
||||
23
docs/snippets/r/plugins/plugin_validator.md
Normal file
23
docs/snippets/r/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Validator that rejects results whose content has fewer than 100 characters.
#
# result  List-like extraction result; only its `content` field is read.
#
# Returns a list with `valid` (logical) and `message` (character).
min_content_validator <- function(result) {
  min_length <- 100L
  # Treat missing content as empty: nchar(NULL) is integer(0), which would
  # make the comparison below fail with "argument is of length zero".
  content_length <- if (is.null(result$content)) 0L else nchar(result$content)

  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d",
        content_length, min_length
      )
    ))
  }
  list(valid = TRUE, message = "Content validation passed")
}
|
||||
|
||||
register_validator("min_content", min_content_validator)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
29
docs/snippets/r/plugins/quality_score_validator.md
Normal file
29
docs/snippets/r/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Validator that rejects results whose `metadata$quality_score` is below 0.5.
#
# result  List-like extraction result; only `metadata$quality_score` is read.
#
# Returns a list with `valid` (logical) and `message` (character).
quality_score_validator <- function(result) {
  min_score <- 0.5

  # Explicit NULL check instead of `%||%`, which base R only ships from 4.4.0.
  raw_score <- result$metadata$quality_score
  score <- if (is.null(raw_score)) 0 else as.numeric(raw_score)
  # An NA or non-scalar score would make the comparison below error; treat it
  # like a missing score so the result is rejected with a normal message.
  if (length(score) != 1L || is.na(score)) {
    score <- 0
  }

  if (score < min_score) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Quality score too low: %.2f < %.2f",
        score, min_score
      )
    ))
  }
  list(valid = TRUE, message = "Quality score validation passed")
}
|
||||
|
||||
register_validator("quality_score", quality_score_validator)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Validated extraction: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
27
docs/snippets/r/plugins/stateful_plugin.md
Normal file
27
docs/snippets/r/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,27 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Encapsulate mutable counter state in an environment so the plugin function
|
||||
# can update it across calls.
|
||||
# Factory for a post-processor that counts how often it has been called.
#
# The counter lives in a private environment created per call, so each plugin
# returned by this factory has its own independent count.
#
# Returns a list with:
#   process  function(result) that increments the counter and returns `result`
#   count    function() that returns the current counter value
make_stateful_plugin <- function() {
  counter <- new.env(parent = emptyenv())
  assign("count", 0L, envir = counter)

  bump_and_pass_through <- function(result) {
    counter$count <- counter$count + 1L
    result
  }

  read_count <- function() counter$count

  list(process = bump_and_pass_through, count = read_count)
}
|
||||
|
||||
plugin <- make_stateful_plugin()
|
||||
register_post_processor("stateful_counter", plugin$process)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
extract_file_sync("document.pdf", "application/pdf", config)
|
||||
|
||||
cat(sprintf("Processed: %d\n", plugin$count()))
|
||||
```
|
||||
11
docs/snippets/r/plugins/unregister_plugins.md
Normal file
11
docs/snippets/r/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Remove plugins by their registered name.
|
||||
unregister_post_processor("metadata_enrichment")
|
||||
unregister_validator("min_length")
|
||||
unregister_ocr_backend("custom_ocr_backend")
|
||||
unregister_document_extractor("custom_format")
|
||||
```
|
||||
20
docs/snippets/r/plugins/word_count_processor.md
Normal file
20
docs/snippets/r/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor that stores the number of whitespace-separated words of
# `content` in `metadata$word_count`.
#
# result  List-like extraction result; `content` is read, `metadata` updated.
#
# Returns the result with `metadata$word_count` set to an integer count.
word_count_processor <- function(result) {
  # Missing content counts as zero words instead of erroring in strsplit().
  content <- if (is.null(result$content)) "" else result$content
  tokens <- strsplit(content, "\\s+")[[1]]
  # strsplit() yields an empty first token when the text starts with
  # whitespace; count only non-empty tokens so it is not tallied as a word.
  word_count <- sum(nzchar(tokens))

  # Assign by name so an existing `word_count` entry is replaced rather than
  # duplicated.
  result$metadata$word_count <- word_count
  result
}
|
||||
|
||||
register_post_processor("word_count", word_count_processor)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Word count: %d\n", result$metadata$word_count))
|
||||
```
|
||||
Reference in New Issue
Block a user