This commit is contained in:
13
docs/snippets/r/advanced/batch_extraction.md
Normal file
13
docs/snippets/r/advanced/batch_extraction.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# Batch extract from multiple files.
files <- c("report.pdf", "slides.pptx", "data.xlsx")

# batch_extract_files_sync() is called the same way as in the
# api/batch_extract_files_sync snippet: a JSON array of {path} items plus a
# config go in, and a JSON string comes back that must be parsed before use.
items <- jsonlite::toJSON(
  lapply(files, function(path) list(path = path)),
  auto_unbox = TRUE
)
json <- batch_extract_files_sync(
  items = items,
  config = ExtractionConfig$default()
)
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Results are reported by position, paired with the input path at that index.
for (i in seq_along(results)) {
  cat(sprintf("File: %s\n", files[i]))
  cat(sprintf(" MIME: %s\n", results[[i]]$mime_type))
  cat(sprintf(" Length: %d chars\n\n", nchar(results[[i]]$content)))
}
|
||||
```
|
||||
27
docs/snippets/r/advanced/chunk_page_mapping.md
Normal file
27
docs/snippets/r/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Request chunking together with per-page extraction.
config <- list(
  chunking = list(max_characters = 500L, overlap = 50L),
  pages = list(extract_pages = TRUE)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

for (idx in seq_along(result$chunks)) {
  chunk <- result$chunks[[idx]]
  meta <- result$chunk_metadata[[idx]]

  # Only chunks that carry both page bounds are printed.
  if (is.null(meta$first_page) || is.null(meta$last_page)) {
    next
  }

  first <- meta$first_page
  last <- meta$last_page
  page_range <- if (first != last) {
    sprintf("Pages %d-%d", first, last)
  } else {
    sprintf("Page %d", first)
  }

  # Show at most the first 50 characters of the chunk.
  preview <- substr(chunk, 1L, min(50L, nchar(chunk)))
  cat(sprintf("Chunk: %s... (%s)\n", preview, page_range))
}
|
||||
```
|
||||
37
docs/snippets/r/advanced/chunking_config.md
Normal file
37
docs/snippets/r/advanced/chunking_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(max_characters = 1000L, overlap = 200L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Chunks produced: %d\n", length(result$chunks)))
|
||||
for (i in seq_len(min(3L, length(result$chunks)))) {
|
||||
cat(sprintf("Chunk %d length: %d characters\n", i, nchar(result$chunks[[i]])))
|
||||
}
|
||||
```
|
||||
|
||||
```r title="R - Prepend Heading Context"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 500L,
|
||||
overlap = 50L,
|
||||
chunker_type = "markdown",
|
||||
prepend_heading_context = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.md", "text/markdown", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
for (i in seq_len(min(3L, length(result$chunks)))) {
|
||||
chunk <- result$chunks[[i]]
|
||||
preview <- substr(chunk, 1L, min(100L, nchar(chunk)))
|
||||
cat(sprintf("%s\n", preview))
|
||||
}
|
||||
```
|
||||
18
docs/snippets/r/advanced/chunking_rag.md
Normal file
18
docs/snippets/r/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(max_characters = 800L, overlap = 150L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Total chunks: %d\n", length(result$chunks)))
|
||||
cat("Processing chunks for RAG pipeline:\n")
|
||||
|
||||
for (i in seq_len(min(3L, length(result$chunks)))) {
|
||||
chunk <- result$chunks[[i]]
|
||||
cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunk)))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/r/advanced/embedding_with_chunking.md
Normal file
23
docs/snippets/r/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(max_characters = 1000L, overlap = 200L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Preparing %d chunks for embedding:\n", length(result$chunks)))
|
||||
|
||||
# Build one record per chunk. lapply() allocates the result list once instead
# of growing it element by element inside a for loop.
embeddings_data <- lapply(seq_along(result$chunks), function(i) {
  chunk_text <- result$chunks[[i]]
  list(
    chunk_id = i,
    text = chunk_text,
    length = nchar(chunk_text)
  )
})

cat(sprintf("Ready to embed %d chunks\n", length(embeddings_data)))
|
||||
```
|
||||
18
docs/snippets/r/advanced/keyword_extraction_config.md
Normal file
18
docs/snippets/r/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
keywords = list(
|
||||
algorithm = "yake",
|
||||
max_keywords = 10L,
|
||||
min_score = 0.3,
|
||||
ngram_range = c(1L, 3L),
|
||||
language = "en"
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Keywords extracted: %d\n", length(result$keywords)))
|
||||
```
|
||||
19
docs/snippets/r/advanced/keyword_extraction_example.md
Normal file
19
docs/snippets/r/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Turn on keyword extraction with its default settings.
config <- list(keywords = list(enabled = TRUE))

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

keywords <- result$keywords
cat(sprintf("Keywords extracted: %d\n", length(keywords)))

# List up to the first ten keywords, numbered from 1.
top_keywords <- head(keywords, 10L)
if (length(top_keywords) > 0) {
  cat("Top keywords:\n")
  for (rank in seq_along(top_keywords)) {
    cat(sprintf(" %d. %s\n", rank, top_keywords[[rank]]))
  }
}
|
||||
```
|
||||
22
docs/snippets/r/advanced/language_detection_config.md
Normal file
22
docs/snippets/r/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
language_detection = list(
|
||||
enabled = TRUE,
|
||||
min_confidence = 0.8,
|
||||
detect_multiple = FALSE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
if (length(result$detected_languages) > 0) {
|
||||
cat(sprintf("Detected language: %s\n", result$detected_languages[[1]]))
|
||||
} else {
|
||||
cat("No language detected\n")
|
||||
}
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
13
docs/snippets/r/advanced/language_detection_multilingual.md
Normal file
13
docs/snippets/r/advanced/language_detection_multilingual.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
files <- c("english.pdf", "spanish.pdf", "french.pdf")
config <- list(language_detection = list(enabled = TRUE))

for (file in files) {
  json <- extract_file_sync(file, "application/pdf", config)
  result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

  # The advanced/language_detection_config snippet reads detections from the
  # list field `detected_languages`, so the same field is used here. When it
  # is empty or missing, sprintf() would return character(0) and the line
  # would silently vanish, hence the explicit "unknown" fallback.
  languages <- unlist(result$detected_languages)
  label <- if (length(languages) > 0) {
    paste(languages, collapse = ", ")
  } else {
    "unknown"
  }

  cat(sprintf("%s: detected language = %s\n", file, label))
}
|
||||
```
|
||||
10
docs/snippets/r/advanced/quality_processing_config.md
Normal file
10
docs/snippets/r/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(enable_quality_processing = TRUE)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) for a NULL argument, so a result without a
# quality score would print nothing at all; report that case explicitly.
score <- result$quality_score
if (is.null(score)) {
  cat("Quality score: not available\n")
} else {
  cat(sprintf("Quality score: %.2f\n", score))
}
|
||||
```
|
||||
13
docs/snippets/r/advanced/quality_processing_example.md
Normal file
13
docs/snippets/r/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(enable_quality_processing = TRUE)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat("Quality Metrics:\n")

# sprintf() returns character(0) for a NULL argument, so a missing score would
# drop this line from the report; print an explicit placeholder instead.
score <- result$quality_score
if (is.null(score)) {
  cat("Quality Score: not available\n")
} else {
  cat(sprintf("Quality Score: %.2f\n", score))
}

cat(sprintf("Content Length: %d characters\n", nchar(result$content)))
cat(sprintf("Pages: %d\n", length(result$pages)))
|
||||
```
|
||||
17
docs/snippets/r/advanced/token_reduction_config.md
Normal file
17
docs/snippets/r/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(
|
||||
mode = "moderate",
|
||||
preserve_markdown = TRUE,
|
||||
preserve_code = TRUE,
|
||||
language_hint = "eng"
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Reduced content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
14
docs/snippets/r/advanced/token_reduction_example.md
Normal file
14
docs/snippets/r/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(enabled = TRUE)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat("Token-reduced content:\n")
|
||||
cat(sprintf("Length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("Preview: %.60s...\n", result$content))
|
||||
```
|
||||
24
docs/snippets/r/advanced/vector_database_integration.md
Normal file
24
docs/snippets/r/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Build vector-store records for the first three chunks (or fewer). The
# records are kept in `vector_docs` so they remain available after the loop,
# rather than being overwritten and discarded on every iteration.
sample_idx <- seq_len(min(3L, length(result$chunks)))
vector_docs <- lapply(sample_idx, function(i) {
  chunk <- result$chunks[[i]]
  list(
    id = sprintf("doc_%d", i),
    text = chunk,
    metadata = list(
      source = "document.pdf",
      chunk_index = i,
      length = nchar(chunk)
    )
  )
})

for (i in sample_idx) {
  entry <- vector_docs[[i]]
  cat(sprintf("Vector DB entry %d: %d chars\n", i, entry$metadata$length))
}
|
||||
```
|
||||
19
docs/snippets/r/api/batch_extract_bytes_sync.md
Normal file
19
docs/snippets/r/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
paths <- c("report.pdf", "notes.txt")
mimes <- c("application/pdf", "text/plain")

# Read one file fully into memory and pair its bytes (as integers) with the
# MIME type supplied for it.
read_item <- function(path, mime) {
  raw_bytes <- readBin(path, what = "raw", n = file.size(path))
  list(content = as.integer(raw_bytes), mime_type = mime)
}

# USE.NAMES = FALSE keeps the list unnamed so it serializes as a JSON array.
batch <- mapply(read_item, paths, mimes, SIMPLIFY = FALSE, USE.NAMES = FALSE)
items <- jsonlite::toJSON(batch, auto_unbox = TRUE)

json <- batch_extract_bytes_sync(
  items = items,
  config = ExtractionConfig$default()
)
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)

for (idx in seq_along(results)) {
  entry <- results[[idx]]
  cat(sprintf(
    "[%d] mime=%s chars=%d\n",
    idx, entry$mime_type, nchar(entry$content)
  ))
}
|
||||
```
|
||||
17
docs/snippets/r/api/batch_extract_files_sync.md
Normal file
17
docs/snippets/r/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Describe each input file as a {path} item and serialize the batch to JSON.
paths <- c("report.pdf", "slides.pptx", "data.xlsx")
items <- jsonlite::toJSON(
  lapply(paths, function(p) list(path = p)),
  auto_unbox = TRUE
)

# The call returns a JSON string; parse it into nested lists.
results <- jsonlite::fromJSON(
  batch_extract_files_sync(items = items, config = ExtractionConfig$default()),
  simplifyVector = FALSE
)

for (idx in seq_along(results)) {
  entry <- results[[idx]]
  cat(sprintf(
    "[%d] mime=%s chars=%d\n",
    idx, entry$mime_type, nchar(entry$content)
  ))
}
|
||||
```
|
||||
29
docs/snippets/r/api/client_chunk_text.md
Normal file
29
docs/snippets/r/api/client_chunk_text.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
library(httr2)
|
||||
|
||||
# Request body for the /chunk endpoint: the text plus chunker settings.
chunk_settings <- list(
  max_characters = 1000,
  overlap = 50,
  trim = TRUE
)
payload <- list(
  text = "Your long text content here...",
  chunker_type = "text",
  config = chunk_settings
)

# Build the request step by step, then send it and decode the JSON reply.
chunk_request <- request("http://localhost:8000/chunk")
chunk_request <- req_method(chunk_request, "POST")
chunk_request <- req_body_json(chunk_request, payload)
result <- resp_body_json(req_perform(chunk_request))

cat(sprintf("Created %d chunks\n", result$chunk_count))
for (entry in result$chunks) {
  # Print the chunk index followed by its first 50 characters.
  cat(sprintf(
    "Chunk %d: %s...\n",
    entry$chunk_index, substr(entry$content, 1, 50)
  ))
}
|
||||
```
|
||||
18
docs/snippets/r/api/client_extract_single_file.md
Normal file
18
docs/snippets/r/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,18 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
library(httr2)
|
||||
|
||||
# Upload the document as multipart/form-data under the "files" field.
# httr2 exposes multipart bodies through req_body_multipart(); a file on disk
# is attached with curl::form_file(), which also sets its content type.
# (httr2 has no req_multipart_part() function.)
response <- request("http://localhost:8000/extract") |>
  req_method("POST") |>
  req_body_multipart(
    files = curl::form_file("document.pdf", type = "application/pdf")
  ) |>
  req_perform()

# Decode the JSON response and pretty-print it.
data <- resp_body_json(response)
cat(jsonlite::toJSON(data, auto_unbox = TRUE, pretty = TRUE))
|
||||
```
|
||||
34
docs/snippets/r/api/combining_all_features.md
Normal file
34
docs/snippets/r/api/combining_all_features.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config_json <- jsonlite::toJSON(list(
|
||||
output_format = "markdown",
|
||||
force_ocr = TRUE,
|
||||
extract_tables = TRUE,
|
||||
extract_metadata = TRUE,
|
||||
ocr = list(
|
||||
backend = "tesseract",
|
||||
language = "eng",
|
||||
dpi = 300L
|
||||
),
|
||||
chunking = list(
|
||||
chunker_type = "markdown",
|
||||
max_characters = 1000L,
|
||||
overlap = 200L
|
||||
)
|
||||
), auto_unbox = TRUE)
|
||||
|
||||
config <- ExtractionConfig$from_json(config_json)
|
||||
|
||||
json <- extract_file_sync(
|
||||
path = "scanned_report.pdf",
|
||||
mime_type = "application/pdf",
|
||||
config = config
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Chunks: %d\n", length(result$chunks)))
|
||||
cat(sprintf("Tables: %d\n", length(result$tables)))
|
||||
title <- if (!is.null(result$metadata$title)) result$metadata$title else "<none>"
|
||||
cat(sprintf("Title: %s\n", title))
|
||||
```
|
||||
26
docs/snippets/r/api/error_handling.md
Normal file
26
docs/snippets/r/api/error_handling.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
content <- charToRaw("Hello, world!")

# Error handler: log the failure message and yield NULL as the result.
report_failure <- function(err) {
  message(sprintf("Extraction failed: %s", conditionMessage(err)))
  NULL
}

# The MIME type below is deliberately one the extractor is asked to handle
# inside tryCatch(), so any error raised is routed to report_failure().
result <- tryCatch(
  jsonlite::fromJSON(
    extract_bytes_sync(
      content = content,
      mime_type = "application/x-nonexistent",
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  ),
  error = report_failure
)

if (!is.null(result)) {
  cat(sprintf("Extracted %d characters\n", nchar(result$content)))
} else {
  cat("No content extracted; falling back to original bytes\n")
}
|
||||
```
|
||||
35
docs/snippets/r/api/error_handling_extract.md
Normal file
35
docs/snippets/r/api/error_handling_extract.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Serialize the batch of {path} items to JSON.
paths <- c("doc1.pdf", "doc2.docx", "missing.html")
items <- jsonlite::toJSON(
  lapply(paths, function(p) list(path = p)),
  auto_unbox = TRUE
)

# Error handler: log why the whole batch failed and yield NULL.
report_failure <- function(err) {
  message(sprintf("Batch extraction failed: %s", conditionMessage(err)))
  NULL
}

result <- tryCatch(
  jsonlite::fromJSON(
    batch_extract_files_sync(
      items = items,
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  ),
  error = report_failure
)

if (is.null(result)) {
  cat("No results returned\n")
} else {
  for (idx in seq_along(result)) {
    doc <- result[[idx]]
    failure <- doc$metadata$error

    # A per-document error is read from the item's metadata; items without
    # one are summarized by content length and table count.
    if (is.null(failure)) {
      cat(sprintf(
        "Document %d: %d chars, %d tables\n",
        idx, nchar(doc$content), length(doc$tables)
      ))
    } else {
      cat(sprintf("Document %d: ERROR - %s\n", idx, failure))
    }
  }
}
|
||||
```
|
||||
18
docs/snippets/r/api/extract_bytes_async.md
Normal file
18
docs/snippets/r/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# extract_bytes is the async variant; the call blocks the calling R thread
|
||||
# until the underlying tokio task completes. Use future/promises if you need
|
||||
# to fan out without blocking.
|
||||
path <- "document.pdf"
|
||||
content <- readBin(path, what = "raw", n = file.info(path)$size)
|
||||
|
||||
json <- extract_bytes(
|
||||
content = content,
|
||||
mime_type = "application/pdf",
|
||||
config = ExtractionConfig$default()
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
|
||||
```
|
||||
16
docs/snippets/r/api/extract_bytes_sync.md
Normal file
16
docs/snippets/r/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
path <- "document.pdf"
|
||||
content <- readBin(path, what = "raw", n = file.info(path)$size)
|
||||
|
||||
json <- extract_bytes_sync(
|
||||
content = content,
|
||||
mime_type = "application/pdf",
|
||||
config = ExtractionConfig$default()
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
cat(sprintf("Content preview: %s\n", substr(result$content, 1, 200)))
|
||||
```
|
||||
15
docs/snippets/r/api/extract_file_async.md
Normal file
15
docs/snippets/r/api/extract_file_async.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# extract_file is the async variant; extendr drives the tokio runtime so the
|
||||
# call returns once extraction completes. R has no native async, so wrap with
|
||||
# the future/promises packages if non-blocking dispatch is required.
|
||||
json <- extract_file(
|
||||
path = "document.pdf",
|
||||
mime_type = "application/pdf",
|
||||
config = ExtractionConfig$default()
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extracted %d characters from %s\n", nchar(result$content), result$mime_type))
|
||||
```
|
||||
13
docs/snippets/r/api/extract_file_sync.md
Normal file
13
docs/snippets/r/api/extract_file_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
json <- extract_file_sync(
|
||||
path = "document.pdf",
|
||||
mime_type = "application/pdf",
|
||||
config = ExtractionConfig$default()
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
18
docs/snippets/r/config/advanced_config.md
Normal file
18
docs/snippets/r/config/advanced_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
ocr = list(backend = "tesseract", language = "eng"),
|
||||
chunking = list(max_characters = 1500L, overlap = 300L),
|
||||
output_format = "markdown",
|
||||
include_document_structure = TRUE,
|
||||
force_ocr = TRUE
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Format: %s\n", result$mime_type))
|
||||
cat(sprintf("Chunks: %d\n", length(result$chunks)))
|
||||
cat(sprintf("Content preview: %.50s...\n", result$content))
|
||||
```
|
||||
50
docs/snippets/r/config/chunking_config.md
Normal file
50
docs/snippets/r/config/chunking_config.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Example 1: Basic character-based chunking
|
||||
config <- list(
|
||||
chunking = list(max_characters = 1000L, overlap = 200L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
num_chunks <- length(result$chunks)
|
||||
cat(sprintf("Document split into %d chunks\n", num_chunks))
|
||||
for (i in seq_len(min(3L, num_chunks))) {
|
||||
cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]])))
|
||||
}
|
||||
```
|
||||
|
||||
```r title="R - Markdown chunker with token-based sizing"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
chunker_type = "markdown",
|
||||
sizing = list(
|
||||
type = "tokenizer",
|
||||
model = "Xenova/gpt-4o"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.md", "text/markdown", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("Markdown document split into %d chunks\n", length(result$chunks)))
|
||||
```
|
||||
|
||||
```r title="R - Prepend heading context"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
chunker_type = "markdown",
|
||||
prepend_heading_context = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.md", "text/markdown", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("Document split into %d chunks with prepended headings\n", length(result$chunks)))
|
||||
```
|
||||
33
docs/snippets/r/config/chunking_configuration.md
Normal file
33
docs/snippets/r/config/chunking_configuration.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# Configure text chunking for RAG pipelines
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 1000L,
|
||||
overlap = 200L
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("large_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat("Number of chunks:", length(result$chunks), "\n")
|
||||
```
|
||||
|
||||
```r title="R - Prepend Heading Context"
|
||||
library(kreuzberg)
|
||||
|
||||
# Prepend heading context to chunk content for structured documents
|
||||
config <- list(
|
||||
chunking = list(
|
||||
chunker_type = "markdown",
|
||||
max_characters = 500L,
|
||||
overlap = 50L,
|
||||
prepend_heading_context = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.md", "text/markdown", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat("Number of chunks:", length(result$chunks), "\n")
|
||||
```
|
||||
15
docs/snippets/r/config/config_basic.md
Normal file
15
docs/snippets/r/config/config_basic.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
output_format = "markdown"
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
cat("Content preview:\n")
|
||||
cat(substr(result$content, 1, 200))
|
||||
```
|
||||
11
docs/snippets/r/config/config_discover.md
Normal file
11
docs/snippets/r/config/config_discover.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Load configuration from a JSON file and pass it to extract_file_sync.
|
||||
config_json <- paste(readLines("kreuzberg.json"), collapse = "\n")
|
||||
config <- ExtractionConfig$from_json(config_json)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
|
||||
```
|
||||
14
docs/snippets/r/config/config_ocr.md
Normal file
14
docs/snippets/r/config/config_ocr.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng")
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted content length: %d\n", nchar(result$content)))

# Detections are read from the list field `detected_languages`, as in the
# advanced/language_detection_config snippet. This config does not enable
# language detection, so the field may be absent; sprintf() on NULL returns
# character(0) and would print nothing, hence the explicit fallback.
languages <- unlist(result$detected_languages)
label <- if (length(languages) > 0) {
  paste(languages, collapse = ", ")
} else {
  "unknown"
}
cat(sprintf("Detected language: %s\n", label))
|
||||
```
|
||||
20
docs/snippets/r/config/config_programmatic.md
Normal file
20
docs/snippets/r/config/config_programmatic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(
|
||||
backend = "tesseract",
|
||||
language = "eng"
|
||||
),
|
||||
chunking = list(
|
||||
max_characters = 2000L,
|
||||
overlap = 300L
|
||||
),
|
||||
output_format = "markdown"
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
```
|
||||
21
docs/snippets/r/config/document_structure_config.md
Normal file
21
docs/snippets/r/config/document_structure_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Ask for document structure alongside markdown output.
config <- list(
  include_document_structure = TRUE,
  output_format = "markdown"
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

pages <- result$pages
cat(
  sprintf("Total pages: %d\n", length(pages)),
  sprintf("MIME type: %s\n\n", result$mime_type),
  sep = ""
)

# Print the first 100 characters of each page, followed by a blank line.
for (page_no in seq_along(pages)) {
  page_text <- pages[[page_no]]$content
  cat(
    sprintf("Page %d structure:\n", page_no),
    sprintf(" Content: %s\n", substr(page_text, 1, 100)),
    "\n",
    sep = ""
  )
}
|
||||
```
|
||||
20
docs/snippets/r/config/element_based_output.md
Normal file
20
docs/snippets/r/config/element_based_output.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Request element-based results with markdown-formatted content.
config <- list(
  result_format = "element_based",
  output_format = "markdown"
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

elements <- result$elements
cat(sprintf("Total elements: %d\n\n", length(elements)))

# For every element, print its position, type, and first 100 characters.
for (pos in seq_along(elements)) {
  item <- elements[[pos]]
  cat(
    sprintf("Element %d:\n", pos),
    sprintf(" Type: %s\n", item$element_type),
    sprintf(" Content: %s\n\n", substr(item$content, 1, 100)),
    sep = ""
  )
}
|
||||
```
|
||||
20
docs/snippets/r/config/embedding_config.md
Normal file
20
docs/snippets/r/config/embedding_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 1000L,
|
||||
overlap = 200L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
batch_size = 16L,
|
||||
normalize = TRUE,
|
||||
show_download_progress = TRUE
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("Chunks with embeddings: %d\n", length(result$chunks)))
|
||||
```
|
||||
15
docs/snippets/r/config/html_output.md
Normal file
15
docs/snippets/r/config/html_output.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
output_format = "html",
|
||||
html_output = list(
|
||||
theme = "git_hub",
|
||||
embed_css = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(result$content) # HTML with kb-* classes
|
||||
```
|
||||
17
docs/snippets/r/config/keyword_extraction_config.md
Normal file
17
docs/snippets/r/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
keywords = list(enabled = TRUE)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extracted %d keywords\n", length(result$keywords)))
|
||||
if (length(result$keywords) > 0) {
|
||||
for (i in seq_len(min(5L, length(result$keywords)))) {
|
||||
cat(sprintf(" - %s\n", result$keywords[[i]]))
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/r/config/language_detection_config.md
Normal file
13
docs/snippets/r/config/language_detection_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
  language_detection = list(enabled = TRUE)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Detections are read from the list field `detected_languages`, as in the
# advanced/language_detection_config snippet. An empty or missing value would
# make sprintf() return character(0) and print nothing, so fall back to
# "unknown" explicitly.
languages <- unlist(result$detected_languages)
label <- if (length(languages) > 0) {
  paste(languages, collapse = ", ")
} else {
  "unknown"
}

cat(sprintf("Detected language: %s\n", label))
# "%.60s" truncates the content to its first 60 characters.
cat(sprintf("Content preview: %.60s...\n", result$content))
|
||||
```
|
||||
16
docs/snippets/r/config/ocr_configuration.md
Normal file
16
docs/snippets/r/config/ocr_configuration.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# Configure OCR with Tesseract
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(
|
||||
backend = "tesseract",
|
||||
language = "eng+deu"
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(result$content)
|
||||
```
|
||||
16
docs/snippets/r/config/ocr_dpi_config.md
Normal file
16
docs/snippets/r/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Tesseract OCR via the kreuzberg R bindings does not expose a DPI setting in
|
||||
# the high-level config; PDF rasterization DPI is determined by the pipeline.
|
||||
# This example demonstrates running Tesseract OCR end-to-end on a PDF.
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "tesseract", language = "eng")
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Characters extracted: %d\n", nchar(result$content)))
|
||||
```
|
||||
13
docs/snippets/r/config/pdf_config.md
Normal file
13
docs/snippets/r/config/pdf_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
pdf_options = list(extract_images = TRUE, extract_metadata = TRUE)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Tables extracted: %d\n", length(result$tables)))
|
||||
cat(sprintf("Content preview: %.50s...\n", result$content))
|
||||
```
|
||||
19
docs/snippets/r/config/pdf_hierarchy_config.md
Normal file
19
docs/snippets/r/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
pdf_options = list(
|
||||
extract_metadata = TRUE,
|
||||
hierarchy = list(
|
||||
enabled = TRUE,
|
||||
k_clusters = 6L,
|
||||
include_bbox = TRUE,
|
||||
ocr_coverage_threshold = 0.8
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("Pages: %d\n", length(result$pages)))
|
||||
```
|
||||
13
docs/snippets/r/config/postprocessor_config.md
Normal file
13
docs/snippets/r/config/postprocessor_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
postprocessor = list(enabled = TRUE)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("Mime type: %s\n", result$mime_type))
|
||||
```
|
||||
11
docs/snippets/r/config/quality_processing_config.md
Normal file
11
docs/snippets/r/config/quality_processing_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(enable_quality_processing = TRUE)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Quality score: %.2f\n", result$quality_score))
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
17
docs/snippets/r/config/tesseract_config.md
Normal file
17
docs/snippets/r/config/tesseract_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(
|
||||
backend = "tesseract",
|
||||
language = "eng+deu"
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Detected language: %s\n", result$detected_language))
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
13
docs/snippets/r/config/token_reduction_config.md
Normal file
13
docs/snippets/r/config/token_reduction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(enabled = TRUE)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Original content length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("Content preview: %.60s...\n", result$content))
|
||||
```
|
||||
7
docs/snippets/r/getting-started/basic_extraction.md
Normal file
7
docs/snippets/r/getting-started/basic_extraction.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract text from a PDF file
|
||||
result <- extract_file_sync("document.pdf")
|
||||
cat(result$content)
|
||||
```
|
||||
15
docs/snippets/r/getting-started/basic_usage.md
Normal file
15
docs/snippets/r/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
|
||||
json <- extract_file_sync(
|
||||
path = "document.pdf",
|
||||
mime_type = NULL,
|
||||
config = config
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(result$content)
|
||||
cat(sprintf("\nMIME Type: %s\n", result$mime_type))
|
||||
```
|
||||
14
docs/snippets/r/getting-started/extract_file.md
Normal file
14
docs/snippets/r/getting-started/extract_file.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract a file and inspect the result
|
||||
result <- extract_file_sync("document.pdf")
|
||||
|
||||
# Print result information
|
||||
cat(sprintf("MIME type: %s\n", mime_type(result)))
|
||||
cat(sprintf("Content length: %d characters\n", nchar(content(result))))
|
||||
cat(sprintf("Page count: %d\n", page_count(result)))
|
||||
|
||||
# View additional metadata
|
||||
cat(sprintf("Detected language: %s\n", detected_language(result)))
|
||||
```
|
||||
19
docs/snippets/r/getting-started/extract_with_ocr.md
Normal file
19
docs/snippets/r/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Configure OCR settings via a plain list mirroring the config JSON.
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(
|
||||
backend = "tesseract",
|
||||
language = "eng"
|
||||
)
|
||||
)
|
||||
|
||||
# Extract an image file with OCR enabled
|
||||
json <- extract_file_sync("image.png", "image/png", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat("Extracted text from image:\n")
|
||||
cat(result$content)
|
||||
```
|
||||
12
docs/snippets/r/getting-started/hello_world.md
Normal file
12
docs/snippets/r/getting-started/hello_world.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract a PDF file
|
||||
result <- extract_file_sync("example.pdf")
|
||||
|
||||
# Print a preview of the extracted content
|
||||
content_preview <- substr(content(result), 1L, 200L)
|
||||
cat("Content preview:\n")
|
||||
cat(content_preview)
|
||||
cat("\n...\n")
|
||||
```
|
||||
7
docs/snippets/r/getting-started/install_verify.md
Normal file
7
docs/snippets/r/getting-started/install_verify.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Confirm the native extension loaded by listing registered extractors
|
||||
extractors <- list_document_extractors()
|
||||
cat(sprintf("kreuzberg ready: %d document extractors registered\n", length(extractors)))
|
||||
```
|
||||
7
docs/snippets/r/getting-started/installation.md
Normal file
7
docs/snippets/r/getting-started/installation.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```r
|
||||
# Install from source (requires Rust toolchain)
|
||||
# install.packages("kreuzberg")
|
||||
|
||||
# Or install from GitHub
|
||||
# remotes::install_github("kreuzberg-dev/kreuzberg", subdir = "packages/r")
|
||||
```
|
||||
20
docs/snippets/r/getting-started/read_content.md
Normal file
20
docs/snippets/r/getting-started/read_content.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Extract a document
|
||||
result <- extract_file_sync("document.docx")
|
||||
|
||||
# Access core content fields
|
||||
cat(sprintf("MIME type: %s\n", mime_type(result)))
|
||||
cat(sprintf("Content length: %d characters\n", nchar(content(result))))
|
||||
|
||||
# Access structured data
|
||||
cat(sprintf("Number of tables: %d\n", length(result$tables)))
|
||||
cat(sprintf("Detected language: %s\n", detected_language(result)))
|
||||
|
||||
# Access metadata
|
||||
author <- metadata_field(result, "author")
|
||||
if (!is.null(author)) {
|
||||
cat(sprintf("Document author: %s\n", author))
|
||||
}
|
||||
```
|
||||
29
docs/snippets/r/llm/structured_extraction.md
Normal file
29
docs/snippets/r/llm/structured_extraction.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment.
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
schema <- list(
|
||||
type = "object",
|
||||
properties = list(
|
||||
title = list(type = "string"),
|
||||
authors = list(type = "array", items = list(type = "string")),
|
||||
date = list(type = "string")
|
||||
),
|
||||
required = c("title", "authors", "date"),
|
||||
additionalProperties = FALSE
|
||||
)
|
||||
|
||||
config <- list(
|
||||
structured_extraction = list(
|
||||
schema = schema,
|
||||
llm = list(model = "openai/gpt-4o-mini"),
|
||||
strict = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("paper.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(result$structured_output, "\n")
|
||||
```
|
||||
25
docs/snippets/r/mcp/mcp_custom_client.md
Normal file
25
docs/snippets/r/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```r title="R"
|
||||
# The kreuzberg R bindings ship no MCP client. Drive the kreuzberg CLI's
# stdio MCP transport from R by feeding the request to the CLI on stdin and
# capturing what it prints on stdout.
#
# A single `pipe()` connection cannot do both directions: in base R only file
# and socket connections can be opened for reading and writing, so the request
# goes through a temporary file and the reply is collected by system2().
#
# NOTE(review): MCP messages are JSON-RPC 2.0, hence the `jsonrpc` and `id`
# members. A full MCP session normally starts with an `initialize` exchange;
# confirm whether the server requires it before `tools/call`.
request <- list(
  jsonrpc = "2.0",
  id = 1L,
  method = "tools/call",
  params = list(
    name = "extract_file",
    arguments = list(
      path = "document.pdf",
      async = TRUE
    )
  )
)

request_file <- tempfile(fileext = ".json")
writeLines(jsonlite::toJSON(request, auto_unbox = TRUE), con = request_file)

# stdout = TRUE returns the server's output as a character vector of lines.
response_lines <- system2(
  "kreuzberg",
  args = "mcp",
  stdin = request_file,
  stdout = TRUE
)
unlink(request_file)

# Keep only the first line of output, matching a one-line read of the reply.
response_line <- head(response_lines, 1L)
cat(response_line, "\n")
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> The R bindings have no MCP client; this snippet drives the MCP CLI over stdio. Requires the `jsonlite` package.
|
||||
11
docs/snippets/r/mcp/mcp_server_start.md
Normal file
11
docs/snippets/r/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```r title="R"
|
||||
# MCP is served by the standalone kreuzberg CLI (a Rust binary), not by the
# kreuzberg R bindings. Launch the CLI from the same R session that uses the
# kreuzberg package for in-process extraction.
#
# system2() waits for the command by default, so the check below only runs
# once the server process has ended.
exit_status <- system2("kreuzberg", args = "mcp", stdout = "", stderr = "")
server_failed <- exit_status != 0L
if (server_failed) {
  stop(sprintf("MCP server exited with status %d", exit_status))
}
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> The R bindings expose extraction primitives only; MCP transport requires the standalone kreuzberg CLI.
|
||||
23
docs/snippets/r/metadata/language_detection.md
Normal file
23
docs/snippets/r/metadata/language_detection.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
result <- extract_file_sync("document.pdf")
|
||||
|
||||
cat("Language Detection Results:\n\n")
|
||||
|
||||
cat("Using direct field access:\n")
|
||||
cat("Detected Language:", result$detected_language, "\n\n")
|
||||
|
||||
cat("Using S3 helper function:\n")
|
||||
lang <- detected_language(result)
|
||||
cat("Language (via helper):", lang, "\n\n")
|
||||
|
||||
cat("Language Information:\n")
|
||||
# `lang` can be empty when no language was reported for the document.
# Comparing an empty value with `==` inside `if` stops with
# "argument is of length zero", so handle that case before branching.
if (length(lang) != 1L || is.na(lang)) {
  cat("No single language was detected\n")
} else if (lang == "en") {
  cat("This is an English document\n")
} else if (lang == "es") {
  cat("This is a Spanish document\n")
} else {
  cat(sprintf("This is a %s document\n", lang))
}
|
||||
```
|
||||
13
docs/snippets/r/metadata/language_detection_multilingual.md
Normal file
13
docs/snippets/r/metadata/language_detection_multilingual.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
files <- c("english.pdf", "spanish.pdf", "french.pdf")
|
||||
config <- list(language_detection = list(enabled = TRUE))
|
||||
|
||||
for (file in files) {
|
||||
json <- extract_file_sync(file, "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("%s: detected language = %s\n",
|
||||
file, result$detected_language))
|
||||
}
|
||||
```
|
||||
25
docs/snippets/r/metadata/metadata.md
Normal file
25
docs/snippets/r/metadata/metadata.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
result <- extract_file_sync("document.pdf")
|
||||
|
||||
cat("Detected Language:", result$detected_language, "\n")
|
||||
cat("Quality Score:", result$quality_score, "\n")
|
||||
cat("Keywords:", paste(result$keywords, collapse=", "), "\n\n")
|
||||
|
||||
cat("Metadata fields:\n")
|
||||
authors <- metadata_field(result, "authors")
|
||||
if (!is.null(authors)) {
|
||||
cat("Authors:", paste(authors, collapse=", "), "\n")
|
||||
}
|
||||
|
||||
created <- metadata_field(result, "created_date")
|
||||
if (!is.null(created)) {
|
||||
cat("Created Date:", created, "\n")
|
||||
}
|
||||
|
||||
pages_meta <- metadata_field(result, "page_count")
|
||||
if (!is.null(pages_meta)) {
|
||||
cat("Pages:", pages_meta, "\n")
|
||||
}
|
||||
```
|
||||
22
docs/snippets/r/metadata/page_boundaries.md
Normal file
22
docs/snippets/r/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
result <- extract_file_sync("document.pdf")
|
||||
|
||||
# Per-page byte ranges into `result$content`, read from the result metadata.
boundaries <- result$metadata$pages$boundaries

if (!is.null(boundaries) && length(boundaries) > 0L) {
  # The ranges are byte offsets, so slice raw bytes rather than characters.
  content_bytes <- charToRaw(result$content)

  # Show at most the first three pages.
  for (i in seq_len(min(3L, length(boundaries)))) {
    boundary <- boundaries[[i]]

    # `byte_start` is treated as a 0-based offset and `byte_end` as exclusive,
    # so the page occupies 1-based positions start + 1 ... end.
    # seq_len() gives an empty index for an empty page, whereas
    # `(start + 1):end` would count backwards and pick up unrelated bytes.
    page_length <- max(boundary$byte_end - boundary$byte_start, 0L)
    page_index <- boundary$byte_start + seq_len(page_length)
    page_text <- rawToChar(content_bytes[page_index])
    preview_end <- min(100L, nchar(page_text))

    cat(sprintf("Page %d:\n", boundary$page_number))
    cat(sprintf(
      "  Byte range: %d-%d\n",
      boundary$byte_start, boundary$byte_end
    ))
    cat(sprintf("  Preview: %s...\n", substr(page_text, 1L, preview_end)))
  }
}
|
||||
```
|
||||
20
docs/snippets/r/metadata/page_tracking_basic.md
Normal file
20
docs/snippets/r/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
result <- extract_file_sync("document.pdf")
|
||||
|
||||
cat("Total pages:", page_count(result), "\n\n")
|
||||
|
||||
for (i in seq_along(result$pages)) {
|
||||
page <- result$pages[[i]]
|
||||
cat(sprintf("Page %d:\n", i))
|
||||
cat(" Elements:", length(page$elements), "\n")
|
||||
cat(" Text content length:", nchar(page$content), "chars\n")
|
||||
|
||||
if (nchar(page$content) > 0L) {
|
||||
preview <- substr(page$content, 1L, 100L)
|
||||
cat(sprintf(" Preview: %s...\n", preview))
|
||||
}
|
||||
cat("\n")
|
||||
}
|
||||
```
|
||||
22
docs/snippets/r/metadata/tables.md
Normal file
22
docs/snippets/r/metadata/tables.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
result <- extract_file_sync("spreadsheet.xlsx")
|
||||
|
||||
cat("Tables extracted:", length(result$tables), "\n\n")
|
||||
|
||||
for (i in seq_along(result$tables)) {
|
||||
table <- result$tables[[i]]
|
||||
cat(sprintf("Table %d:\n", i))
|
||||
cat(" Rows:", nrow(table), "\n")
|
||||
cat(" Columns:", ncol(table), "\n")
|
||||
cat(" Column names:", paste(colnames(table), collapse=", "), "\n")
|
||||
cat("\n")
|
||||
|
||||
if (nrow(table) > 0L) {
|
||||
cat(" Preview (first 3 rows):\n")
|
||||
print(head(table, 3L))
|
||||
cat("\n")
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/r/metadata/vector_database_integration.md
Normal file
24
docs/snippets/r/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(max_characters = 1000L, overlap = 200L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
for (i in seq_len(min(3L, length(result$chunks)))) {
|
||||
chunk <- result$chunks[[i]]
|
||||
vector_doc <- list(
|
||||
id = sprintf("doc_%d", i),
|
||||
text = chunk,
|
||||
metadata = list(
|
||||
source = "document.pdf",
|
||||
chunk_index = i,
|
||||
length = nchar(chunk)
|
||||
)
|
||||
)
|
||||
cat(sprintf("Vector DB entry %d: %d chars\n", i, nchar(chunk)))
|
||||
}
|
||||
```
|
||||
19
docs/snippets/r/ocr/cloud_ocr_backend.md
Normal file
19
docs/snippets/r/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Stub OCR backend: announces the image being handled and returns placeholder
# text for it. `language` is accepted but not used by this stub.
custom_ocr_backend <- function(image_path, language) {
  progress_line <- sprintf("Processing image: %s\n", image_path)
  cat(progress_line)

  sprintf("Extracted text from %s", image_path)
}
|
||||
|
||||
register_ocr_backend("custom_cloud", custom_ocr_backend)
|
||||
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "custom_cloud", language = "en")
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
cat(sprintf("Custom backend result: %d chars\n", nchar(result$content)))
|
||||
```
|
||||
16
docs/snippets/r/ocr/image_extraction.md
Normal file
16
docs/snippets/r/ocr/image_extraction.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "tesseract", language = "eng")
|
||||
)
|
||||
|
||||
json <- extract_file_sync("scan.png", "image/png", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat("Image extraction via OCR:\n")
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("Mime type: %s\n", result$mime_type))
|
||||
cat(sprintf("Detected language: %s\n", result$detected_language))
|
||||
```
|
||||
16
docs/snippets/r/ocr/image_preprocessing.md
Normal file
16
docs/snippets/r/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "tesseract", language = "eng"),
|
||||
enable_quality_processing = TRUE
|
||||
)
|
||||
|
||||
json <- extract_file_sync("scan.png", "image/png", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
# Fall back to 0 when no quality score is present. Written with is.null()
# instead of `%||%`, which base R only provides from version 4.4.0.
quality <- if (is.null(result$quality_score)) 0 else result$quality_score
cat(sprintf(
  "Quality: %.2f, Length: %d\n",
  quality,
  nchar(result$content)
))
|
||||
```
|
||||
16
docs/snippets/r/ocr/ocr_easyocr.md
Normal file
16
docs/snippets/r/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Note: EasyOCR backend requires Python to be installed
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "easyocr", language = "en")
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat("EasyOCR extraction:\n")
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("Detected language: %s\n", result$detected_language))
|
||||
```
|
||||
23
docs/snippets/r/ocr/ocr_elements.md
Normal file
23
docs/snippets/r/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Enable structured OCR elements alongside text extraction
|
||||
config <- list(
|
||||
ocr = list(
|
||||
backend = "paddleocr",
|
||||
language = "en",
|
||||
element_config = list(include_elements = TRUE)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("scanned.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
if (!is.null(result$ocr_elements)) {
|
||||
for (element in result$ocr_elements) {
|
||||
cat(sprintf("Text: %s\n", element$text))
|
||||
cat(sprintf("Confidence: %.2f\n", element$confidence$recognition))
|
||||
cat("\n")
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/r/ocr/ocr_extraction.md
Normal file
17
docs/snippets/r/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Configure Tesseract OCR
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "tesseract", language = "eng")
|
||||
)
|
||||
|
||||
# Extract text from a scanned image
|
||||
json <- extract_file_sync("scan.png", "image/png", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
|
||||
cat("Content preview:\n")
|
||||
cat(substr(result$content, 1, 200))
|
||||
```
|
||||
12
docs/snippets/r/ocr/ocr_force_all_pages.md
Normal file
12
docs/snippets/r/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(force_ocr = TRUE)
|
||||
|
||||
json <- extract_file_sync("multipage_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Total pages: %d\n", length(result$pages)))
|
||||
cat(sprintf("Content extracted via OCR: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("Detected language: %s\n", result$detected_language))
|
||||
```
|
||||
18
docs/snippets/r/ocr/ocr_multi_language.md
Normal file
18
docs/snippets/r/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Configure multi-language OCR (English, French, German)
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "tesseract", language = "eng+fra+deu")
|
||||
)
|
||||
|
||||
# Extract from a multilingual document
|
||||
json <- extract_file_sync("multilingual.png", "image/png", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Detected language: %s\n", result$detected_language))
|
||||
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
|
||||
cat("Content preview:\n")
|
||||
cat(substr(result$content, 1, 200))
|
||||
```
|
||||
18
docs/snippets/r/ocr/ocr_paddleocr.md
Normal file
18
docs/snippets/r/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Configure PaddleOCR backend (defaults to mobile tier)
|
||||
config <- list(
|
||||
force_ocr = TRUE,
|
||||
ocr = list(backend = "paddle-ocr", language = "en")
|
||||
)
|
||||
|
||||
# Extract text from an image using PaddleOCR
|
||||
json <- extract_file_sync("document.jpg", "image/jpeg", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
cat("Content preview:\n")
|
||||
cat(substr(result$content, 1, 200))
|
||||
```
|
||||
19
docs/snippets/r/plugins/clear_plugins.md
Normal file
19
docs/snippets/r/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Clear all custom OCR backends
|
||||
clear_ocr_backends()
|
||||
cat("OCR backends cleared\n")
|
||||
|
||||
# Clear all custom validators
|
||||
clear_validators()
|
||||
cat("Validators cleared\n")
|
||||
|
||||
# Clear all custom post-processors
|
||||
clear_post_processors()
|
||||
cat("Post-processors cleared\n")
|
||||
|
||||
# Clear all custom document extractors
|
||||
clear_document_extractors()
|
||||
cat("Document extractors cleared\n")
|
||||
```
|
||||
15
docs/snippets/r/plugins/custom_ocr_backend.md
Normal file
15
docs/snippets/r/plugins/custom_ocr_backend.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# List available OCR backends
|
||||
backends <- list_ocr_backends()
|
||||
cat("Available backends:", paste(backends, collapse = ", "), "\n")
|
||||
|
||||
# List registered post-processors
|
||||
processors <- list_post_processors()
|
||||
cat("Post-processors:", paste(processors, collapse = ", "), "\n")
|
||||
|
||||
# Clear all custom registrations
|
||||
clear_post_processors()
|
||||
clear_validators()
|
||||
```
|
||||
29
docs/snippets/r/plugins/embedding_backend.md
Normal file
29
docs/snippets/r/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Adapter around an embedder that is already loaded in the host session (for
# example an ONNX session), so kreuzberg can call back into it during chunking
# and for standalone embed requests.
my_embedder <- list(
  name = "my-embedder",
  version = "1.0.0",
  dimensions = 768L,
  embed = function(texts) {
    # Stand-in for the host model: one all-zero vector of 768 doubles per text.
    lapply(texts, function(text) numeric(768L))
  }
)
|
||||
|
||||
register_embedding_backend(my_embedder)
|
||||
|
||||
config <- list(
|
||||
embedding = list(
|
||||
model = list(type = "plugin", name = "my-embedder"),
|
||||
max_embed_duration_secs = 30L
|
||||
)
|
||||
)
|
||||
|
||||
vectors <- embed_texts(c("Hello, world!", "Second text"), config)
|
||||
cat(sprintf("Generated %d embedding vectors\n", length(vectors)))
|
||||
```
|
||||
20
docs/snippets/r/plugins/extractor_registration.md
Normal file
20
docs/snippets/r/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Toy extractor: instead of reading the file it builds a one-line description
# from its arguments and echoes back the MIME type it was called with.
custom_extractor <- function(path, mime_type) {
  extraction <- list(
    content = sprintf("Extracted from %s (%s)", path, mime_type),
    mime_type = mime_type,
    pages = 1L
  )
  extraction
}
|
||||
|
||||
register_document_extractor("custom_format", custom_extractor)
|
||||
|
||||
result <- extract_file_sync("custom_document.xyz", "application/custom", NULL)
|
||||
|
||||
cat(sprintf("Custom extractor result:\n"))
|
||||
cat(sprintf("Content: %s\n", result$content))
|
||||
cat(sprintf("Mime type: %s\n", result$mime_type))
|
||||
```
|
||||
15
docs/snippets/r/plugins/list_plugins.md
Normal file
15
docs/snippets/r/plugins/list_plugins.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
ocr_backends <- list_ocr_backends()
|
||||
cat(sprintf("OCR backends: %s\n", paste(ocr_backends, collapse=", ")))
|
||||
|
||||
validators <- list_validators()
|
||||
cat(sprintf("Validators: %s\n", paste(validators, collapse=", ")))
|
||||
|
||||
post_processors <- list_post_processors()
|
||||
cat(sprintf("Post-processors: %s\n", paste(post_processors, collapse=", ")))
|
||||
|
||||
extractors <- list_document_extractors()
|
||||
cat(sprintf("Document extractors: %s\n", paste(extractors, collapse=", ")))
|
||||
```
|
||||
27
docs/snippets/r/plugins/min_length_validator.md
Normal file
27
docs/snippets/r/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,27 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Validator that rejects extraction results whose text is shorter than 50
# characters.
#
# `result` is read through `result$content`. Returns a list with `valid`
# (TRUE or FALSE) and a human-readable `message`.
min_length_validator <- function(result) {
  min_length <- 50L

  # A missing or NA `content` counts as zero characters. Without this guard
  # nchar(NULL) yields integer(0) and the comparison below stops with
  # "argument is of length zero" instead of reporting a failed validation.
  content <- result$content
  content_length <- if (length(content) == 0L || anyNA(content)) {
    0L
  } else {
    nchar(content)
  }

  if (content_length < min_length) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Content too short: %d < %d characters",
        content_length, min_length
      )
    ))
  }

  list(valid = TRUE, message = "Content length validation passed")
}
|
||||
|
||||
register_validator("min_length", min_length_validator)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
22
docs/snippets/r/plugins/pdf_metadata_extractor.md
Normal file
22
docs/snippets/r/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor that prints every top-level metadata entry of an extraction
# result and then hands the result back unchanged.
extract_pdf_metadata <- function(result) {
  metadata <- result$metadata

  if (!is.null(metadata)) {
    cat("PDF Metadata:\n")
    for (key in names(metadata)) {
      # An entry may hold several values or a nested list. Flatten and join
      # them so each key prints on exactly one line; handing a multi-valued
      # entry straight to sprintf() would emit one fragment per value.
      value <- paste(unlist(metadata[[key]]), collapse = ", ")
      cat(sprintf("  %s: %s\n", key, value))
    }
  }

  result
}
|
||||
|
||||
register_post_processor("pdf_metadata", extract_pdf_metadata)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Extraction complete: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
21
docs/snippets/r/plugins/pdf_only_processor.md
Normal file
21
docs/snippets/r/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor skeleton intended to act on PDF results only. A result whose
# MIME type is missing or is not "application/pdf" is returned immediately;
# this example performs no further work for PDFs either.
pdf_only_processor <- function(result) {
  mime <- result$mime_type
  is_pdf <- !is.null(mime) && mime == "application/pdf"

  # Gate the processor so it only runs for PDF documents.
  if (!is_pdf) {
    return(result)
  }

  result
}
|
||||
|
||||
register_post_processor("pdf_only", pdf_only_processor)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Processed PDF: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
25
docs/snippets/r/plugins/plugin_extractor.md
Normal file
25
docs/snippets/r/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,25 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Example extractor for JSON files: reads the file, parses it with jsonlite,
# and flattens every value into newline-separated text. The `mime_type`
# argument is not consulted; the result is always tagged "application/json".
custom_json_extractor <- function(path, mime_type) {
  json_text <- paste(readLines(path, warn = FALSE), collapse = "\n")
  values <- unlist(jsonlite::fromJSON(json_text))

  list(
    content = paste(values, collapse = "\n"),
    mime_type = "application/json",
    pages = 1L,
    metadata = list(extractor = "custom-json-extractor")
  )
}
|
||||
|
||||
register_document_extractor("custom-json-extractor", custom_json_extractor)
|
||||
|
||||
result <- extract_file_sync("data.json", "application/json", NULL)
|
||||
|
||||
cat(sprintf("Extracted %d characters from JSON\n", nchar(result$content)))
|
||||
```
|
||||
30
docs/snippets/r/plugins/plugin_logging.md
Normal file
30
docs/snippets/r/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,30 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor that logs the MIME type and content size of a result through
# message() and returns the result untouched.
logging_processor <- function(result) {
  # Spelled out instead of using `%||%`: that operator is only part of base R
  # from 4.4.0 onwards, and this snippet attaches no package known to supply it.
  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type

  message(sprintf(
    "[plugin] processing mime=%s content_chars=%d",
    mime, nchar(result$content)
  ))
  result
}
|
||||
|
||||
# Validator that logs the MIME type of the result through message() and always
# reports the result as valid.
logging_validator <- function(result) {
  # Spelled out instead of using `%||%`: that operator is only part of base R
  # from 4.4.0 onwards, and this snippet attaches no package known to supply it.
  mime <- if (is.null(result$mime_type)) "unknown" else result$mime_type

  message(sprintf("[plugin] validating mime=%s", mime))
  list(valid = TRUE, message = "ok")
}
|
||||
|
||||
register_post_processor("logging_processor", logging_processor)
|
||||
register_validator("logging_validator", logging_validator)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Done: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
34
docs/snippets/r/plugins/plugin_testing.md
Normal file
34
docs/snippets/r/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,34 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
library(testthat)
|
||||
|
||||
uppercase_processor <- function(result) {
|
||||
result$content <- toupper(result$content)
|
||||
return(result)
|
||||
}
|
||||
|
||||
test_that("uppercase processor uppercases content", {
|
||||
fake_result <- list(
|
||||
content = "hello world",
|
||||
mime_type = "text/plain",
|
||||
metadata = list()
|
||||
)
|
||||
processed <- uppercase_processor(fake_result)
|
||||
expect_equal(processed$content, "HELLO WORLD")
|
||||
})
|
||||
|
||||
test_that("post processor registers and runs", {
|
||||
register_post_processor("uppercase", uppercase_processor)
|
||||
on.exit(unregister_post_processor("uppercase"), add = TRUE)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_bytes_sync(
|
||||
charToRaw("hello world"), "text/plain", config
|
||||
)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
expect_match(result$content, "HELLO WORLD", fixed = TRUE)
|
||||
})
|
||||
```
|
||||
23
docs/snippets/r/plugins/plugin_validator.md
Normal file
23
docs/snippets/r/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
min_content_validator <- function(result) {
|
||||
min_length <- 100L
|
||||
if (nchar(result$content) < min_length) {
|
||||
return(list(
|
||||
valid = FALSE,
|
||||
message = sprintf("Content too short: %d < %d",
|
||||
nchar(result$content), min_length)
|
||||
))
|
||||
}
|
||||
return(list(valid = TRUE, message = "Content validation passed"))
|
||||
}
|
||||
|
||||
register_validator("min_content", min_content_validator)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
29
docs/snippets/r/plugins/quality_score_validator.md
Normal file
29
docs/snippets/r/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,29 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Validator that rejects results whose `metadata$quality_score` is below 0.5.
#
# result: list whose `metadata` may contain a numeric `quality_score`.
# Returns list(valid = <logical>, message = <character>).
quality_score_validator <- function(result) {
  min_score <- 0.5

  # Explicit NULL check instead of `%||%`: base R only ships that operator
  # from version 4.4, and this snippet attaches no package that provides it.
  raw_score <- result$metadata$quality_score
  score <- if (is.null(raw_score)) 0 else as.numeric(raw_score)

  # A missing, empty, or NA score is treated as 0 so the comparison below
  # never receives NA or a zero-length value (either would make `if` fail).
  if (length(score) != 1L || is.na(score)) {
    score <- 0
  }

  if (score < min_score) {
    return(list(
      valid = FALSE,
      message = sprintf(
        "Quality score too low: %.2f < %.2f",
        score, min_score
      )
    ))
  }
  return(list(valid = TRUE, message = "Quality score validation passed"))
}
|
||||
|
||||
register_validator("quality_score", quality_score_validator)
|
||||
|
||||
config <- ExtractionConfig$default()
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Validated extraction: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
27
docs/snippets/r/plugins/stateful_plugin.md
Normal file
27
docs/snippets/r/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,27 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Encapsulate mutable counter state in an environment so the plugin function
|
||||
# can update it across calls.
|
||||
make_stateful_plugin <- function() {
|
||||
state <- new.env(parent = emptyenv())
|
||||
state$count <- 0L
|
||||
|
||||
process <- function(result) {
|
||||
state$count <- state$count + 1L
|
||||
return(result)
|
||||
}
|
||||
|
||||
list(process = process, count = function() state$count)
|
||||
}
|
||||
|
||||
plugin <- make_stateful_plugin()
|
||||
register_post_processor("stateful_counter", plugin$process)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
extract_file_sync("document.pdf", "application/pdf", config)
|
||||
|
||||
cat(sprintf("Processed: %d\n", plugin$count()))
|
||||
```
|
||||
11
docs/snippets/r/plugins/unregister_plugins.md
Normal file
11
docs/snippets/r/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Remove plugins by their registered name.
|
||||
unregister_post_processor("metadata_enrichment")
|
||||
unregister_validator("min_length")
|
||||
unregister_ocr_backend("custom_ocr_backend")
|
||||
unregister_document_extractor("custom_format")
|
||||
```
|
||||
20
docs/snippets/r/plugins/word_count_processor.md
Normal file
20
docs/snippets/r/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Post-processor that stores the number of whitespace-separated words of
# `result$content` under `result$metadata$word_count`.
#
# result: list with a character `content` field and an optional `metadata`
#   list.
# Returns `result` with `metadata$word_count` set to an integer.
word_count_processor <- function(result) {
  tokens <- strsplit(result$content, "\\s+")[[1]]
  # strsplit() yields an empty first token when the text starts with
  # whitespace; count only non-empty tokens so that is not reported as a word.
  word_count <- sum(nzchar(tokens))

  # Assign by name so running the processor twice overwrites the entry
  # instead of appending a second `word_count` element.
  result$metadata$word_count <- word_count
  return(result)
}
|
||||
|
||||
register_post_processor("word_count", word_count_processor)
|
||||
|
||||
config <- list(postprocessor = list(enabled = TRUE))
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Word count: %d\n", result$metadata$word_count))
|
||||
```
|
||||
15
docs/snippets/r/utils/chunking.md
Normal file
15
docs/snippets/r/utils/chunking.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(max_characters = 1000L, overlap = 200L)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Total chunks: %d\n", length(result$chunks)))
|
||||
for (i in seq_len(min(5L, length(result$chunks)))) {
|
||||
# Each chunk is a list (content, embedding, metadata), so measure its text.
cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]]$content)))
|
||||
}
|
||||
```
|
||||
25
docs/snippets/r/utils/chunking_rag.md
Normal file
25
docs/snippets/r/utils/chunking_rag.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 500L,
|
||||
overlap = 50L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
for (i in seq_along(result$chunks)) {
|
||||
chunk <- result$chunks[[i]]
|
||||
cat(sprintf("Chunk %d/%d\n", i, length(result$chunks)))
|
||||
if (!is.null(chunk$embedding)) {
|
||||
cat(sprintf(" Embedding: %d dimensions\n", length(chunk$embedding)))
|
||||
}
|
||||
}
|
||||
```
|
||||
20
docs/snippets/r/utils/embedding_with_chunking.md
Normal file
20
docs/snippets/r/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 1024L,
|
||||
overlap = 100L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE,
|
||||
batch_size = 32L
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Chunks with embeddings: %d\n", length(result$chunks)))
|
||||
```
|
||||
21
docs/snippets/r/utils/keyword_extraction_example.md
Normal file
21
docs/snippets/r/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
keywords = list(
|
||||
algorithm = "yake",
|
||||
max_keywords = 10L,
|
||||
min_score = 0.3
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("research_paper.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
if (!is.null(result$metadata$keywords)) {
|
||||
for (kw in result$metadata$keywords) {
|
||||
cat(sprintf(" - %s\n", kw))
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/r/utils/quality_processing_example.md
Normal file
15
docs/snippets/r/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(enable_quality_processing = TRUE)
|
||||
json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
if (!is.null(result$quality_score)) {
|
||||
cat(sprintf("Quality score: %.2f\n", result$quality_score))
|
||||
if (result$quality_score < 0.5) {
|
||||
cat("Warning: low quality extraction\n")
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/r/utils/standalone_embed.md
Normal file
15
docs/snippets/r/utils/standalone_embed.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE
|
||||
)
|
||||
|
||||
texts <- c("Hello, world!", "Kreuzberg is fast")
|
||||
embeddings <- embed_texts(texts, config)
|
||||
|
||||
stopifnot(length(embeddings) == 2L)
|
||||
cat(sprintf("Embedding 1: %d dimensions\n", length(embeddings[[1]])))
|
||||
cat(sprintf("Embedding 2: %d dimensions\n", length(embeddings[[2]])))
|
||||
```
|
||||
15
docs/snippets/r/utils/token_reduction.md
Normal file
15
docs/snippets/r/utils/token_reduction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(
|
||||
mode = "moderate",
|
||||
preserve_important_words = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(result$content)
|
||||
```
|
||||
16
docs/snippets/r/utils/token_reduction_example.md
Normal file
16
docs/snippets/r/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
config <- list(
|
||||
token_reduction = list(
|
||||
mode = "moderate",
|
||||
preserve_important_words = TRUE
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("verbose_document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
cat(sprintf("Reduced content length: %d characters\n", nchar(result$content)))
|
||||
cat(sprintf("MIME type: %s\n", result$mime_type))
|
||||
```
|
||||
26
docs/snippets/r/utils/vector_database_integration.md
Normal file
26
docs/snippets/r/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
document_id <- "doc-001"
|
||||
|
||||
config <- list(
|
||||
chunking = list(
|
||||
max_characters = 512L,
|
||||
overlap = 50L,
|
||||
embedding = list(
|
||||
model = list(type = "preset", name = "balanced"),
|
||||
normalize = TRUE,
|
||||
batch_size = 32L
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
json <- extract_file_sync("document.pdf", "application/pdf", config)
|
||||
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
|
||||
|
||||
# Each chunk has $content, $embedding, and $metadata. Pass these directly
|
||||
# to a vector database client (pgvector, Qdrant, Pinecone, etc.) along with
|
||||
# the document_id stored as a metadata field.
|
||||
cat(sprintf("document_id: %s\n", document_id))
|
||||
cat(sprintf("chunks ready for upsert: %d\n", length(result$chunks)))
|
||||
```
|
||||
Reference in New Issue
Block a user