This commit is contained in:
18
docs/snippets/r/config/advanced_config.md
Normal file
18
docs/snippets/r/config/advanced_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Build the nested option groups first, then combine them into one config.
ocr_options <- list(backend = "tesseract", language = "eng")
chunking_options <- list(max_characters = 1500L, overlap = 300L)

config <- list(
  ocr = ocr_options,
  chunking = chunking_options,
  output_format = "markdown",
  include_document_structure = TRUE,
  force_ocr = TRUE
)

# The extraction output is parsed as JSON; simplifyVector = FALSE keeps JSON
# arrays as R lists instead of collapsing them into atomic vectors.
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunk_count <- length(result$chunks)
cat(sprintf("Format: %s\n", result$mime_type))
cat(sprintf("Chunks: %d\n", chunk_count))
# The precision in "%.50s" truncates the printed content to a short preview.
cat(sprintf("Content preview: %.50s...\n", result$content))
|
||||
```
|
||||
50
docs/snippets/r/config/chunking_config.md
Normal file
50
docs/snippets/r/config/chunking_config.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Example 1: Basic character-based chunking
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

num_chunks <- length(result$chunks)
cat(sprintf("Document split into %d chunks\n", num_chunks))

# Report the size of (at most) the first three chunks.
for (i in seq_len(min(3L, num_chunks))) {
  chunk <- result$chunks[[i]]
  # With simplifyVector = FALSE a JSON object is parsed into a named list.
  # Calling nchar() on such a list yields one value per field, which makes
  # sprintf() emit several lines per chunk, so measure the text field only.
  # NOTE(review): assumes chunk objects expose their text as `content`;
  # plain-string chunks are still handled by the fallback branch.
  chunk_text <- if (is.list(chunk)) chunk$content else chunk
  cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunk_text)))
}
|
||||
```
|
||||
|
||||
```r title="R - Markdown chunker with token-based sizing"
|
||||
library(kreuzberg)
|
||||
|
||||
# Sizing options for the markdown chunker: tokenizer-based, with the
# tokenizer model given by name.
token_sizing <- list(type = "tokenizer", model = "Xenova/gpt-4o")

config <- list(
  chunking = list(chunker_type = "markdown", sizing = token_sizing)
)

json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunk_count <- length(result$chunks)
cat(sprintf("Markdown document split into %d chunks\n", chunk_count))
|
||||
```
|
||||
|
||||
```r title="R - Prepend heading context"
|
||||
library(kreuzberg)
|
||||
|
||||
# Markdown chunker with `prepend_heading_context` switched on.
heading_chunking <- list(
  chunker_type = "markdown",
  prepend_heading_context = TRUE
)
config <- list(chunking = heading_chunking)

json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunk_count <- length(result$chunks)
cat(sprintf(
  "Document split into %d chunks with prepended headings\n",
  chunk_count
))
|
||||
```
|
||||
33
docs/snippets/r/config/chunking_configuration.md
Normal file
33
docs/snippets/r/config/chunking_configuration.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# Chunking setup for RAG pipelines: chunk size limit and overlap.
chunking_options <- list(max_characters = 1000L, overlap = 200L)
config <- list(chunking = chunking_options)

# Parse the JSON output into nested lists.
json <- extract_file_sync("large_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunk_count <- length(result$chunks)
cat("Number of chunks:", chunk_count, "\n")
|
||||
```
|
||||
|
||||
```r title="R - Prepend Heading Context"
|
||||
library(kreuzberg)
|
||||
|
||||
# Structured documents: markdown chunker with heading context prepended to
# the chunk content.
markdown_chunking <- list(
  chunker_type = "markdown",
  max_characters = 500L,
  overlap = 50L,
  prepend_heading_context = TRUE
)
config <- list(chunking = markdown_chunking)

json <- extract_file_sync("document.md", "text/markdown", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunk_count <- length(result$chunks)
cat("Number of chunks:", chunk_count, "\n")
|
||||
```
|
||||
15
docs/snippets/r/config/config_basic.md
Normal file
15
docs/snippets/r/config/config_basic.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Minimal config: only the output format is set.
config <- list(output_format = "markdown")

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("MIME type: %s\n", result$mime_type))

content_length <- nchar(result$content)
cat(sprintf("Content length: %d characters\n", content_length))

# Show the first 200 characters of the extracted content.
cat("Content preview:\n")
preview <- substr(result$content, 1, 200)
cat(preview)
|
||||
```
|
||||
11
docs/snippets/r/config/config_discover.md
Normal file
11
docs/snippets/r/config/config_discover.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Read kreuzberg.json, join its lines into one newline-separated string,
# build the config from that JSON, and pass it to extract_file_sync().
config_lines <- readLines("kreuzberg.json")
config_json <- paste(config_lines, collapse = "\n")
config <- ExtractionConfig$from_json(config_json)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

content_length <- nchar(result$content)
cat(sprintf("Extracted %d characters\n", content_length))
|
||||
```
|
||||
14
docs/snippets/r/config/config_ocr.md
Normal file
14
docs/snippets/r/config/config_ocr.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# OCR config: `force_ocr` enabled, Tesseract backend, language "eng".
config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng")
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted content length: %d\n", nchar(result$content)))

# sprintf() returns character(0) when an argument has length zero, so an
# absent language field would make the line silently disappear, and a
# multi-element value would print one line per element. Collapse whatever is
# present into a single label and fall back to "unknown".
# NOTE(review): this config does not enable language detection, so the field
# may well be absent here; confirm against the result schema.
languages <- unlist(result$detected_language)
language_label <- if (length(languages) > 0) {
  paste(languages, collapse = ", ")
} else {
  "unknown"
}
cat(sprintf("Detected language: %s\n", language_label))
|
||||
```
|
||||
20
docs/snippets/r/config/config_programmatic.md
Normal file
20
docs/snippets/r/config/config_programmatic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Define each option group on its own, then assemble the full config.
ocr_options <- list(backend = "tesseract", language = "eng")
chunking_options <- list(max_characters = 2000L, overlap = 300L)

config <- list(
  force_ocr = TRUE,
  ocr = ocr_options,
  chunking = chunking_options,
  output_format = "markdown"
)

# Parse the JSON output into nested lists.
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

mime_type <- result$mime_type
cat(sprintf("MIME type: %s\n", mime_type))
|
||||
```
|
||||
21
docs/snippets/r/config/document_structure_config.md
Normal file
21
docs/snippets/r/config/document_structure_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Request document structure alongside markdown output.
config <- list(include_document_structure = TRUE, output_format = "markdown")

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

pages <- result$pages
cat(sprintf("Total pages: %d\n", length(pages)))
cat(sprintf("MIME type: %s\n\n", result$mime_type))

# Print a short preview (first 100 characters) of every page.
for (page_number in seq_along(pages)) {
  cat(sprintf("Page %d structure:\n", page_number))
  preview <- substr(pages[[page_number]]$content, 1, 100)
  cat(sprintf(" Content: %s\n", preview))
  cat("\n")
}
|
||||
```
|
||||
20
docs/snippets/r/config/element_based_output.md
Normal file
20
docs/snippets/r/config/element_based_output.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Ask for the element-based result format with markdown output.
config <- list(result_format = "element_based", output_format = "markdown")

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

elements <- result$elements
cat(sprintf("Total elements: %d\n\n", length(elements)))

# Print the type and a 100-character preview of each element.
for (index in seq_along(elements)) {
  current <- elements[[index]]
  cat(sprintf("Element %d:\n", index))
  cat(sprintf(" Type: %s\n", current$element_type))
  preview <- substr(current$content, 1, 100)
  cat(sprintf(" Content: %s\n\n", preview))
}
|
||||
```
|
||||
20
docs/snippets/r/config/embedding_config.md
Normal file
20
docs/snippets/r/config/embedding_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Embedding options are nested inside the chunking section of the config.
embedding_options <- list(
  model = list(type = "preset", name = "balanced"),
  batch_size = 16L,
  normalize = TRUE,
  show_download_progress = TRUE
)

config <- list(
  chunking = list(
    max_characters = 1000L,
    overlap = 200L,
    embedding = embedding_options
  )
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

chunk_count <- length(result$chunks)
cat(sprintf("Chunks with embeddings: %d\n", chunk_count))
|
||||
```
|
||||
15
docs/snippets/r/config/html_output.md
Normal file
15
docs/snippets/r/config/html_output.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# HTML output with a named theme and `embed_css` enabled.
html_options <- list(theme = "git_hub", embed_css = TRUE)
config <- list(output_format = "html", html_output = html_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# HTML with kb-* classes
html <- result$content
cat(html)
|
||||
```
|
||||
17
docs/snippets/r/config/keyword_extraction_config.md
Normal file
17
docs/snippets/r/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Enable keyword extraction.
keyword_options <- list(enabled = TRUE)
config <- list(keywords = keyword_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

keywords <- result$keywords
cat(sprintf("Extracted %d keywords\n", length(keywords)))

# List at most the first five keywords; head() on an empty or missing list
# yields nothing to iterate over, so no separate emptiness check is needed.
for (keyword in head(keywords, 5L)) {
  cat(sprintf(" - %s\n", keyword))
}
|
||||
```
|
||||
13
docs/snippets/r/config/language_detection_config.md
Normal file
13
docs/snippets/r/config/language_detection_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Enable language detection.
config <- list(
  language_detection = list(enabled = TRUE)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) for a zero-length argument and one string
# per element for longer ones. Collapse the detected value(s) into a single
# label so exactly one line is printed, with "unknown" as the fallback.
# NOTE(review): the field may hold several languages; confirm against the
# result schema.
languages <- unlist(result$detected_language)
language_label <- if (length(languages) > 0) {
  paste(languages, collapse = ", ")
} else {
  "unknown"
}
cat(sprintf("Detected language: %s\n", language_label))
# The precision in "%.60s" truncates the printed content to a short preview.
cat(sprintf("Content preview: %.60s...\n", result$content))
|
||||
```
|
||||
16
docs/snippets/r/config/ocr_configuration.md
Normal file
16
docs/snippets/r/config/ocr_configuration.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r
|
||||
library(kreuzberg)
|
||||
|
||||
# OCR through the Tesseract backend, language string "eng+deu".
tesseract_options <- list(backend = "tesseract", language = "eng+deu")
config <- list(force_ocr = TRUE, ocr = tesseract_options)

json <- extract_file_sync("scanned_document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Print the extracted text.
extracted_text <- result$content
cat(extracted_text)
|
||||
```
|
||||
16
docs/snippets/r/config/ocr_dpi_config.md
Normal file
16
docs/snippets/r/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# The high-level config of the kreuzberg R bindings has no DPI setting for
# Tesseract OCR; the pipeline decides the PDF rasterization DPI.
# This snippet therefore just runs Tesseract OCR end-to-end on a PDF.
tesseract_options <- list(backend = "tesseract", language = "eng")
config <- list(force_ocr = TRUE, ocr = tesseract_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

character_count <- nchar(result$content)
cat(sprintf("Characters extracted: %d\n", character_count))
|
||||
```
|
||||
13
docs/snippets/r/config/pdf_config.md
Normal file
13
docs/snippets/r/config/pdf_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# PDF-specific options live under `pdf_options`.
pdf_options <- list(extract_images = TRUE, extract_metadata = TRUE)
config <- list(pdf_options = pdf_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

table_count <- length(result$tables)
cat(sprintf("Tables extracted: %d\n", table_count))
# The precision in "%.50s" truncates the printed content to a short preview.
cat(sprintf("Content preview: %.50s...\n", result$content))
|
||||
```
|
||||
19
docs/snippets/r/config/pdf_hierarchy_config.md
Normal file
19
docs/snippets/r/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Hierarchy options are nested inside the PDF options.
hierarchy_options <- list(
  enabled = TRUE,
  k_clusters = 6L,
  include_bbox = TRUE,
  ocr_coverage_threshold = 0.8
)

config <- list(
  pdf_options = list(
    extract_metadata = TRUE,
    hierarchy = hierarchy_options
  )
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

page_count <- length(result$pages)
cat(sprintf("Pages: %d\n", page_count))
|
||||
```
|
||||
13
docs/snippets/r/config/postprocessor_config.md
Normal file
13
docs/snippets/r/config/postprocessor_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Enable the postprocessor.
postprocessor_options <- list(enabled = TRUE)
config <- list(postprocessor = postprocessor_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

content_length <- nchar(result$content)
cat(sprintf("Content length: %d characters\n", content_length))
cat(sprintf("Mime type: %s\n", result$mime_type))
|
||||
```
|
||||
11
docs/snippets/r/config/quality_processing_config.md
Normal file
11
docs/snippets/r/config/quality_processing_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Switch on quality processing.
config <- list(
  enable_quality_processing = TRUE
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# Print the score with two decimals, then the content size.
cat(sprintf("Quality score: %.2f\n", result$quality_score))
content_length <- nchar(result$content)
cat(sprintf("Content length: %d characters\n", content_length))
|
||||
```
|
||||
17
docs/snippets/r/config/tesseract_config.md
Normal file
17
docs/snippets/r/config/tesseract_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Tesseract OCR config with `force_ocr` enabled and language "eng+deu".
config <- list(
  force_ocr = TRUE,
  ocr = list(
    backend = "tesseract",
    language = "eng+deu"
  )
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

# sprintf() returns character(0) when an argument has length zero, so an
# absent language field would make the line silently disappear, and a
# multi-element value would print one line per element. Collapse whatever is
# present into a single label and fall back to "unknown".
# NOTE(review): this config does not enable language detection, so the field
# may well be absent here; confirm against the result schema.
languages <- unlist(result$detected_language)
language_label <- if (length(languages) > 0) {
  paste(languages, collapse = ", ")
} else {
  "unknown"
}
cat(sprintf("Detected language: %s\n", language_label))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
|
||||
```
|
||||
13
docs/snippets/r/config/token_reduction_config.md
Normal file
13
docs/snippets/r/config/token_reduction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```r title="R"
|
||||
library(kreuzberg)
|
||||
|
||||
# Enable token reduction.
token_reduction_options <- list(enabled = TRUE)
config <- list(token_reduction = token_reduction_options)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

content_length <- nchar(result$content)
cat(sprintf("Original content length: %d characters\n", content_length))
# The precision in "%.60s" truncates the printed content to a short preview.
cat(sprintf("Content preview: %.60s...\n", result$content))
|
||||
```
|
||||
Reference in New Issue
Block a user