This commit is contained in:
50
docs/snippets/c/advanced/chunk_page_mapping.md
Normal file
50
docs/snippets/c/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 500,"
|
||||
"\"overlap\": 50"
|
||||
"},"
|
||||
"\"pages\": {"
|
||||
"\"extract_pages\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON, includes per-chunk first_page/last_page metadata):\n%s\n",
|
||||
chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
char *pages_json = kreuzberg_extraction_result_pages(result);
|
||||
printf("pages (JSON): %s\n", pages_json ? pages_json : "[]");
|
||||
kreuzberg_free_string(pages_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
43
docs/snippets/c/advanced/chunking_config.md
Normal file
43
docs/snippets/c/advanced/chunking_config.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"markdown\","
|
||||
"\"max_characters\": 500,"
|
||||
"\"overlap\": 50,"
|
||||
"\"prepend_heading_context\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON): %s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
47
docs/snippets/c/advanced/chunking_rag.md
Normal file
47
docs/snippets/c/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 500,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("research_paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON, each item includes content, embedding, and metadata.chunk_index/total_chunks/byte_start/byte_end):\n%s\n",
|
||||
chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
48
docs/snippets/c/advanced/embedding_with_chunking.md
Normal file
48
docs/snippets/c/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 1024,"
|
||||
"\"overlap\": 100,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true,"
|
||||
"\"batch_size\": 32,"
|
||||
"\"show_download_progress\": false"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks with embeddings (JSON):\n%s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/advanced/keyword_extraction_config.md
Normal file
44
docs/snippets/c/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"keywords\": {"
|
||||
"\"algorithm\": \"yake\","
|
||||
"\"max_keywords\": 10,"
|
||||
"\"min_score\": 0.3,"
|
||||
"\"ngram_range\": [1, 3],"
|
||||
"\"language\": \"en\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *keywords_json = kreuzberg_extraction_result_extracted_keywords(result);
|
||||
printf("keywords (JSON): %s\n", keywords_json ? keywords_json : "[]");
|
||||
kreuzberg_free_string(keywords_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
46
docs/snippets/c/advanced/keyword_extraction_example.md
Normal file
46
docs/snippets/c/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"keywords\": {"
|
||||
"\"algorithm\": \"yake\","
|
||||
"\"max_keywords\": 10,"
|
||||
"\"min_score\": 0.3"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("research_paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *keywords_json = kreuzberg_extraction_result_extracted_keywords(result);
|
||||
if (keywords_json) {
|
||||
printf("Keywords: %s\n", keywords_json);
|
||||
kreuzberg_free_string(keywords_json);
|
||||
} else {
|
||||
printf("Keywords: (none)\n");
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
42
docs/snippets/c/advanced/language_detection_config.md
Normal file
42
docs/snippets/c/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"language_detection\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"min_confidence\": 0.8,"
|
||||
"\"detect_multiple\": false"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *langs_json = kreuzberg_extraction_result_detected_languages(result);
|
||||
printf("detected languages (JSON): %s\n", langs_json ? langs_json : "[]");
|
||||
kreuzberg_free_string(langs_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
42
docs/snippets/c/advanced/language_detection_multilingual.md
Normal file
42
docs/snippets/c/advanced/language_detection_multilingual.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"language_detection\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"min_confidence\": 0.8,"
|
||||
"\"detect_multiple\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("multilingual_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *langs_json = kreuzberg_extraction_result_detected_languages(result);
|
||||
printf("Detected languages: %s\n", langs_json ? langs_json : "[]");
|
||||
kreuzberg_free_string(langs_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
37
docs/snippets/c/advanced/quality_processing_config.md
Normal file
37
docs/snippets/c/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"enable_quality_processing\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double score = kreuzberg_extraction_result_quality_score(result);
|
||||
printf("quality score: %.2f\n", score);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
45
docs/snippets/c/advanced/quality_processing_example.md
Normal file
45
docs/snippets/c/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"enable_quality_processing\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double score = kreuzberg_extraction_result_quality_score(result);
|
||||
if (score < 0.5) {
|
||||
printf("Warning: Low quality extraction (%.2f)\n", score);
|
||||
} else {
|
||||
printf("Quality score: %.2f\n", score);
|
||||
}
|
||||
|
||||
char *warnings_json = kreuzberg_extraction_result_processing_warnings(result);
|
||||
printf("processing warnings (JSON): %s\n", warnings_json ? warnings_json : "[]");
|
||||
kreuzberg_free_string(warnings_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/advanced/token_reduction_config.md
Normal file
41
docs/snippets/c/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("reduced content:\n%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/advanced/token_reduction_example.md
Normal file
44
docs/snippets/c/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("verbose_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
if (content) {
|
||||
printf("reduced content (%zu bytes):\n%s\n", strlen(content), content);
|
||||
kreuzberg_free_string(content);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
55
docs/snippets/c/advanced/vector_database_integration.md
Normal file
55
docs/snippets/c/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *document_path = "document.pdf";
|
||||
const char *document_id = "doc-001";
|
||||
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 512,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true,"
|
||||
"\"batch_size\": 32"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync(document_path, NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The chunks JSON array carries content + embedding + metadata for each
|
||||
chunk. Pass this directly to your vector database client (pgvector,
|
||||
Qdrant, Pinecone, etc.) along with the document_id as a metadata field. */
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("document_id: %s\n", document_id);
|
||||
printf("chunks (JSON, ready to upsert into a vector DB):\n%s\n",
|
||||
chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
33
docs/snippets/c/api/batch_extract_bytes_sync.md
Normal file
33
docs/snippets/c/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
/* Items is a JSON array of BatchBytesItem objects.
|
||||
* Each entry has "content" (array of byte integers), "mime_type", and an optional "config". */
|
||||
const char *items_json =
|
||||
"["
|
||||
" {\"content\": [72,101,108,108,111,33], \"mime_type\": \"text/plain\"},"
|
||||
" {\"content\": [87,111,114,108,100,33], \"mime_type\": \"text/plain\"}"
|
||||
"]";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
/* Returns a JSON array of ExtractionResult objects, or NULL on failure. */
|
||||
char *results_json =
|
||||
kreuzberg_batch_extract_bytes_sync(items_json, config);
|
||||
if (!results_json) {
|
||||
fprintf(stderr, "batch extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("%s\n", results_json);
|
||||
kreuzberg_free_string(results_json);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
34
docs/snippets/c/api/batch_extract_files_sync.md
Normal file
34
docs/snippets/c/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
/* Items is a JSON array of BatchFileItem objects.
|
||||
* Each entry has a "path" field and an optional "config" override. */
|
||||
const char *items_json =
|
||||
"["
|
||||
" {\"path\": \"doc1.pdf\"},"
|
||||
" {\"path\": \"doc2.docx\"},"
|
||||
" {\"path\": \"scan.png\", \"config\": {\"force_ocr\": true}}"
|
||||
"]";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
/* Returns a JSON array of ExtractionResult objects, or NULL on failure. */
|
||||
char *results_json =
|
||||
kreuzberg_batch_extract_files_sync(items_json, config);
|
||||
if (!results_json) {
|
||||
fprintf(stderr, "batch extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("%s\n", results_json);
|
||||
kreuzberg_free_string(results_json);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
73
docs/snippets/c/api/client_chunk_text.md
Normal file
73
docs/snippets/c/api/client_chunk_text.md
Normal file
@@ -0,0 +1,73 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```c title="C"
|
||||
#include <curl/curl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Growable, NUL-terminated buffer that accumulates the HTTP response body. */
struct response_buffer {
    char *data;  /* heap storage; NULL until the first chunk is appended */
    size_t size; /* bytes stored, not counting the trailing NUL */
};

/*
 * libcurl write callback: append the received chunk to the response_buffer
 * passed via userp and keep the buffer NUL-terminated.
 *
 * contents/size/nmemb describe the chunk (size * nmemb bytes).
 * Returns the number of bytes appended, or 0 when the sizes would overflow
 * size_t or the reallocation fails; in both cases the buffer is left untouched.
 */
static size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    struct response_buffer *buf = (struct response_buffer *)userp;
    const size_t limit = (size_t)-1;

    /* Reject size * nmemb wrapping around. */
    if (nmemb != 0 && size > limit / nmemb) {
        return 0;
    }
    size_t total = size * nmemb;

    /* Reject buf->size + total + 1 (room for the NUL) wrapping around. */
    if (total >= limit || buf->size > limit - 1 - total) {
        return 0;
    }

    /* Keep the old pointer until realloc succeeds so nothing leaks on failure. */
    char *resized = realloc(buf->data, buf->size + total + 1);
    if (!resized) {
        return 0;
    }
    buf->data = resized;

    if (total != 0) {
        memcpy(buf->data + buf->size, contents, total);
    }
    buf->size += total;
    buf->data[buf->size] = '\0';
    return total;
}
|
||||
|
||||
int main(void) {
|
||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
fprintf(stderr, "curl_easy_init failed\n");
|
||||
curl_global_cleanup();
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char *body =
|
||||
"{"
|
||||
"\"text\": \"Lorem ipsum dolor sit amet, consectetur adipiscing elit.\","
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"config\": {\"max_characters\": 256, \"overlap\": 32, \"trim\": true}"
|
||||
"}";
|
||||
|
||||
struct curl_slist *headers = NULL;
|
||||
headers = curl_slist_append(headers, "Content-Type: application/json");
|
||||
headers = curl_slist_append(headers, "Accept: application/json");
|
||||
|
||||
struct response_buffer response = {NULL, 0};
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8000/chunk");
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body);
|
||||
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, (long)strlen(body));
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
|
||||
|
||||
CURLcode rc = curl_easy_perform(curl);
|
||||
if (rc != CURLE_OK) {
|
||||
fprintf(stderr, "request failed: %s\n", curl_easy_strerror(rc));
|
||||
} else {
|
||||
long status = 0;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
|
||||
printf("HTTP %ld\n%s\n", status, response.data ? response.data : "(empty)");
|
||||
}
|
||||
|
||||
free(response.data);
|
||||
curl_slist_free_all(headers);
|
||||
curl_easy_cleanup(curl);
|
||||
curl_global_cleanup();
|
||||
return rc == CURLE_OK ? 0 : 1;
|
||||
}
|
||||
```
|
||||
65
docs/snippets/c/api/client_extract_single_file.md
Normal file
65
docs/snippets/c/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,65 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```c title="C"
|
||||
#include <curl/curl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Accumulates the HTTP response body as a NUL-terminated heap string. */
struct response_buffer {
    char *data;  /* heap storage; NULL until the first chunk is appended */
    size_t size; /* bytes stored, not counting the trailing NUL */
};

/*
 * libcurl write callback: grow the response_buffer passed via userp and copy
 * the incoming chunk (size * nmemb bytes at contents) onto its end, keeping
 * the data NUL-terminated.
 *
 * Returns the number of bytes appended, or 0 when the sizes would overflow
 * size_t or the reallocation fails; in both cases the buffer is left untouched.
 */
static size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    struct response_buffer *buf = (struct response_buffer *)userp;
    const size_t limit = (size_t)-1;

    /* Reject size * nmemb wrapping around. */
    if (nmemb != 0 && size > limit / nmemb) {
        return 0;
    }
    size_t total = size * nmemb;

    /* Reject buf->size + total + 1 (room for the NUL) wrapping around. */
    if (total >= limit || buf->size > limit - 1 - total) {
        return 0;
    }

    /* Keep the old pointer until realloc succeeds so nothing leaks on failure. */
    char *resized = realloc(buf->data, buf->size + total + 1);
    if (!resized) {
        return 0;
    }
    buf->data = resized;

    if (total != 0) {
        memcpy(buf->data + buf->size, contents, total);
    }
    buf->size += total;
    buf->data[buf->size] = '\0';
    return total;
}
|
||||
|
||||
int main(void) {
|
||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
fprintf(stderr, "curl_easy_init failed\n");
|
||||
curl_global_cleanup();
|
||||
return 1;
|
||||
}
|
||||
|
||||
struct response_buffer response = {NULL, 0};
|
||||
|
||||
curl_mime *form = curl_mime_init(curl);
|
||||
curl_mimepart *part = curl_mime_addpart(form);
|
||||
curl_mime_name(part, "file");
|
||||
curl_mime_filedata(part, "document.pdf");
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8000/extract");
|
||||
curl_easy_setopt(curl, CURLOPT_MIMEPOST, form);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
|
||||
|
||||
CURLcode rc = curl_easy_perform(curl);
|
||||
if (rc != CURLE_OK) {
|
||||
fprintf(stderr, "request failed: %s\n", curl_easy_strerror(rc));
|
||||
} else {
|
||||
long status = 0;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
|
||||
printf("HTTP %ld\n%s\n", status, response.data ? response.data : "(empty)");
|
||||
}
|
||||
|
||||
free(response.data);
|
||||
curl_mime_free(form);
|
||||
curl_easy_cleanup(curl);
|
||||
curl_global_cleanup();
|
||||
return rc == CURLE_OK ? 0 : 1;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/api/combining_all_features.md
Normal file
44
docs/snippets/c/api/combining_all_features.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
/* Combine chunking, OCR, image extraction, and Markdown output in one config. */
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"output_format\": \"markdown\","
|
||||
"\"force_ocr\": true,"
|
||||
"\"ocr\": {\"backend\": \"tesseract\", \"languages\": [\"eng\", \"deu\"]},"
|
||||
"\"chunking\": {\"chunker_type\": \"character\", \"max_characters\": 1024, \"overlap\": 128, \"trim\": true},"
|
||||
"\"images\": {\"extract_images\": true, \"target_dpi\": 300, \"inject_placeholders\": true}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
int32_t code = kreuzberg_last_error_code();
|
||||
const char *message = kreuzberg_last_error_context();
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
code, message ? message : "(no message)");
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return code != 0 ? code : 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
29
docs/snippets/c/api/error_handling.md
Normal file
29
docs/snippets/c/api/error_handling.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
/* Pass an unsupported MIME type to trigger an error. */
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_bytes_sync(NULL, 0, "application/x-unknown", config);
|
||||
if (!result) {
|
||||
int32_t code = kreuzberg_last_error_code();
|
||||
const char *message = kreuzberg_last_error_context();
|
||||
/* message is valid until the next FFI call on this thread — copy if needed. */
|
||||
fprintf(stderr, "error %d: %s\n", code, message ? message : "(no message)");
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return code != 0 ? code : 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
47
docs/snippets/c/api/error_handling_extract.md
Normal file
47
docs/snippets/c/api/error_handling_extract.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
/* Mixed-validity batch: a real PDF, a missing file, and an unsupported type. */
|
||||
const char *items_json =
|
||||
"["
|
||||
" {\"path\": \"document.pdf\"},"
|
||||
" {\"path\": \"does-not-exist.pdf\"},"
|
||||
" {\"path\": \"archive.unknownext\"}"
|
||||
"]";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
if (!config) {
|
||||
fprintf(stderr, "config init failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Returns a JSON array of ExtractionResult objects (one per input, in order),
|
||||
* or NULL on a system-level failure. Per-item errors are encoded inside
|
||||
* each result object's metadata (e.g. an "errors" array). */
|
||||
char *results_json = kreuzberg_batch_extract_files(items_json, config);
|
||||
if (!results_json) {
|
||||
int32_t code = kreuzberg_last_error_code();
|
||||
const char *message = kreuzberg_last_error_context();
|
||||
/* message is valid until the next FFI call on this thread — copy if needed. */
|
||||
fprintf(stderr, "batch extraction aborted (code %d): %s\n",
|
||||
code, message ? message : "(no message)");
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return code != 0 ? code : 1;
|
||||
}
|
||||
|
||||
/* Walk the returned JSON. A real consumer would feed this to a JSON parser
|
||||
* and inspect each result's metadata.errors / content fields. */
|
||||
size_t len = strlen(results_json);
|
||||
printf("results (%zu bytes):\n%s\n", len, results_json);
|
||||
|
||||
kreuzberg_free_string(results_json);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
35
docs/snippets/c/api/extract_bytes_async.md
Normal file
35
docs/snippets/c/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* kreuzberg_extract_bytes schedules work on the global Tokio runtime and
|
||||
* returns once extraction is complete. For true non-blocking use, call it
|
||||
* from a dedicated OS thread and synchronize via a semaphore or callback. */
|
||||
int main(void) {
|
||||
const char *text = "Hello, kreuzberg!";
|
||||
const uint8_t *bytes = (const uint8_t *)text;
|
||||
size_t len = strlen(text);
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_bytes(bytes, len, "text/plain", config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
32
docs/snippets/c/api/extract_bytes_sync.md
Normal file
32
docs/snippets/c/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *text = "Hello, kreuzberg!";
|
||||
const uint8_t *bytes = (const uint8_t *)text;
|
||||
size_t len = strlen(text);
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_bytes_sync(bytes, len, "text/plain", config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
30
docs/snippets/c/api/extract_file_async.md
Normal file
30
docs/snippets/c/api/extract_file_async.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* kreuzberg_extract_file schedules work on the global Tokio runtime and
|
||||
* returns once extraction is complete. For true non-blocking use, call it
|
||||
* from a dedicated OS thread and synchronize via a semaphore or callback. */
|
||||
int main(void) {
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
27
docs/snippets/c/api/extract_file_sync.md
Normal file
27
docs/snippets/c/api/extract_file_sync.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
25
docs/snippets/c/config/advanced_config.md
Normal file
25
docs/snippets/c/config/advanced_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
struct ConfigBuilder *builder = kreuzberg_config_builder_new();
|
||||
kreuzberg_config_builder_set_use_cache(builder, 1);
|
||||
kreuzberg_config_builder_set_include_document_structure(builder, 1);
|
||||
kreuzberg_config_builder_set_ocr(builder,
|
||||
"{\"tesseract\":{\"language\":\"eng\"}}");
|
||||
|
||||
ExtractionConfig *config = kreuzberg_config_builder_build(builder);
|
||||
|
||||
struct CExtractionResult *result =
|
||||
kreuzberg_extract_file_sync_with_config("scan.pdf",
|
||||
kreuzberg_config_to_json(config));
|
||||
if (result && result->success) {
|
||||
printf("%s\n", result->content);
|
||||
}
|
||||
|
||||
kreuzberg_free_result(result);
|
||||
kreuzberg_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
43
docs/snippets/c/config/chunking_config.md
Normal file
43
docs/snippets/c/config/chunking_config.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 1000,"
|
||||
"\"overlap\": 200,"
|
||||
"\"trim\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
38
docs/snippets/c/config/config_basic.md
Normal file
38
docs/snippets/c/config/config_basic.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"use_cache\": true,"
|
||||
"\"enable_quality_processing\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
62
docs/snippets/c/config/config_discover.md
Normal file
62
docs/snippets/c/config/config_discover.md
Normal file
@@ -0,0 +1,62 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* The C FFI does not expose config-file auto-discovery directly. Load the
|
||||
* file contents in your application and pass the JSON to
|
||||
* kreuzberg_extraction_config_from_json. For TOML/YAML, convert in your
|
||||
* application before calling the FFI. */
|
||||
/* Reads the whole file at `path` into a newly allocated, NUL-terminated buffer.
 *
 * Returns NULL when the file cannot be opened, its size cannot be determined,
 * the allocation fails, or the read reports an error. On success the caller
 * owns the buffer and releases it with free(). */
static char *read_text_file(const char *path) {
    FILE *fp = fopen(path, "rb");
    if (!fp) {
        return NULL;
    }

    char *buf = NULL;
    long size = -1;

    /* Seek to the end to learn the size; ftell returns -1 on failure, which
     * must not reach the allocation or the terminator write below. */
    if (fseek(fp, 0, SEEK_END) == 0) {
        size = ftell(fp);
    }
    if (size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
        fclose(fp);
        return NULL;
    }

    buf = malloc((size_t)size + 1);
    if (!buf) {
        fclose(fp);
        return NULL;
    }

    size_t got = fread(buf, 1, (size_t)size, fp);
    if (got != (size_t)size && ferror(fp)) {
        free(buf);
        fclose(fp);
        return NULL;
    }

    /* Terminate at the number of bytes actually read so a short read never
     * exposes uninitialised memory. */
    buf[got] = '\0';
    fclose(fp);
    return buf;
}
|
||||
|
||||
int main(void) {
|
||||
char *json = read_text_file("kreuzberg.json");
|
||||
KREUZBERGExtractionConfig *config = json
|
||||
? kreuzberg_extraction_config_from_json(json)
|
||||
: kreuzberg_extraction_config_default();
|
||||
free(json);
|
||||
|
||||
if (!config) {
|
||||
fprintf(stderr, "config load failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/config/config_ocr.md
Normal file
40
docs/snippets/c/config/config_ocr.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"tesseract\","
|
||||
"\"language\": \"eng\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
48
docs/snippets/c/config/config_programmatic.md
Normal file
48
docs/snippets/c/config/config_programmatic.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"use_cache\": true,"
|
||||
"\"enable_quality_processing\": true,"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"tesseract\","
|
||||
"\"language\": \"eng+deu\","
|
||||
"\"tesseract_config\": {\"psm\": 6}"
|
||||
"},"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 1000,"
|
||||
"\"overlap\": 200"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
37
docs/snippets/c/config/document_structure_config.md
Normal file
37
docs/snippets/c/config/document_structure_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"include_document_structure\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
37
docs/snippets/c/config/element_based_output.md
Normal file
37
docs/snippets/c/config/element_based_output.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"result_format\": \"element_based\""
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
47
docs/snippets/c/config/embedding_config.md
Normal file
47
docs/snippets/c/config/embedding_config.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 1000,"
|
||||
"\"overlap\": 200,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"batch_size\": 16,"
|
||||
"\"normalize\": true,"
|
||||
"\"show_download_progress\": true"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/config/html_output.md
Normal file
40
docs/snippets/c/config/html_output.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"output_format\": \"html\","
|
||||
"\"html_output\": {"
|
||||
"\"theme\": \"github\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
43
docs/snippets/c/config/keyword_extraction_config.md
Normal file
43
docs/snippets/c/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"keywords\": {"
|
||||
"\"algorithm\": \"yake\","
|
||||
"\"max_keywords\": 10,"
|
||||
"\"min_score\": 0.1,"
|
||||
"\"ngram_range\": [1, 3],"
|
||||
"\"language\": \"en\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/config/language_detection_config.md
Normal file
41
docs/snippets/c/config/language_detection_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"language_detection\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"min_confidence\": 0.8,"
|
||||
"\"detect_multiple\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/config/ocr_dpi_config.md
Normal file
44
docs/snippets/c/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"images\": {"
|
||||
"\"extract_images\": true,"
|
||||
"\"target_dpi\": 300,"
|
||||
"\"max_image_dimension\": 4096,"
|
||||
"\"auto_adjust_dpi\": true,"
|
||||
"\"min_dpi\": 150,"
|
||||
"\"max_dpi\": 600"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/config/pdf_config.md
Normal file
41
docs/snippets/c/config/pdf_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"pdf_options\": {"
|
||||
"\"extract_images\": true,"
|
||||
"\"passwords\": [\"password123\"],"
|
||||
"\"extract_metadata\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("encrypted.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
45
docs/snippets/c/config/pdf_hierarchy_config.md
Normal file
45
docs/snippets/c/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"pdf_options\": {"
|
||||
"\"hierarchy\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"detection_threshold\": 0.75,"
|
||||
"\"ocr_coverage_threshold\": 0.8,"
|
||||
"\"min_level\": 1,"
|
||||
"\"max_level\": 5"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/config/postprocessor_config.md
Normal file
40
docs/snippets/c/config/postprocessor_config.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"postprocessor\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"enabled_processors\": [\"whitespace_normalizer\", \"unicode_normalizer\"]"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
38
docs/snippets/c/config/quality_processing_config.md
Normal file
38
docs/snippets/c/config/quality_processing_config.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"enable_quality_processing\": true,"
|
||||
"\"use_cache\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/config/tesseract_config.md
Normal file
44
docs/snippets/c/config/tesseract_config.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"tesseract\","
|
||||
"\"language\": \"eng+deu\","
|
||||
"\"tesseract_config\": {"
|
||||
"\"psm\": 6,"
|
||||
"\"oem\": 3"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/config/token_reduction_config.md
Normal file
40
docs/snippets/c/config/token_reduction_config.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
28
docs/snippets/c/getting-started/basic_usage.md
Normal file
28
docs/snippets/c/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
39
docs/snippets/c/getting-started/extract_file.md
Normal file
39
docs/snippets/c/getting-started/extract_file.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("document.pdf", NULL, NULL);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("content:\n%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
char *tables_json = kreuzberg_extraction_result_tables(result);
|
||||
printf("tables (JSON): %s\n", tables_json ? tables_json : "[]");
|
||||
kreuzberg_free_string(tables_json);
|
||||
|
||||
KREUZBERGMetadata *metadata = kreuzberg_extraction_result_metadata(result);
|
||||
if (metadata) {
|
||||
char *title = kreuzberg_metadata_title(metadata);
|
||||
char *language = kreuzberg_metadata_language(metadata);
|
||||
printf("title: %s\n", title ? title : "(none)");
|
||||
printf("language: %s\n", language ? language : "(none)");
|
||||
kreuzberg_free_string(title);
|
||||
kreuzberg_free_string(language);
|
||||
kreuzberg_metadata_free(metadata);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
46
docs/snippets/c/getting-started/extract_with_ocr.md
Normal file
46
docs/snippets/c/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"force_ocr\": true,"
|
||||
"\"ocr\": {\"backend\": \"tesseract\", \"language\": \"eng\"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config =
|
||||
kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config init failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("scanned.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
char *detected_languages = kreuzberg_extraction_result_detected_languages(result);
|
||||
printf("detected languages: %s\n",
|
||||
detected_languages ? detected_languages : "(none)");
|
||||
kreuzberg_free_string(detected_languages);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
24
docs/snippets/c/getting-started/hello_world.md
Normal file
24
docs/snippets/c/getting-started/hello_world.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("document.pdf", NULL, NULL);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
12
docs/snippets/c/getting-started/install_verify.md
Normal file
12
docs/snippets/c/getting-started/install_verify.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Installation check: print the version string reported by the library, or
 * "(unknown)" when kreuzberg_version() returns NULL. Always exits with 0.
 */
int main(void) {
    const char *reported = kreuzberg_version();
    if (reported == NULL) {
        reported = "(unknown)";
    }
    printf("kreuzberg version: %s\n", reported);
    return 0;
}
|
||||
```
|
||||
38
docs/snippets/c/getting-started/read_content.md
Normal file
38
docs/snippets/c/getting-started/read_content.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file("document.pdf", NULL, NULL);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
if (content) {
|
||||
printf("content length: %zu bytes\n", strlen(content));
|
||||
printf("%s\n", content);
|
||||
}
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
/* Tables are returned as a JSON array string. A real consumer would
|
||||
* feed this into a JSON parser and walk each table's grid. */
|
||||
char *tables_json = kreuzberg_extraction_result_tables(result);
|
||||
if (tables_json) {
|
||||
printf("tables JSON (%zu bytes):\n%s\n",
|
||||
strlen(tables_json), tables_json);
|
||||
} else {
|
||||
printf("tables JSON: (none)\n");
|
||||
}
|
||||
kreuzberg_free_string(tables_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
57
docs/snippets/c/llm/structured_extraction.md
Normal file
57
docs/snippets/c/llm/structured_extraction.md
Normal file
@@ -0,0 +1,57 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"structured_extraction\": {"
|
||||
"\"schema\": {"
|
||||
"\"type\": \"object\","
|
||||
"\"properties\": {"
|
||||
"\"title\": {\"type\": \"string\"},"
|
||||
"\"authors\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}},"
|
||||
"\"date\": {\"type\": \"string\"}"
|
||||
"},"
|
||||
"\"required\": [\"title\", \"authors\", \"date\"],"
|
||||
"\"additionalProperties\": false"
|
||||
"},"
|
||||
"\"llm\": {\"model\": \"openai/gpt-4o-mini\"},"
|
||||
"\"strict\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *structured = kreuzberg_extraction_result_structured_output(result);
|
||||
if (structured) {
|
||||
printf("structured output (JSON):\n%s\n", structured);
|
||||
kreuzberg_free_string(structured);
|
||||
} else {
|
||||
printf("structured output: (none)\n");
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment.
|
||||
56
docs/snippets/c/mcp/mcp_custom_client.md
Normal file
56
docs/snippets/c/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```c title="C"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/* The kreuzberg C FFI does not bundle an MCP client. Drive the kreuzberg
|
||||
* CLI's stdio MCP transport from a C host that also links libkreuzberg. */
|
||||
int main(void) {
|
||||
int request_pipe[2];
|
||||
int response_pipe[2];
|
||||
if (pipe(request_pipe) < 0 || pipe(response_pipe) < 0) {
|
||||
perror("pipe");
|
||||
return 1;
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
perror("fork");
|
||||
return 1;
|
||||
}
|
||||
if (pid == 0) {
|
||||
dup2(request_pipe[0], 0);
|
||||
dup2(response_pipe[1], 1);
|
||||
close(request_pipe[1]);
|
||||
close(response_pipe[0]);
|
||||
execlp("kreuzberg", "kreuzberg", "mcp", (char *)NULL);
|
||||
perror("execlp");
|
||||
_exit(127);
|
||||
}
|
||||
|
||||
close(request_pipe[0]);
|
||||
close(response_pipe[1]);
|
||||
|
||||
const char *request =
|
||||
"{\"method\":\"tools/call\","
|
||||
"\"params\":{\"name\":\"extract_file\","
|
||||
"\"arguments\":{\"path\":\"document.pdf\",\"async\":true}}}\n";
|
||||
if (write(request_pipe[1], request, strlen(request)) < 0) {
|
||||
perror("write");
|
||||
return 1;
|
||||
}
|
||||
close(request_pipe[1]);
|
||||
|
||||
char buffer[4096];
|
||||
ssize_t bytes_read = read(response_pipe[0], buffer, sizeof(buffer) - 1);
|
||||
if (bytes_read > 0) {
|
||||
buffer[bytes_read] = '\0';
|
||||
printf("%s", buffer);
|
||||
}
|
||||
close(response_pipe[0]);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> No MCP client is exposed by libkreuzberg; this snippet drives the MCP CLI over stdio.
|
||||
29
docs/snippets/c/mcp/mcp_server_start.md
Normal file
29
docs/snippets/c/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```c title="C"
|
||||
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
|
||||
|
||||
/* The kreuzberg C FFI does not embed the MCP server. Spawn the kreuzberg
|
||||
* CLI from a host process that uses libkreuzberg for in-process extraction. */
|
||||
/*
 * Launch `kreuzberg mcp` as a child process and wait for it to finish.
 *
 * Exit status: the child's own exit status when it exits normally (127 if
 * exec fails inside the child); 1 when fork or waitpid fails, or when the
 * child does not exit normally.
 */
int main(void) {
    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        /* Child: replace the process image with the CLI. execlp only
         * returns on failure. */
        execlp("kreuzberg", "kreuzberg", "mcp", (char *)NULL);
        perror("execlp");
        _exit(127);
    }

    int status = 0;
    if (waitpid(pid, &status, 0) < 0) {
        perror("waitpid");
        return 1;
    }
    if (WIFEXITED(status)) {
        return WEXITSTATUS(status);
    }
    /* Report why the server went away instead of failing silently. */
    if (WIFSIGNALED(status)) {
        fprintf(stderr, "kreuzberg mcp terminated by signal %d\n",
                WTERMSIG(status));
    }
    return 1;
}
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> The MCP server is exposed only through the kreuzberg CLI; libkreuzberg's C FFI offers no MCP entry point. This snippet spawns the CLI from a host that already links against libkreuzberg.
|
||||
56
docs/snippets/c/metadata/language_detection.md
Normal file
56
docs/snippets/c/metadata/language_detection.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"language_detection\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"min_confidence\": 0.9,"
|
||||
"\"detect_multiple\": false"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGMetadata *metadata = kreuzberg_extraction_result_metadata(result);
|
||||
if (metadata) {
|
||||
char *language = kreuzberg_metadata_language(metadata);
|
||||
if (language) {
|
||||
printf("Metadata language: %s\n", language);
|
||||
kreuzberg_free_string(language);
|
||||
}
|
||||
kreuzberg_metadata_free(metadata);
|
||||
}
|
||||
|
||||
char *detected_languages_json = kreuzberg_extraction_result_detected_languages(result);
|
||||
if (detected_languages_json) {
|
||||
printf("Detected languages: %s\n", detected_languages_json);
|
||||
kreuzberg_free_string(detected_languages_json);
|
||||
} else {
|
||||
printf("No languages detected\n");
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
46
docs/snippets/c/metadata/language_detection_multilingual.md
Normal file
46
docs/snippets/c/metadata/language_detection_multilingual.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"language_detection\": {"
|
||||
"\"enabled\": true,"
|
||||
"\"min_confidence\": 0.8,"
|
||||
"\"detect_multiple\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("multilingual_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *detected_languages_json = kreuzberg_extraction_result_detected_languages(result);
|
||||
if (detected_languages_json) {
|
||||
printf("Detected languages (JSON array): %s\n", detected_languages_json);
|
||||
kreuzberg_free_string(detected_languages_json);
|
||||
} else {
|
||||
printf("No languages detected\n");
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
27
docs/snippets/c/metadata/metadata.md
Normal file
27
docs/snippets/c/metadata/metadata.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
struct CExtractionResult *result = kreuzberg_extract_file_sync("document.pdf");
|
||||
if (!result || !result->success) {
|
||||
fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("Content: %s\n", result->content);
|
||||
printf("MIME: %s\n", result->mime_type);
|
||||
|
||||
if (result->language)
|
||||
printf("Language: %s\n", result->language);
|
||||
if (result->date)
|
||||
printf("Date: %s\n", result->date);
|
||||
if (result->subject)
|
||||
printf("Subject: %s\n", result->subject);
|
||||
if (result->metadata_json)
|
||||
printf("Metadata: %s\n", result->metadata_json);
|
||||
|
||||
kreuzberg_free_result(result);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
62
docs/snippets/c/metadata/page_boundaries.md
Normal file
62
docs/snippets/c/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,62 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"pages\": {"
|
||||
"\"extract_pages\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
if (content) {
|
||||
printf("Total content length: %zu bytes\n", strlen(content));
|
||||
kreuzberg_free_string(content);
|
||||
}
|
||||
|
||||
KREUZBERGMetadata *metadata = kreuzberg_extraction_result_metadata(result);
|
||||
if (metadata) {
|
||||
KREUZBERGPageStructure *pages = kreuzberg_metadata_pages(metadata);
|
||||
if (pages) {
|
||||
printf("Total pages: %zu\n", kreuzberg_page_structure_total_count(pages));
|
||||
|
||||
char *boundaries_json = kreuzberg_page_structure_boundaries(pages);
|
||||
if (boundaries_json) {
|
||||
printf("Page boundaries (JSON): %s\n", boundaries_json);
|
||||
kreuzberg_free_string(boundaries_json);
|
||||
} else {
|
||||
printf("No page boundaries available\n");
|
||||
}
|
||||
kreuzberg_page_structure_free(pages);
|
||||
} else {
|
||||
printf("No page structure available\n");
|
||||
}
|
||||
kreuzberg_metadata_free(metadata);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
54
docs/snippets/c/metadata/page_tracking_basic.md
Normal file
54
docs/snippets/c/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"pages\": {"
|
||||
"\"extract_pages\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *pages_json = kreuzberg_extraction_result_pages(result);
|
||||
if (pages_json) {
|
||||
printf("Pages (JSON array): %s\n", pages_json);
|
||||
kreuzberg_free_string(pages_json);
|
||||
} else {
|
||||
printf("No pages available\n");
|
||||
}
|
||||
|
||||
KREUZBERGMetadata *metadata = kreuzberg_extraction_result_metadata(result);
|
||||
if (metadata) {
|
||||
KREUZBERGPageStructure *pages = kreuzberg_metadata_pages(metadata);
|
||||
if (pages) {
|
||||
printf("Total page count: %zu\n", kreuzberg_page_structure_total_count(pages));
|
||||
kreuzberg_page_structure_free(pages);
|
||||
}
|
||||
kreuzberg_metadata_free(metadata);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
21
docs/snippets/c/metadata/tables.md
Normal file
21
docs/snippets/c/metadata/tables.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
struct CExtractionResult *result = kreuzberg_extract_file_sync("spreadsheet.xlsx");
|
||||
if (!result || !result->success) {
|
||||
fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (result->tables_json) {
|
||||
printf("Tables (JSON): %s\n", result->tables_json);
|
||||
} else {
|
||||
printf("No tables found\n");
|
||||
}
|
||||
|
||||
kreuzberg_free_result(result);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
60
docs/snippets/c/metadata/vector_database_integration.md
Normal file
60
docs/snippets/c/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 512,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
if (chunks_json) {
|
||||
printf("Chunks with embeddings (JSON): %s\n", chunks_json);
|
||||
kreuzberg_free_string(chunks_json);
|
||||
} else {
|
||||
printf("No chunks produced\n");
|
||||
}
|
||||
|
||||
KREUZBERGMetadata *metadata = kreuzberg_extraction_result_metadata(result);
|
||||
if (metadata) {
|
||||
char *title = kreuzberg_metadata_title(metadata);
|
||||
if (title) {
|
||||
printf("Document title: %s\n", title);
|
||||
kreuzberg_free_string(title);
|
||||
}
|
||||
kreuzberg_metadata_free(metadata);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
42
docs/snippets/c/ocr/cloud_ocr_backend.md
Normal file
42
docs/snippets/c/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
/* Cloud OCR backends are registered as custom plugins via the Rust core. */
|
||||
/* Select a registered cloud backend by name through the OCR config. */
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"cloud-ocr\","
|
||||
"\"language\": \"eng\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
39
docs/snippets/c/ocr/image_extraction.md
Normal file
39
docs/snippets/c/ocr/image_extraction.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"images\": {"
|
||||
"\"extract_images\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/ocr/image_preprocessing.md
Normal file
44
docs/snippets/c/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"images\": {"
|
||||
"\"extract_images\": true,"
|
||||
"\"target_dpi\": 300,"
|
||||
"\"max_image_dimension\": 4096,"
|
||||
"\"auto_adjust_dpi\": true,"
|
||||
"\"min_dpi\": 150,"
|
||||
"\"max_dpi\": 600"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/ocr/ocr_easyocr.md
Normal file
40
docs/snippets/c/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"easyocr\","
|
||||
"\"language\": \"en\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
43
docs/snippets/c/ocr/ocr_elements.md
Normal file
43
docs/snippets/c/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"paddleocr\","
|
||||
"\"language\": \"en\","
|
||||
"\"element_config\": {"
|
||||
"\"include_elements\": true"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
26
docs/snippets/c/ocr/ocr_extraction.md
Normal file
26
docs/snippets/c/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
struct ConfigBuilder *builder = kreuzberg_config_builder_new();
|
||||
kreuzberg_config_builder_set_ocr(builder,
|
||||
"{\"tesseract\":{\"language\":\"eng\"}}");
|
||||
ExtractionConfig *config = kreuzberg_config_builder_build(builder);
|
||||
|
||||
char *config_json = kreuzberg_config_to_json(config);
|
||||
struct CExtractionResult *result =
|
||||
kreuzberg_extract_file_sync_with_config("scanned.png", config_json);
|
||||
|
||||
if (result && result->success) {
|
||||
printf("OCR text: %s\n", result->content);
|
||||
} else {
|
||||
fprintf(stderr, "OCR error: %s\n", kreuzberg_get_error_details().message);
|
||||
}
|
||||
|
||||
kreuzberg_free_result(result);
|
||||
kreuzberg_free_string(config_json);
|
||||
kreuzberg_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/ocr/ocr_force_all_pages.md
Normal file
41
docs/snippets/c/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"force_ocr\": true,"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"tesseract\","
|
||||
"\"language\": \"eng\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/ocr/ocr_multi_language.md
Normal file
40
docs/snippets/c/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"tesseract\","
|
||||
"\"language\": \"eng+deu+fra\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("multilingual.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
40
docs/snippets/c/ocr/ocr_paddleocr.md
Normal file
40
docs/snippets/c/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"ocr\": {"
|
||||
"\"backend\": \"paddleocr\","
|
||||
"\"language\": \"en\""
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
30
docs/snippets/c/plugins/clear_plugins.md
Normal file
30
docs/snippets/c/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Clears the post-processor, OCR-backend and validator registries, in that
 * order, stopping at the first call that reports a non-zero status.
 *
 * Returns 0 when all three registries were cleared, 1 otherwise.
 */
int main(void) {
    int status = 0;

    if (kreuzberg_clear_post_processors() != 0) {
        fprintf(stderr, "clear post-processors failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        status = 1;
    } else if (kreuzberg_clear_ocr_backends() != 0) {
        fprintf(stderr, "clear OCR backends failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        status = 1;
    } else if (kreuzberg_clear_validators() != 0) {
        fprintf(stderr, "clear validators failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        status = 1;
    } else {
        printf("All plugins cleared\n");
    }

    return status;
}
|
||||
```
|
||||
115
docs/snippets/c/plugins/embedding_backend.md
Normal file
115
docs/snippets/c/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,115 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Fixed embedding dimension produced by this backend. */
#define EMBED_DIM 768

/*
 * Returns a malloc()-allocated copy of the NUL-terminated string `s`,
 * or NULL if the allocation fails.
 */
static char *dup_cstr(const char *s) {
    size_t len = strlen(s);
    char *out = malloc(len + 1);
    if (out) {
        memcpy(out, s, len + 1);
    }
    return out;
}

/* Reports the length of every embedding vector: always EMBED_DIM. */
static uintptr_t dimensions_fn(const void *user_data) {
    (void)user_data;
    return (uintptr_t)EMBED_DIM;
}

/*
 * Counts string literals in `json` by pairing up unescaped double quotes.
 * A backslash makes the next byte literal, so `\"` inside a string does not
 * terminate it. This is a scan, not a JSON parser.
 */
static size_t count_json_strings(const char *json) {
    size_t count = 0;
    int in_string = 0;
    int escape = 0;
    for (const char *p = json; *p; ++p) {
        if (escape) {
            escape = 0;
        } else if (*p == '\\') {
            escape = 1;
        } else if (*p == '"') {
            if (!in_string) {
                count += 1;
            }
            in_string = !in_string;
        }
    }
    return count;
}

/*
 * Produces one all-zero vector of length EMBED_DIM per input text.
 *
 * `texts` is a JSON array of strings; a real backend would parse the JSON
 * and call its host model. On success a malloc()-allocated JSON array of
 * arrays is stored in `*out_result` and 0 is returned. On failure 1 is
 * returned and, when `out_error` is non-NULL, a malloc()-allocated message is
 * stored in `*out_error`.
 *
 * NOTE(review): the returned buffers come from malloc(); confirm that the
 * library releases plugin-provided strings with a matching free().
 */
static int32_t embed_fn(
    const void *user_data,
    const char *texts,
    char **out_result,
    char **out_error
) {
    (void)user_data;

    /* Reject missing arguments instead of dereferencing NULL. */
    if (!texts || !out_result) {
        if (out_error) {
            *out_error = dup_cstr("invalid arguments");
        }
        return 1;
    }

    const size_t count = count_json_strings(texts);

    /* Each vector needs '[' + EMBED_DIM * "0.0" + (EMBED_DIM - 1) commas + ']'
     * = 4 * EMBED_DIM + 1 bytes, plus one separating comma. The estimate below
     * is slightly generous and leaves room for the outer brackets and NUL. */
    size_t cap = 16 + count * (EMBED_DIM * 4 + 4);
    char *json = malloc(cap);
    if (!json) {
        if (out_error) {
            *out_error = dup_cstr("allocation failure");
        }
        return 1;
    }

    size_t pos = 0;
    json[pos++] = '[';
    for (size_t i = 0; i < count; ++i) {
        if (i > 0) {
            json[pos++] = ',';
        }
        json[pos++] = '[';
        for (size_t d = 0; d < EMBED_DIM; ++d) {
            if (d > 0) {
                json[pos++] = ',';
            }
            json[pos++] = '0';
            json[pos++] = '.';
            json[pos++] = '0';
        }
        json[pos++] = ']';
    }
    json[pos++] = ']';
    json[pos] = '\0';

    *out_result = json;
    return 0;
}

/* Stores a malloc()-allocated copy of the backend name in `*out_name`. */
static void name_fn(const void *user_data, char **out_name) {
    (void)user_data;
    *out_name = dup_cstr("my-embedder");
}

/* Stores a malloc()-allocated copy of the backend version in `*out_version`. */
static void version_fn(const void *user_data, char **out_version) {
    (void)user_data;
    *out_version = dup_cstr("1.0.0");
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergEmbeddingBackendVTable vtable = {0};
|
||||
vtable.name_fn = name_fn;
|
||||
vtable.version_fn = version_fn;
|
||||
vtable.dimensions = dimensions_fn;
|
||||
vtable.embed = embed_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_embedding_backend(
|
||||
"my-embedder",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register embedding backend failed: %s\n",
|
||||
err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("my-embedder registered (dim=%d)\n", EMBED_DIM);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
27
docs/snippets/c/plugins/extractor_registration.md
Normal file
27
docs/snippets/c/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*
|
||||
* The kreuzberg C FFI does not expose a public function for registering
|
||||
* custom DocumentExtractor implementations from C. Document extractors must
|
||||
* be registered from Rust via `kreuzberg::plugins::registry::get_document_extractor_registry()`
|
||||
* before the C library is loaded.
|
||||
*
|
||||
* From C you can inspect which extractors the core has registered:
|
||||
*/
|
||||
|
||||
/*
 * Prints the JSON list of document extractors returned by the library.
 * Returns 0 on success, 1 if the list could not be obtained.
 */
int main(void) {
    char *extractors = kreuzberg_list_document_extractors();

    if (extractors != NULL) {
        printf("Registered document extractors: %s\n", extractors);
        kreuzberg_free_string(extractors);
        return 0;
    }

    fprintf(stderr, "list document extractors failed (code %d): %s\n",
            kreuzberg_last_error_code(),
            kreuzberg_last_error_context());
    return 1;
}
|
||||
```
|
||||
25
docs/snippets/c/plugins/list_plugins.md
Normal file
25
docs/snippets/c/plugins/list_plugins.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Prints one registry listing as "<label>: <json>" and releases `json` with
 * kreuzberg_free_string(). When `json` is NULL the library's last error is
 * written to stderr instead and nothing is freed.
 */
static void print_plugin_list(const char *label, char *json) {
    if (json != NULL) {
        printf("%s: %s\n", label, json);
        kreuzberg_free_string(json);
        return;
    }

    fprintf(stderr, "list %s failed (code %d): %s\n",
            label,
            kreuzberg_last_error_code(),
            kreuzberg_last_error_context());
}
|
||||
|
||||
/*
 * Prints every registry listing exposed by the library, one per line.
 * Always returns 0; a failed listing is reported by print_plugin_list().
 */
int main(void) {
    char *listing;

    listing = kreuzberg_list_document_extractors();
    print_plugin_list("document extractors", listing);

    listing = kreuzberg_list_ocr_backends();
    print_plugin_list("OCR backends", listing);

    listing = kreuzberg_list_post_processors();
    print_plugin_list("post-processors", listing);

    listing = kreuzberg_list_validators();
    print_plugin_list("validators", listing);

    listing = kreuzberg_list_embedding_presets();
    print_plugin_list("embedding presets", listing);

    return 0;
}
|
||||
```
|
||||
90
docs/snippets/c/plugins/min_length_validator.md
Normal file
90
docs/snippets/c/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,90 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* user_data carries the minimum length threshold. */
typedef struct {
    size_t min_length;
} MinLengthState;

/*
 * Returns a malloc()-allocated copy of the NUL-terminated string `s`,
 * or NULL if the allocation fails.
 */
static char *dup_cstr(const char *s) {
    size_t len = strlen(s);
    char *out = malloc(len + 1);
    if (out) {
        memcpy(out, s, len + 1);
    }
    return out;
}

/*
 * Measures the body of a JSON string literal. `s` points just past the
 * opening quote. A backslash escape such as `\"` or `\\` counts as one
 * character and never ends the string. Returns 0 when no closing quote is
 * found. Remaining bytes are counted one by one, so multi-byte UTF-8
 * sequences and `\uXXXX` escapes are over-counted.
 */
static size_t json_string_length(const char *s) {
    size_t n = 0;
    while (*s != '"') {
        if (*s == '\0') {
            return 0; /* unterminated string */
        }
        if (*s == '\\' && s[1] != '\0') {
            s += 2; /* skip the escaped byte together with the backslash */
        } else {
            s += 1;
        }
        n += 1;
    }
    return n;
}

/*
 * Rejects results whose "content" field is shorter than the threshold held in
 * the MinLengthState passed as `user_data`.
 *
 * `result` is a JSON string of ExtractionResult. The content length is
 * approximated by scanning for the "content" field; production plugins should
 * parse JSON properly. A missing field counts as length 0.
 *
 * Returns 0 when the content is long enough. Otherwise returns 1 and, when
 * `out_error` is non-NULL, stores a malloc()-allocated message in it.
 */
static int32_t validate_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)config;
    const MinLengthState *state = (const MinLengthState *)user_data;

    static const char key[] = "\"content\":\"";
    const char *content = strstr(result, key);
    size_t content_len = 0;
    if (content) {
        /* sizeof includes the NUL, so subtract it to land after the quote. */
        content_len = json_string_length(content + sizeof(key) - 1);
    }

    if (content_len < state->min_length) {
        char buf[128];
        snprintf(buf, sizeof(buf),
                 "Content too short: %zu < %zu characters",
                 content_len,
                 state->min_length);
        if (out_error) {
            *out_error = dup_cstr(buf);
        }
        return 1;
    }
    return 0;
}

/* Reports this validator's priority value: always 100. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;
    return 100;
}

/* Releases the MinLengthState allocated with malloc() in main(). */
static void free_user_data(void *user_data) {
    free(user_data);
}
|
||||
|
||||
int main(void) {
|
||||
MinLengthState *state = (MinLengthState *)malloc(sizeof(MinLengthState));
|
||||
state->min_length = 100;
|
||||
|
||||
KREUZBERGKreuzbergValidatorVTable vtable = {0};
|
||||
vtable.validate = validate_fn;
|
||||
vtable.priority = priority_fn;
|
||||
vtable.free_user_data = free_user_data;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_validator(
|
||||
"min-length-validator",
|
||||
vtable,
|
||||
state,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register validator failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
free(state);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("min-length-validator registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
85
docs/snippets/c/plugins/pdf_metadata_extractor.md
Normal file
85
docs/snippets/c/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,85 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* The C FFI does not expose registration for custom DocumentExtractor
|
||||
* implementations. To add PDF-specific behaviour from C, register a
|
||||
* post-processor that runs only on PDF results and enriches them.
|
||||
*
|
||||
* The example below logs whenever the pipeline emits a PDF result, scoped
|
||||
* via the should_process hook so it never fires for other MIME types.
|
||||
*/
|
||||
|
||||
/*
 * Returns a heap-allocated copy of the NUL-terminated string `s`,
 * or NULL when malloc() fails.
 */
static char *dup_cstr(const char *s) {
    const size_t nbytes = strlen(s) + 1; /* include the terminator */
    char *copy = (char *)malloc(nbytes);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, s, nbytes);
    return copy;
}

/*
 * Post-processing callback: prints the byte length of the serialised result.
 * Always returns 0 and never writes to `out_error`.
 */
static int32_t process_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)config;
    (void)out_error;

    const size_t result_bytes = strlen(result);
    printf("pdf-metadata-extractor: serialised PDF result is %zu bytes\n", result_bytes);
    return 0;
}

/*
 * Stores a heap-allocated copy of the JSON string "Late" in `*out_result`
 * and returns 0.
 */
static int32_t processing_stage_fn(
    const void *user_data,
    char **out_result
) {
    (void)user_data;

    char *stage = dup_cstr("\"Late\"");
    *out_result = stage;
    return 0;
}

/*
 * Returns 1 when the serialised result contains the exact text
 * "mime_type":"application/pdf" and 0 otherwise.
 */
static int32_t should_process_fn(
    const void *user_data,
    const char *result,
    const char *config
) {
    (void)user_data;
    (void)config;

    const char *match = strstr(result, "\"mime_type\":\"application/pdf\"");
    return match != NULL;
}

/* Reports this post-processor's priority value: always 75. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;

    const int32_t priority = 75;
    return priority;
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergPostProcessorVTable vtable = {0};
|
||||
vtable.process = process_fn;
|
||||
vtable.processing_stage = processing_stage_fn;
|
||||
vtable.should_process = should_process_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_post_processor(
|
||||
"pdf-metadata-extractor",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register post-processor failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("pdf-metadata-extractor registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
78
docs/snippets/c/plugins/pdf_only_processor.md
Normal file
78
docs/snippets/c/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,78 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Returns a heap-allocated copy of the NUL-terminated string `s`,
 * or NULL when malloc() fails.
 */
static char *dup_cstr(const char *s) {
    const size_t nbytes = strlen(s) + 1; /* include the terminator */
    char *copy = (char *)malloc(nbytes);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, s, nbytes);
    return copy;
}

/*
 * Post-processing callback: prints the length of the serialised result.
 * Always returns 0 and never writes to `out_error`.
 */
static int32_t process_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)config;
    (void)out_error;

    const size_t result_len = strlen(result);
    printf("pdf-only-processor: handling result of length %zu\n", result_len);
    return 0;
}

/*
 * Stores a heap-allocated copy of the JSON string "Middle" in `*out_result`
 * and returns 0.
 */
static int32_t processing_stage_fn(
    const void *user_data,
    char **out_result
) {
    (void)user_data;

    char *stage = dup_cstr("\"Middle\"");
    *out_result = stage;
    return 0;
}

/*
 * Restricts processing to PDF results: returns 1 when the serialised result
 * contains the exact text "mime_type":"application/pdf" and 0 otherwise.
 */
static int32_t should_process_fn(
    const void *user_data,
    const char *result,
    const char *config
) {
    (void)user_data;
    (void)config;

    const char *match = strstr(result, "\"mime_type\":\"application/pdf\"");
    return match != NULL;
}

/* Reports this post-processor's priority value: always 50. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;

    const int32_t priority = 50;
    return priority;
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergPostProcessorVTable vtable = {0};
|
||||
vtable.process = process_fn;
|
||||
vtable.processing_stage = processing_stage_fn;
|
||||
vtable.should_process = should_process_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_post_processor(
|
||||
"pdf-only-processor",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register post-processor failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("pdf-only-processor registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
42
docs/snippets/c/plugins/plugin_extractor.md
Normal file
42
docs/snippets/c/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* The C FFI exposes vtable-based registration for OCR backends, post-processors,
|
||||
* validators, and embedding backends. There is no public C entry point for
|
||||
* registering a custom DocumentExtractor — that must be done from Rust.
|
||||
*
|
||||
* From C you can still drive extraction for any MIME type the Rust core knows
|
||||
* how to handle. The example below feeds JSON bytes through the standard
|
||||
* extraction pipeline by passing the explicit MIME type.
|
||||
*/
|
||||
|
||||
int main(void) {
|
||||
const char *json_payload = "{\"message\":\"Hello, world!\"}";
|
||||
const uint8_t *bytes = (const uint8_t *)json_payload;
|
||||
uintptr_t bytes_len = (uintptr_t)strlen(json_payload);
|
||||
|
||||
KREUZBERGExtractionResult *result = kreuzberg_extract_bytes_sync(
|
||||
bytes,
|
||||
bytes_len,
|
||||
"application/json",
|
||||
NULL
|
||||
);
|
||||
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("Extracted JSON content: %s\n", content ? content : "(empty)");
|
||||
|
||||
kreuzberg_free_string(content);
|
||||
kreuzberg_extraction_result_free(result);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
91
docs/snippets/c/plugins/plugin_logging.md
Normal file
91
docs/snippets/c/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,91 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Demonstrates structured logging from a post-processor plugin's lifecycle
|
||||
* hooks (initialize/shutdown) and from the per-result process callback. */
|
||||
|
||||
/*
 * Returns a heap-allocated copy of the NUL-terminated string `s`,
 * or NULL when malloc() fails.
 */
static char *dup_cstr(const char *s) {
    const size_t nbytes = strlen(s) + 1; /* include the terminator */
    char *copy = (char *)malloc(nbytes);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, s, nbytes);
    return copy;
}

/* Initialize hook: writes an "initialize" log line to stderr and returns 0. */
static int32_t initialize_fn(const void *user_data, char **out_error) {
    (void)out_error;
    (void)user_data;

    fprintf(stderr, "[INFO] plugin=logging-demo event=initialize\n");
    return 0;
}

/* Shutdown hook: writes a "shutdown" log line to stderr and returns 0. */
static int32_t shutdown_fn(const void *user_data, char **out_error) {
    (void)out_error;
    (void)user_data;

    fprintf(stderr, "[INFO] plugin=logging-demo event=shutdown\n");
    return 0;
}

/*
 * Process callback: logs the byte length of the serialised result to stderr
 * and adds a warning line when the result contains an empty "content" field
 * (the exact text "content":""). Always returns 0.
 */
static int32_t process_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)config;
    (void)out_error;

    const size_t bytes = strlen(result);
    fprintf(stderr,
            "[INFO] plugin=logging-demo event=process bytes=%zu\n",
            bytes);

    const int content_is_empty = strstr(result, "\"content\":\"\"") != NULL;
    if (content_is_empty) {
        fprintf(stderr,
                "[WARN] plugin=logging-demo event=empty_content\n");
    }
    return 0;
}

/*
 * Stores a heap-allocated copy of the JSON string "Late" in `*out_result`
 * and returns 0.
 */
static int32_t processing_stage_fn(const void *user_data, char **out_result) {
    (void)user_data;

    char *stage = dup_cstr("\"Late\"");
    *out_result = stage;
    return 0;
}

/* Reports this post-processor's priority value: always 50. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;

    const int32_t priority = 50;
    return priority;
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergPostProcessorVTable vtable = {0};
|
||||
vtable.initialize_fn = initialize_fn;
|
||||
vtable.shutdown_fn = shutdown_fn;
|
||||
vtable.process = process_fn;
|
||||
vtable.processing_stage = processing_stage_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_post_processor(
|
||||
"logging-demo",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "[ERROR] register post-processor failed: %s\n",
|
||||
err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("logging-demo post-processor registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
84
docs/snippets/c/plugins/plugin_testing.md
Normal file
84
docs/snippets/c/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,84 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Round-trip test: register a no-op validator, confirm it appears in the
|
||||
* registry list, then unregister and confirm it disappears. */
|
||||
|
||||
/*
 * No-op validator used by the round-trip test: accepts every result.
 * Always returns 0 and never writes to `out_error`.
 */
static int32_t validate_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)result;
    (void)config;
    (void)out_error;
    return 0;
}

/* Reports this validator's priority value: always 50. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;
    return 50;
}

/*
 * Returns 1 when `name` occurs in `json` as a complete quoted string
 * ("name"), 0 otherwise. A NULL argument or an empty name yields 0.
 *
 * Matching the surrounding quotes keeps a registered "noop-validator-2" from
 * being mistaken for "noop-validator", which a bare substring search would do.
 *
 * NOTE(review): assumes the registry listing is JSON in which plugin names
 * appear as double-quoted strings - confirm against the library output.
 */
static int contains_name(const char *json, const char *name) {
    if (!json || !name || name[0] == '\0') {
        return 0;
    }

    const size_t name_len = strlen(name);
    for (const char *hit = strstr(json, name); hit != NULL; hit = strstr(hit + 1, name)) {
        const int opens = hit > json && hit[-1] == '"';
        const int closes = hit[name_len] == '"';
        if (opens && closes) {
            return 1;
        }
    }
    return 0;
}
|
||||
|
||||
int main(void) {
|
||||
const char *plugin_name = "noop-validator";
|
||||
|
||||
KREUZBERGKreuzbergValidatorVTable vtable = {0};
|
||||
vtable.validate = validate_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
if (kreuzberg_register_validator(plugin_name, vtable, NULL, &err) != 0) {
|
||||
fprintf(stderr, "register failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *list_after_register = kreuzberg_list_validators();
|
||||
if (!contains_name(list_after_register, plugin_name)) {
|
||||
fprintf(stderr, "FAIL: validator missing after register\n");
|
||||
kreuzberg_free_string(list_after_register);
|
||||
return 1;
|
||||
}
|
||||
printf("PASS: %s present after register\n", plugin_name);
|
||||
kreuzberg_free_string(list_after_register);
|
||||
|
||||
if (kreuzberg_unregister_validator(plugin_name, &err) != 0) {
|
||||
fprintf(stderr, "unregister failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *list_after_unregister = kreuzberg_list_validators();
|
||||
if (contains_name(list_after_unregister, plugin_name)) {
|
||||
fprintf(stderr, "FAIL: validator still present after unregister\n");
|
||||
kreuzberg_free_string(list_after_unregister);
|
||||
return 1;
|
||||
}
|
||||
printf("PASS: %s absent after unregister\n", plugin_name);
|
||||
kreuzberg_free_string(list_after_unregister);
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
76
docs/snippets/c/plugins/plugin_validator.md
Normal file
76
docs/snippets/c/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,76 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Minimal Validator skeleton: implements the required `validate` function
|
||||
* and the optional `priority` and `should_validate` hooks via the C vtable.
|
||||
*/
|
||||
|
||||
/*
 * Returns a heap-allocated copy of the NUL-terminated string `s`,
 * or NULL when malloc() fails.
 */
static char *dup_cstr(const char *s) {
    const size_t nbytes = strlen(s) + 1; /* include the terminator */
    char *copy = (char *)malloc(nbytes);
    if (copy == NULL) {
        return NULL;
    }
    memcpy(copy, s, nbytes);
    return copy;
}

/*
 * Rejects any result whose serialised form contains the token "FORBIDDEN".
 *
 * Returns 0 when the token is absent. Otherwise stores a heap-allocated
 * message in `*out_error` and returns 1.
 */
static int32_t validate_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)config;

    const char *forbidden = strstr(result, "FORBIDDEN");
    if (forbidden == NULL) {
        return 0;
    }

    *out_error = dup_cstr("Content contains forbidden token");
    return 1;
}

/* Optional hook: returns 1 for every input, so validation always runs. */
static int32_t should_validate_fn(
    const void *user_data,
    const char *result,
    const char *config
) {
    (void)config;
    (void)result;
    (void)user_data;

    const int32_t always = 1;
    return always;
}

/* Reports this validator's priority value: always 50. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;

    const int32_t priority = 50;
    return priority;
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergValidatorVTable vtable = {0};
|
||||
vtable.validate = validate_fn;
|
||||
vtable.should_validate = should_validate_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_validator(
|
||||
"forbidden-token-validator",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register validator failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("forbidden-token-validator registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
70
docs/snippets/c/plugins/quality_score_validator.md
Normal file
70
docs/snippets/c/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Returns a malloc()-allocated copy of the NUL-terminated string `s`,
 * or NULL if the allocation fails.
 */
static char *dup_cstr(const char *s) {
    size_t len = strlen(s);
    char *out = malloc(len + 1);
    if (out) {
        memcpy(out, s, len + 1);
    }
    return out;
}

/*
 * Rejects results whose "quality_score" is below 0.5.
 *
 * The score is read by scanning the serialised result for the first
 * "quality_score": key; production plugins should parse the JSON properly.
 * A missing key, a non-numeric value such as null, or NaN is treated as a
 * failing score.
 *
 * Returns 0 when the score is at least 0.5. Otherwise returns 1 and, when
 * `out_error` is non-NULL, stores a malloc()-allocated message in it.
 */
static int32_t validate_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)config;

    double score = 0.0;
    const char *needle = "\"quality_score\":";
    const char *found = strstr(result, needle);
    if (found) {
        /* strtod reports "nothing parsed" through `end`, which atof cannot. */
        const char *number = found + strlen(needle);
        char *end = NULL;
        const double parsed = strtod(number, &end);
        if (end != number) {
            score = parsed;
        }
    }

    /* Written as !(>=) so that NaN, which compares false to everything,
     * is rejected rather than slipping through a `< 0.5` test. */
    if (!(score >= 0.5)) {
        char buf[128];
        snprintf(buf, sizeof(buf),
                 "Quality score too low: %.2f < 0.50", score);
        if (out_error) {
            *out_error = dup_cstr(buf);
        }
        return 1;
    }
    return 0;
}

/* Reports this validator's priority value: always 50. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;
    return 50;
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergValidatorVTable vtable = {0};
|
||||
vtable.validate = validate_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_validator(
|
||||
"quality-score-validator",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register validator failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("quality-score-validator registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
101
docs/snippets/c/plugins/stateful_plugin.md
Normal file
101
docs/snippets/c/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,101 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdatomic.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Shared state lives in `user_data` and is forwarded to every vtable callback.
|
||||
* Use atomics or a mutex if more than one thread can call into the plugin. */
|
||||
|
||||
/* Plugin state: counts how many times process_fn has been called. */
typedef struct {
    atomic_size_t call_count;
} StatefulState;

/*
 * Returns a malloc()-allocated copy of the NUL-terminated string `s`,
 * or NULL if the allocation fails.
 */
static char *dup_cstr(const char *s) {
    size_t len = strlen(s);
    char *out = malloc(len + 1);
    if (out) {
        memcpy(out, s, len + 1);
    }
    return out;
}

/*
 * Initialize hook: resets the call counter to 0 and returns 0.
 *
 * The callback receives `const void *`, but the counter has to be written,
 * so the const is cast away; the state object itself is allocated non-const.
 */
static int32_t initialize_fn(const void *user_data, char **out_error) {
    (void)out_error;
    StatefulState *state = (StatefulState *)user_data;
    atomic_store(&state->call_count, 0);
    return 0;
}

/*
 * Shutdown hook: prints the number of process_fn calls to stderr and
 * returns 0.
 *
 * The state is accessed through a non-const pointer because C11 declares
 * atomic_load() as taking `volatile A *`; loading through a pointer to const
 * was only permitted later (DR 459, C17).
 */
static int32_t shutdown_fn(const void *user_data, char **out_error) {
    (void)out_error;
    StatefulState *state = (StatefulState *)user_data;
    size_t count = atomic_load(&state->call_count);
    fprintf(stderr, "stateful-plugin: shutdown after %zu calls\n", count);
    return 0;
}

/* Process callback: atomically increments the call counter and returns 0. */
static int32_t process_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)result;
    (void)config;
    (void)out_error;
    StatefulState *state = (StatefulState *)user_data;
    atomic_fetch_add(&state->call_count, 1);
    return 0;
}

/*
 * Stores a malloc()-allocated copy of the JSON string "Middle" in
 * `*out_result` and returns 0.
 */
static int32_t processing_stage_fn(const void *user_data, char **out_result) {
    (void)user_data;
    *out_result = dup_cstr("\"Middle\"");
    return 0;
}

/* Reports this post-processor's priority value: always 50. */
static int32_t priority_fn(const void *user_data) {
    (void)user_data;
    return 50;
}

/* Releases the StatefulState allocated with malloc() in main(). */
static void free_user_data(void *user_data) {
    free(user_data);
}
|
||||
|
||||
int main(void) {
|
||||
StatefulState *state = (StatefulState *)malloc(sizeof(StatefulState));
|
||||
if (!state) {
|
||||
return 1;
|
||||
}
|
||||
atomic_init(&state->call_count, 0);
|
||||
|
||||
KREUZBERGKreuzbergPostProcessorVTable vtable = {0};
|
||||
vtable.initialize_fn = initialize_fn;
|
||||
vtable.shutdown_fn = shutdown_fn;
|
||||
vtable.process = process_fn;
|
||||
vtable.processing_stage = processing_stage_fn;
|
||||
vtable.priority = priority_fn;
|
||||
vtable.free_user_data = free_user_data;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_post_processor(
|
||||
"stateful-plugin",
|
||||
vtable,
|
||||
state,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register post-processor failed: %s\n",
|
||||
err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
free(state);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("stateful-plugin registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
31
docs/snippets/c/plugins/unregister_plugins.md
Normal file
31
docs/snippets/c/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* Call `unregister_fn(name, &err)` and log a failure to stderr.
 *
 * `kind` is a human-readable plugin category used only in the message.
 * Returns 0 when the unregister call returned 0, otherwise 1. The error
 * string, if any, is released with kreuzberg_free_string(). */
static int unregister_or_log(
    int32_t (*unregister_fn)(const char *, char **),
    const char *kind,
    const char *name
) {
    char *detail = NULL;

    if (unregister_fn(name, &detail) == 0) {
        return 0;
    }

    const char *reason = detail ? detail : "(no detail)";
    fprintf(stderr, "unregister %s '%s' failed: %s\n", kind, name, reason);
    kreuzberg_free_string(detail);
    return 1;
}
|
||||
|
||||
int main(void) {
|
||||
int failures = 0;
|
||||
failures += unregister_or_log(kreuzberg_unregister_post_processor, "post-processor", "word-count");
|
||||
failures += unregister_or_log(kreuzberg_unregister_validator, "validator", "min-length-validator");
|
||||
failures += unregister_or_log(kreuzberg_unregister_ocr_backend, "OCR backend", "my-ocr");
|
||||
failures += unregister_or_log(kreuzberg_unregister_embedding_backend, "embedding backend", "my-embedder");
|
||||
return failures == 0 ? 0 : 1;
|
||||
}
|
||||
```
|
||||
92
docs/snippets/c/plugins/word_count_processor.md
Normal file
92
docs/snippets/c/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,92 @@
|
||||
```c title="C"
|
||||
#include <kreuzberg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Return a malloc()-allocated copy of the NUL-terminated string `s`.
 *
 * Returns NULL when `s` is NULL or when the allocation fails, so callers
 * must check the result before using it. */
static char *dup_cstr(const char *s) {
    if (s == NULL) {
        return NULL; /* strlen(NULL) would be undefined behaviour */
    }
    size_t size = strlen(s) + 1; /* include the terminator in the copy */
    char *out = malloc(size);
    if (out != NULL) {
        memcpy(out, s, size);
    }
    return out;
}
|
||||
|
||||
/* Vtable `process` hook: prints an approximate token count for `result`.
 *
 * Tokens are maximal runs of bytes other than space, tab, newline and
 * carriage return, counted over the whole serialised string (JSON
 * punctuation included), so the figure is only a rough estimate.
 * A NULL `result` is treated as empty input. `user_data`, `config` and
 * `out_error` are unused. Always returns 0. */
static int32_t process_fn(
    const void *user_data,
    const char *result,
    const char *config,
    char **out_error
) {
    (void)user_data;
    (void)config;
    (void)out_error;

    /* The `result` JSON string is read-only at this layer; for a real
     * mutating post-processor, decode the JSON, mutate, and serialise back
     * via the kreuzberg ExtractionResult helpers in your host language. */
    size_t words = 0;
    int in_word = 0;
    /* Guard against a NULL result instead of dereferencing it. */
    for (const char *p = result ? result : ""; *p; ++p) {
        if (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') {
            in_word = 0;
        } else if (!in_word) {
            /* First non-whitespace byte after a gap starts a new token. */
            in_word = 1;
            words += 1;
        }
    }
    printf("word-count: ~%zu tokens in serialised result\n", words);
    return 0;
}
|
||||
|
||||
/* Vtable `processing_stage` hook: reports the stage as the JSON string
 * "Early" (quotes included), written to `*out_result` as a heap copy.
 *
 * Returns 0 when the string was produced. Returns non-zero when
 * `out_result` is NULL or the copy could not be allocated, instead of
 * reporting success with a NULL result.
 * NOTE(review): confirm the exact failure code the library expects. */
static int32_t processing_stage_fn(
    const void *user_data,
    char **out_result
) {
    (void)user_data;
    if (out_result == NULL) {
        return 1;
    }
    /* ProcessingStage is JSON-serialised; "Early" maps to ProcessingStage::Early. */
    *out_result = dup_cstr("\"Early\"");
    return *out_result != NULL ? 0 : 1;
}
|
||||
|
||||
/* Vtable `should_process` hook: returns 1 to process, 0 to skip.
 *
 * The result is skipped when it is NULL or when the serialised JSON
 * contains the exact substring `"content":""`. The match is textual, so a
 * serialisation with whitespace after the colon would not be detected.
 * `user_data` and `config` are unused. */
static int32_t should_process_fn(
    const void *user_data,
    const char *result,
    const char *config
) {
    (void)user_data;
    (void)config;
    if (result == NULL) {
        return 0; /* nothing to inspect; strstr(NULL, ...) would be UB */
    }
    /* Skip empty content. */
    return strstr(result, "\"content\":\"\"") == NULL;
}
|
||||
|
||||
/* Vtable `priority` hook: reports a fixed priority of 50.
 * `user_data` is not consulted. */
static int32_t priority_fn(const void *user_data) {
    const int32_t fixed_priority = 50;
    (void)user_data;
    return fixed_priority;
}
|
||||
|
||||
int main(void) {
|
||||
KREUZBERGKreuzbergPostProcessorVTable vtable = {0};
|
||||
vtable.process = process_fn;
|
||||
vtable.processing_stage = processing_stage_fn;
|
||||
vtable.should_process = should_process_fn;
|
||||
vtable.priority = priority_fn;
|
||||
|
||||
char *err = NULL;
|
||||
int32_t rc = kreuzberg_register_post_processor(
|
||||
"word-count",
|
||||
vtable,
|
||||
NULL,
|
||||
&err
|
||||
);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "register post-processor failed: %s\n", err ? err : "(no detail)");
|
||||
kreuzberg_free_string(err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("word-count post-processor registered\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/utils/chunking.md
Normal file
41
docs/snippets/c/utils/chunking.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"max_characters\": 1500,"
|
||||
"\"overlap\": 200"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON): %s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
49
docs/snippets/c/utils/chunking_rag.md
Normal file
49
docs/snippets/c/utils/chunking_rag.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 500,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("research_paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Each chunk JSON entry contains content, embedding, and metadata
|
||||
(chunk_index, total_chunks, byte_start, byte_end). Pipe this directly
|
||||
into a vector database client. */
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON):\n%s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
48
docs/snippets/c/utils/embedding_with_chunking.md
Normal file
48
docs/snippets/c/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 1024,"
|
||||
"\"overlap\": 100,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true,"
|
||||
"\"batch_size\": 32,"
|
||||
"\"show_download_progress\": false"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks with embeddings (JSON):\n%s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
46
docs/snippets/c/utils/keyword_extraction_example.md
Normal file
46
docs/snippets/c/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"keywords\": {"
|
||||
"\"algorithm\": \"yake\","
|
||||
"\"max_keywords\": 10,"
|
||||
"\"min_score\": 0.3"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("research_paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *keywords_json = kreuzberg_extraction_result_extracted_keywords(result);
|
||||
if (keywords_json) {
|
||||
printf("Keywords: %s\n", keywords_json);
|
||||
kreuzberg_free_string(keywords_json);
|
||||
} else {
|
||||
printf("Keywords: (none)\n");
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
45
docs/snippets/c/utils/quality_processing_example.md
Normal file
45
docs/snippets/c/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"enable_quality_processing\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double score = kreuzberg_extraction_result_quality_score(result);
|
||||
if (score < 0.5) {
|
||||
printf("Warning: low quality extraction (%.2f)\n", score);
|
||||
} else {
|
||||
printf("Quality score: %.2f\n", score);
|
||||
}
|
||||
|
||||
char *warnings_json = kreuzberg_extraction_result_processing_warnings(result);
|
||||
printf("processing warnings (JSON): %s\n", warnings_json ? warnings_json : "[]");
|
||||
kreuzberg_free_string(warnings_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
39
docs/snippets/c/utils/standalone_embed.md
Normal file
39
docs/snippets/c/utils/standalone_embed.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGEmbeddingConfig *config = kreuzberg_embedding_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Embed input is a JSON-encoded array of strings. */
|
||||
const char *texts_json = "[\"Hello, world!\", \"Kreuzberg is fast\"]";
|
||||
|
||||
char *embeddings_json = kreuzberg_embed_texts(texts_json, config);
|
||||
if (!embeddings_json) {
|
||||
fprintf(stderr, "embedding failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_embedding_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("embeddings (JSON, 2D float array):\n%s\n", embeddings_json);
|
||||
kreuzberg_free_string(embeddings_json);
|
||||
|
||||
kreuzberg_embedding_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/utils/token_reduction.md
Normal file
41
docs/snippets/c/utils/token_reduction.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("reduced content:\n%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/utils/token_reduction_example.md
Normal file
44
docs/snippets/c/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("verbose_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
if (content) {
|
||||
printf("reduced content (%zu bytes):\n%s\n", strlen(content), content);
|
||||
kreuzberg_free_string(content);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
55
docs/snippets/c/utils/vector_database_integration.md
Normal file
55
docs/snippets/c/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *document_path = "document.pdf";
|
||||
const char *document_id = "doc-001";
|
||||
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 512,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true,"
|
||||
"\"batch_size\": 32"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync(document_path, NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The chunks JSON array carries content + embedding + metadata for each
|
||||
chunk. Pass this directly to your vector database client (pgvector,
|
||||
Qdrant, Pinecone, etc.) along with the document_id as a metadata field. */
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("document_id: %s\n", document_id);
|
||||
printf("chunks (JSON, ready to upsert into a vector DB):\n%s\n",
|
||||
chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user