This commit is contained in:
41
docs/snippets/c/utils/chunking.md
Normal file
41
docs/snippets/c/utils/chunking.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"max_characters\": 1500,"
|
||||
"\"overlap\": 200"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON): %s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
49
docs/snippets/c/utils/chunking_rag.md
Normal file
49
docs/snippets/c/utils/chunking_rag.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 500,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("research_paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Each chunk JSON entry contains content, embedding, and metadata
|
||||
(chunk_index, total_chunks, byte_start, byte_end). Pipe this directly
|
||||
into a vector database client. */
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks (JSON):\n%s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
48
docs/snippets/c/utils/embedding_with_chunking.md
Normal file
48
docs/snippets/c/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 1024,"
|
||||
"\"overlap\": 100,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true,"
|
||||
"\"batch_size\": 32,"
|
||||
"\"show_download_progress\": false"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("chunks with embeddings (JSON):\n%s\n", chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
46
docs/snippets/c/utils/keyword_extraction_example.md
Normal file
46
docs/snippets/c/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"keywords\": {"
|
||||
"\"algorithm\": \"yake\","
|
||||
"\"max_keywords\": 10,"
|
||||
"\"min_score\": 0.3"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("research_paper.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *keywords_json = kreuzberg_extraction_result_extracted_keywords(result);
|
||||
if (keywords_json) {
|
||||
printf("Keywords: %s\n", keywords_json);
|
||||
kreuzberg_free_string(keywords_json);
|
||||
} else {
|
||||
printf("Keywords: (none)\n");
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
45
docs/snippets/c/utils/quality_processing_example.md
Normal file
45
docs/snippets/c/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"enable_quality_processing\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("scanned_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double score = kreuzberg_extraction_result_quality_score(result);
|
||||
if (score < 0.5) {
|
||||
printf("Warning: low quality extraction (%.2f)\n", score);
|
||||
} else {
|
||||
printf("Quality score: %.2f\n", score);
|
||||
}
|
||||
|
||||
char *warnings_json = kreuzberg_extraction_result_processing_warnings(result);
|
||||
printf("processing warnings (JSON): %s\n", warnings_json ? warnings_json : "[]");
|
||||
kreuzberg_free_string(warnings_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
39
docs/snippets/c/utils/standalone_embed.md
Normal file
39
docs/snippets/c/utils/standalone_embed.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true"
|
||||
"}";
|
||||
|
||||
KREUZBERGEmbeddingConfig *config = kreuzberg_embedding_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Embed input is a JSON-encoded array of strings. */
|
||||
const char *texts_json = "[\"Hello, world!\", \"Kreuzberg is fast\"]";
|
||||
|
||||
char *embeddings_json = kreuzberg_embed_texts(texts_json, config);
|
||||
if (!embeddings_json) {
|
||||
fprintf(stderr, "embedding failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_embedding_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("embeddings (JSON, 2D float array):\n%s\n", embeddings_json);
|
||||
kreuzberg_free_string(embeddings_json);
|
||||
|
||||
kreuzberg_embedding_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
41
docs/snippets/c/utils/token_reduction.md
Normal file
41
docs/snippets/c/utils/token_reduction.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
printf("reduced content:\n%s\n", content ? content : "(empty)");
|
||||
kreuzberg_free_string(content);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
44
docs/snippets/c/utils/token_reduction_example.md
Normal file
44
docs/snippets/c/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
int main(void) {
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"token_reduction\": {"
|
||||
"\"mode\": \"moderate\","
|
||||
"\"preserve_important_words\": true"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync("verbose_document.pdf", NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *content = kreuzberg_extraction_result_content(result);
|
||||
if (content) {
|
||||
printf("reduced content (%zu bytes):\n%s\n", strlen(content), content);
|
||||
kreuzberg_free_string(content);
|
||||
}
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
55
docs/snippets/c/utils/vector_database_integration.md
Normal file
55
docs/snippets/c/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```c title="C"
|
||||
#include "kreuzberg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(void) {
|
||||
const char *document_path = "document.pdf";
|
||||
const char *document_id = "doc-001";
|
||||
|
||||
const char *config_json =
|
||||
"{"
|
||||
"\"chunking\": {"
|
||||
"\"chunker_type\": \"character\","
|
||||
"\"max_characters\": 512,"
|
||||
"\"overlap\": 50,"
|
||||
"\"embedding\": {"
|
||||
"\"model\": {\"preset\": {\"name\": \"balanced\"}},"
|
||||
"\"normalize\": true,"
|
||||
"\"batch_size\": 32"
|
||||
"}"
|
||||
"}"
|
||||
"}";
|
||||
|
||||
KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_from_json(config_json);
|
||||
if (!config) {
|
||||
fprintf(stderr, "config parse failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
return 1;
|
||||
}
|
||||
|
||||
KREUZBERGExtractionResult *result =
|
||||
kreuzberg_extract_file_sync(document_path, NULL, config);
|
||||
if (!result) {
|
||||
fprintf(stderr, "extraction failed (code %d): %s\n",
|
||||
kreuzberg_last_error_code(),
|
||||
kreuzberg_last_error_context());
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The chunks JSON array carries content + embedding + metadata for each
|
||||
chunk. Pass this directly to your vector database client (pgvector,
|
||||
Qdrant, Pinecone, etc.) along with the document_id as a metadata field. */
|
||||
char *chunks_json = kreuzberg_extraction_result_chunks(result);
|
||||
printf("document_id: %s\n", document_id);
|
||||
printf("chunks (JSON, ready to upsert into a vector DB):\n%s\n",
|
||||
chunks_json ? chunks_json : "[]");
|
||||
kreuzberg_free_string(chunks_json);
|
||||
|
||||
kreuzberg_extraction_result_free(result);
|
||||
kreuzberg_extraction_config_free(config);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user