This commit is contained in:
44
docs/snippets/dart/advanced/chunk_page_mapping.md
Normal file
44
docs/snippets/dart/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with chunking and page extraction configured, then
/// prints a short preview of every chunk together with the page range recorded
/// in its metadata.
///
/// Chunks whose metadata has no `firstPage` or no `lastPage` are skipped.
Future<void> main() async {
  // Longest preview (in UTF-16 code units) printed for a single chunk.
  const previewLength = 50;

  final config = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: const ChunkingConfig(
      maxCharacters: 500,
      overlap: 50,
      trim: true,
      chunkerType: ChunkerType.text,
      sizing: ChunkSizing.characters(),
      prependHeadingContext: false,
    ),
    pages: const PageConfig(
      extractPages: true,
      insertPageMarkers: false,
      markerFormat: '',
    ),
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result =
      await KreuzbergBridge.extractFile('document.pdf', null, config);

  // `chunks` is nullable; a missing list is treated as "no chunks".
  final chunks = result.chunks ?? const [];
  for (final chunk in chunks) {
    final first = chunk.metadata.firstPage;
    final last = chunk.metadata.lastPage;
    if (first == null || last == null) {
      continue;
    }

    // Only mark the preview with an ellipsis when text was actually cut off;
    // short chunks are printed verbatim.
    final content = chunk.content;
    final preview = content.length > previewLength
        ? '${content.substring(0, previewLength)}...'
        : content;
    final pageRange = first == last ? 'Page $first' : 'Pages $first-$last';
    print('Chunk: $preview ($pageRange)');
  }
}
|
||||
```
|
||||
29
docs/snippets/dart/advanced/chunking_config.md
Normal file
29
docs/snippets/dart/advanced/chunking_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` using a markdown chunker configuration and prints
/// the number of chunks in the result.
Future<void> main() async {
  const chunkingOptions = ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    trim: true,
    chunkerType: ChunkerType.markdown,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: chunkingOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // `chunks` is nullable; fall back to an empty list before counting.
  final chunkCount = (extraction.chunks ?? const []).length;
  print('Chunks: $chunkCount');
}
|
||||
```
|
||||
50
docs/snippets/dart/advanced/chunking_rag.md
Normal file
50
docs/snippets/dart/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `research_paper.pdf` with chunking and an embedding configuration,
/// then prints each chunk's position, a content preview and, when an embedding
/// is present, its length.
Future<void> main() async {
  // Longest preview (in UTF-16 code units) printed for a single chunk.
  const previewLength = 100;

  final config = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: const ChunkingConfig(
      maxCharacters: 500,
      overlap: 50,
      trim: true,
      chunkerType: ChunkerType.text,
      sizing: ChunkSizing.characters(),
      prependHeadingContext: false,
      embedding: EmbeddingConfig(
        model: EmbeddingModelType.preset(name: 'balanced'),
        normalize: true,
        batchSize: 32,
        showDownloadProgress: false,
      ),
    ),
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result =
      await KreuzbergBridge.extractFile('research_paper.pdf', null, config);

  // `chunks` is nullable; a missing list is treated as "no chunks".
  final chunks = result.chunks ?? const [];
  for (final chunk in chunks) {
    final metadata = chunk.metadata;
    final index = metadata.chunkIndex;
    final total = metadata.totalChunks;
    final start = metadata.byteStart;
    final end = metadata.byteEnd;

    // Only mark the preview with an ellipsis when text was actually cut off;
    // short chunks are printed verbatim.
    final content = chunk.content;
    final preview = content.length > previewLength
        ? '${content.substring(0, previewLength)}...'
        : content;

    // `chunkIndex` is printed one-based for readability.
    print('Chunk ${index + 1}/$total');
    print('Position: $start-$end');
    print('Content: $preview');

    final embedding = chunk.embedding;
    if (embedding != null) {
      print('Embedding: ${embedding.length} dimensions');
    }
  }
}
|
||||
```
|
||||
35
docs/snippets/dart/advanced/embedding_with_chunking.md
Normal file
35
docs/snippets/dart/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with a chunking configuration that carries an
/// embedding configuration, and prints the number of chunks in the result.
Future<void> main() async {
  const embeddingOptions = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
  );

  const chunkingOptions = ChunkingConfig(
    maxCharacters: 1024,
    overlap: 100,
    trim: true,
    chunkerType: ChunkerType.text,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
    embedding: embeddingOptions,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: chunkingOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // `chunks` is nullable; fall back to an empty list before counting.
  final chunkCount = (extraction.chunks ?? const []).length;
  print('Chunks with embeddings: $chunkCount');
}
|
||||
```
|
||||
29
docs/snippets/dart/advanced/keyword_extraction_config.md
Normal file
29
docs/snippets/dart/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```dart title="Dart"
|
||||
import 'package:flutter_rust_bridge/flutter_rust_bridge.dart' show Int64List;
|
||||
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with a YAKE keyword configuration and prints the
/// `extractedKeywords` field of the result.
Future<void> main() async {
  // Not `const`: the n-gram range is built at runtime via `Int64List.fromList`.
  final keywordOptions = KeywordConfig(
    algorithm: KeywordAlgorithm.yake,
    maxKeywords: 10,
    minScore: 0.3,
    ngramRange: Int64List.fromList(<int>[1, 3]),
    language: 'en',
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    keywords: keywordOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    maxArchiveDepth: 3,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  final keywords = extraction.extractedKeywords;
  print('Keywords: $keywords');
}
|
||||
```
|
||||
33
docs/snippets/dart/advanced/keyword_extraction_example.md
Normal file
33
docs/snippets/dart/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```dart title="Dart"
|
||||
import 'package:flutter_rust_bridge/flutter_rust_bridge.dart' show Int64List;
|
||||
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `research_paper.pdf` with a YAKE keyword configuration and prints
/// every extracted keyword with its score, one per line.
///
/// Prints nothing when the result carries no keyword list.
Future<void> main() async {
  // Not `const`: the n-gram range is built at runtime via `Int64List.fromList`.
  final keywordOptions = KeywordConfig(
    algorithm: KeywordAlgorithm.yake,
    maxKeywords: 10,
    minScore: 0.3,
    ngramRange: Int64List.fromList(<int>[1, 3]),
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    keywords: keywordOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    maxArchiveDepth: 3,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'research_paper.pdf',
    null,
    extractionConfig,
  );

  final found = extraction.extractedKeywords;
  if (found == null) {
    return;
  }

  for (final entry in found) {
    print('${entry.text} (score: ${entry.score})');
  }
}
|
||||
```
|
||||
25
docs/snippets/dart/advanced/language_detection_config.md
Normal file
25
docs/snippets/dart/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with language detection configured for a single
/// language (`detectMultiple: false`) and prints the `detectedLanguages` field
/// of the result.
Future<void> main() async {
  const detectionOptions = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    languageDetection: detectionOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  final languages = extraction.detectedLanguages;
  print('Detected languages: $languages');
}
|
||||
```
|
||||
@@ -0,0 +1,25 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `multilingual_document.pdf` with language detection configured
/// with `detectMultiple: true` and prints the `detectedLanguages` field of the
/// result.
Future<void> main() async {
  const detectionOptions = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    languageDetection: detectionOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'multilingual_document.pdf',
    null,
    extractionConfig,
  );

  final languages = extraction.detectedLanguages;
  print('Detected languages: $languages');
}
|
||||
```
|
||||
20
docs/snippets/dart/advanced/quality_processing_config.md
Normal file
20
docs/snippets/dart/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with `enableQualityProcessing` switched on and
/// prints the `qualityScore` field of the result.
Future<void> main() async {
  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // Printed as-is, so a missing score shows up as `null`.
  final qualityScore = extraction.qualityScore;
  print('Quality score: $qualityScore');
}
|
||||
```
|
||||
30
docs/snippets/dart/advanced/quality_processing_example.md
Normal file
30
docs/snippets/dart/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `scanned_document.pdf` with `enableQualityProcessing` switched on,
/// reports the quality score (flagging scores below 0.5 as low quality) and
/// then prints every entry of `processingWarnings`.
Future<void> main() async {
  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'scanned_document.pdf',
    null,
    extractionConfig,
  );

  // Nothing is reported about quality when the result has no score.
  final quality = extraction.qualityScore;
  if (quality != null) {
    final formatted = quality.toStringAsFixed(2);
    final report = quality < 0.5
        ? 'Warning: Low quality extraction ($formatted)'
        : 'Quality score: $formatted';
    print(report);
  }

  for (final message in extraction.processingWarnings) {
    print('Warning: $message');
  }
}
|
||||
```
|
||||
24
docs/snippets/dart/advanced/token_reduction_config.md
Normal file
24
docs/snippets/dart/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with a `'moderate'` token-reduction configuration
/// and prints the length of the resulting content.
Future<void> main() async {
  const reductionOptions = TokenReductionOptions(
    mode: 'moderate',
    preserveImportantWords: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    tokenReduction: reductionOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  final reducedLength = extraction.content.length;
  print('Reduced content length: $reducedLength');
}
|
||||
```
|
||||
24
docs/snippets/dart/advanced/token_reduction_example.md
Normal file
24
docs/snippets/dart/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `verbose_document.pdf` with a `'moderate'` token-reduction
/// configuration and prints the length of the resulting content.
Future<void> main() async {
  const reductionOptions = TokenReductionOptions(
    mode: 'moderate',
    preserveImportantWords: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    tokenReduction: reductionOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'verbose_document.pdf',
    null,
    extractionConfig,
  );

  final reducedLength = extraction.content.length;
  print('Content length after reduction: $reducedLength');
}
|
||||
```
|
||||
77
docs/snippets/dart/advanced/vector_database_integration.md
Normal file
77
docs/snippets/dart/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,77 @@
|
||||
```dart title="Dart"
|
||||
import 'dart:typed_data';
|
||||
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// A single entry destined for a vector store: one chunk of text, its
/// embedding and string-valued metadata describing where it came from.
class VectorRecord {
  /// Identifier of this record.
  final String id;

  /// The chunk text this record represents.
  final String content;

  /// The embedding vector associated with [content].
  final Float64List embedding;

  /// Free-form string key/value pairs attached to the record.
  final Map<String, String> metadata;

  /// Creates a record; every field is required.
  VectorRecord({
    required this.id,
    required this.content,
    required this.embedding,
    required this.metadata,
  });
}
|
||||
|
||||
/// Extracts the file at [documentPath] with chunking and an embedding
/// configuration, and converts every chunk that has an embedding into a
/// [VectorRecord].
///
/// Record ids have the form `<documentId>_chunk_<index>`, where the index is
/// the chunk's position in the result's chunk list. Chunks without an
/// embedding are left out, so ids may have gaps.
Future<List<VectorRecord>> extractAndVectorize(
  String documentPath,
  String documentId,
) async {
  const embeddingOptions = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
  );

  const chunkingOptions = ChunkingConfig(
    maxCharacters: 512,
    overlap: 50,
    trim: true,
    chunkerType: ChunkerType.text,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
    embedding: embeddingOptions,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: chunkingOptions,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile(
    documentPath,
    null,
    extractionConfig,
  );

  // `chunks` is nullable; a missing list yields no records.
  final allChunks = extraction.chunks ?? const [];
  final output = <VectorRecord>[];

  for (var position = 0; position < allChunks.length; position++) {
    final current = allChunks[position];
    final vector = current.embedding;

    // Only chunks that actually carry an embedding become records.
    if (vector != null) {
      final text = current.content;
      output.add(
        VectorRecord(
          id: '${documentId}_chunk_$position',
          content: text,
          embedding: vector,
          metadata: <String, String>{
            'document_id': documentId,
            'chunk_index': '$position',
            'content_length': '${text.length}',
          },
        ),
      );
    }
  }

  return output;
}
|
||||
|
||||
/// Vectorizes `document.pdf` under the document id `doc-001` and prints how
/// many vector records were produced.
Future<void> main() async {
  final vectorRecords = await extractAndVectorize('document.pdf', 'doc-001');
  final recordCount = vectorRecords.length;
  print('Vector records: $recordCount');
}
|
||||
```
|
||||
Reference in New Issue
Block a user