This commit is contained in:
32
docs/snippets/dart/metadata/language_detection.md
Normal file
32
docs/snippets/dart/metadata/language_detection.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with language detection enabled and prints the
/// first detected language, or a fallback message when none is reported.
Future<void> main() async {
  // Detection settings handed to the extraction config below.
  final detectionSettings = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.5,
    detectMultiple: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    languageDetection: detectionSettings,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // Guard clause: bail out first when nothing was detected.
  final detected = extraction.detectedLanguages;
  if (detected == null || detected.isEmpty) {
    print('No language detected');
    return;
  }

  print('Primary language: ${detected.first}');
}
|
||||
```
|
||||
@@ -0,0 +1,36 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `multilingual.pdf` with multi-language detection enabled and
/// prints every detected language, or a fallback message when none is
/// reported.
Future<void> main() async {
  // Detection settings handed to the extraction config below.
  final detectionSettings = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.3,
    detectMultiple: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    languageDetection: detectionSettings,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'multilingual.pdf',
    null,
    extractionConfig,
  );

  final detected = extraction.detectedLanguages;
  if (detected != null && detected.isNotEmpty) {
    // Header line with the count, then one bullet per language.
    print('Detected ${detected.length} language(s):');
    for (final language in detected) {
      print('  - $language');
    }
  } else {
    print('No languages detected');
  }
}
|
||||
```
|
||||
38
docs/snippets/dart/metadata/metadata.md
Normal file
38
docs/snippets/dart/metadata/metadata.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` without an explicit extraction config and prints
/// each metadata field that is non-null, followed by every entry of
/// `metadata.additional`.
Future<void> main() async {
  final result = await KreuzbergBridge.extractFile('document.pdf', null);

  final metadata = result.metadata;

  if (metadata.title != null) {
    print('Title: ${metadata.title}');
  }
  if (metadata.subject != null) {
    print('Subject: ${metadata.subject}');
  }

  // Copy the nullable list properties into locals so the null check promotes
  // them to non-nullable; this removes the need for the `!` assertion.
  final authors = metadata.authors;
  if (authors != null) {
    print('Authors: ${authors.join(', ')}');
  }
  final keywords = metadata.keywords;
  if (keywords != null) {
    print('Keywords: ${keywords.join(', ')}');
  }

  if (metadata.language != null) {
    print('Language: ${metadata.language}');
  }
  if (metadata.createdAt != null) {
    print('Created: ${metadata.createdAt}');
  }
  if (metadata.modifiedAt != null) {
    print('Modified: ${metadata.modifiedAt}');
  }
  if (metadata.extractionDurationMs != null) {
    print('Extraction took: ${metadata.extractionDurationMs} ms');
  }

  for (final entry in metadata.additional.entries) {
    print('Additional[${entry.key}]: ${entry.value}');
  }
}
|
||||
```
|
||||
31
docs/snippets/dart/metadata/page_boundaries.md
Normal file
31
docs/snippets/dart/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```dart title="Dart"
|
||||
import 'dart:convert';

import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` and prints the byte range and a short text preview
/// for up to the first three page boundaries.
///
/// Prints a message and returns early when no page structure or no page
/// boundaries are available.
Future<void> main() async {
  final result = await KreuzbergBridge.extractFile('document.pdf', null);

  final pages = result.metadata.pages;
  if (pages == null) {
    print('No page structure available');
    return;
  }

  final boundaries = pages.boundaries;
  if (boundaries == null || boundaries.isEmpty) {
    print('No page boundaries available');
    return;
  }

  // The boundaries are byte offsets, whereas `String.substring` indexes
  // UTF-16 code units. The two only coincide for pure-ASCII text, so slice
  // the UTF-8 encoding of the content and decode each page slice instead.
  final contentBytes = utf8.encode(result.content);
  for (final boundary in boundaries.take(3)) {
    final start = boundary.byteStart.toInt();
    final end = boundary.byteEnd.toInt();
    final pageText = utf8.decode(contentBytes.sublist(start, end));
    // Cap the preview at 100 UTF-16 code units.
    final previewEnd = pageText.length < 100 ? pageText.length : 100;

    print('Page ${boundary.pageNumber}:');
    print('  Byte range: $start-$end');
    print('  Preview: ${pageText.substring(0, previewEnd)}...');
  }
}
|
||||
```
|
||||
38
docs/snippets/dart/metadata/page_tracking_basic.md
Normal file
38
docs/snippets/dart/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with per-page extraction enabled and prints the
/// content length, table count, and image count of every returned page.
Future<void> main() async {
  // Page settings handed to the extraction config below.
  final pageSettings = PageConfig(
    extractPages: true,
    insertPageMarkers: false,
    markerFormat: '<!-- page {page} -->',
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    pages: pageSettings,
  );

  final extraction = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  final pages = extraction.pages;
  if (pages != null) {
    for (final page in pages) {
      print('Page ${page.pageNumber}:');
      print('  Content: ${page.content.length} chars');
      print('  Tables: ${page.tables.length}');
      print('  Images: ${page.images.length}');
    }
  } else {
    print('No per-page content available');
  }
}
|
||||
```
|
||||
20
docs/snippets/dart/metadata/tables.md
Normal file
20
docs/snippets/dart/metadata/tables.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` and, for every table in the result, prints a
/// summary line, its markdown rendering, each row of cells, and the bounding
/// box when one is present.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);

  for (final table in extraction.tables) {
    print('Table on page ${table.pageNumber} with ${table.cells.length} rows');
    print(table.markdown);

    // `print` is passed as a tear-off, so each row is printed in order.
    table.cells.forEach(print);

    // The bounding box is optional; move on when it is absent.
    if (table.boundingBox == null) continue;
    print('Bounding box: ${table.boundingBox}');
  }
}
|
||||
```
|
||||
92
docs/snippets/dart/metadata/vector_database_integration.md
Normal file
92
docs/snippets/dart/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,92 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// An immutable record pairing a chunk of text with its embedding and
/// free-form metadata, ready to be handed to a vector store.
class VectorRecord {
  /// Creates a record; every field is required.
  const VectorRecord({
    required this.id,
    required this.embedding,
    required this.content,
    required this.metadata,
  });

  /// Identifier of this record.
  final String id;

  /// Embedding values for [content]; may be empty.
  final List<double> embedding;

  /// The text this record represents.
  final String content;

  /// Arbitrary key/value data stored alongside the record.
  final Map<String, Object?> metadata;
}
|
||||
|
||||
/// Placeholder sink for a vector database: prints one summary line per
/// record that has a non-empty embedding and skips the rest.
void storeInVectorDatabase(List<VectorRecord> records) {
  // Lazily filter out records that carry no embedding values.
  final storable = records.where((record) => record.embedding.isNotEmpty);

  for (final record in storable) {
    print(
      'Storing ${record.id}: ${record.content.length} chars, '
      '${record.embedding.length} dims',
    );
  }
}
|
||||
|
||||
/// Extracts [documentPath] with chunking and embedding configured, converts
/// each returned chunk into a [VectorRecord], passes the records to
/// [storeInVectorDatabase], and returns them.
///
/// Record ids are built from [documentId] and the chunk index. A chunk
/// without an embedding yields a record with an empty embedding list.
Future<List<VectorRecord>> extractAndVectorize(
  String documentPath,
  String documentId,
) async {
  final embeddingSettings = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
  );

  final chunkingSettings = ChunkingConfig(
    maxCharacters: 512,
    overlap: 50,
    trim: true,
    chunkerType: ChunkerType.text,
    embedding: embeddingSettings,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    chunking: chunkingSettings,
  );

  final extraction = await KreuzbergBridge.extractFile(
    documentPath,
    null,
    extractionConfig,
  );

  // Treat a missing chunk list as an empty one.
  final chunks = extraction.chunks ?? const <Chunk>[];

  // Build all records in a single list literal, one per chunk, in order.
  final records = <VectorRecord>[
    for (var index = 0; index < chunks.length; index++)
      VectorRecord(
        id: '${documentId}_chunk_$index',
        content: chunks[index].content,
        embedding: chunks[index].embedding?.toList() ?? const <double>[],
        metadata: {
          'document_id': documentId,
          'chunk_index': index,
          'content_length': chunks[index].content.length,
        },
      ),
  ];

  storeInVectorDatabase(records);
  return records;
}
|
||||
|
||||
/// Entry point: runs [extractAndVectorize] for `document.pdf` under the
/// document id `doc-1` and waits for it to finish.
Future<void> main() async {
  const documentPath = 'document.pdf';
  const documentId = 'doc-1';

  await extractAndVectorize(documentPath, documentId);
}
|
||||
```
|
||||
Reference in New Issue
Block a user