Files
fil/docs/snippets/dart/utils/vector_database_integration.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.0 KiB

import 'dart:typed_data';

import 'package:kreuzberg/kreuzberg.dart';

class VectorRecord {
  VectorRecord({
    required this.id,
    required this.content,
    required this.embedding,
    required this.metadata,
  });

  final String id;
  final String content;
  final Float64List embedding;
  final Map<String, String> metadata;
}

Future<List<VectorRecord>> extractAndVectorize(
  String documentPath,
  String documentId,
) async {
  final config = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: const ChunkingConfig(
      maxCharacters: 512,
      overlap: 50,
      trim: true,
      chunkerType: ChunkerType.text,
      sizing: ChunkSizing.characters(),
      prependHeadingContext: false,
      embedding: EmbeddingConfig(
        model: EmbeddingModelType.preset(name: 'balanced'),
        normalize: true,
        batchSize: 32,
        showDownloadProgress: false,
      ),
    ),
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(documentPath, null, config);
  final records = <VectorRecord>[];
  final chunks = result.chunks ?? const [];
  for (var index = 0; index < chunks.length; index++) {
    final chunk = chunks[index];
    final embedding = chunk.embedding;
    if (embedding == null) {
      continue;
    }
    records.add(VectorRecord(
      id: '${documentId}_chunk_$index',
      content: chunk.content,
      embedding: embedding,
      metadata: <String, String>{
        'document_id': documentId,
        'chunk_index': index.toString(),
        'content_length': chunk.content.length.toString(),
      },
    ));
  }
  return records;
}

Future<void> main() async {
  final records = await extractAndVectorize('document.pdf', 'doc-001');
  print('Vector records: ${records.length}');
}