Files
fil/docs/snippets/dart/metadata/vector_database_integration.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.2 KiB

import 'package:kreuzberg/kreuzberg.dart';

class VectorRecord {
  final String id;
  final List<double> embedding;
  final String content;
  final Map<String, Object?> metadata;

  const VectorRecord({
    required this.id,
    required this.embedding,
    required this.content,
    required this.metadata,
  });
}

void storeInVectorDatabase(List<VectorRecord> records) {
  for (final record in records) {
    if (record.embedding.isEmpty) {
      continue;
    }
    print(
      'Storing ${record.id}: ${record.content.length} chars, '
      '${record.embedding.length} dims',
    );
  }
}

Future<List<VectorRecord>> extractAndVectorize(
  String documentPath,
  String documentId,
) async {
  final embedding = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
  );

  final chunking = ChunkingConfig(
    maxCharacters: 512,
    overlap: 50,
    trim: true,
    chunkerType: ChunkerType.text,
    embedding: embedding,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
  );

  final config = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,    chunking: chunking,
  );

  final result = await KreuzbergBridge.extractFile(documentPath, null, config);
  final chunks = result.chunks ?? const <Chunk>[];

  final records = <VectorRecord>[];
  for (var index = 0; index < chunks.length; index++) {
    final chunk = chunks[index];
    final embeddingValues = chunk.embedding?.toList() ?? const <double>[];

    records.add(
      VectorRecord(
        id: '${documentId}_chunk_$index',
        content: chunk.content,
        embedding: embeddingValues,
        metadata: {
          'document_id': documentId,
          'chunk_index': index,
          'content_length': chunk.content.length,
        },
      ),
    );
  }

  storeInVectorDatabase(records);
  return records;
}

Future<void> main() async {
  await extractAndVectorize('document.pdf', 'doc-1');
}