Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
```dart title="Dart"
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` with language detection enabled (single-language
/// mode) and prints the first detected language, or a fallback message when
/// `detectedLanguages` is null or empty.
Future<void> main() async {
  final detectionConfig = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.5,
    detectMultiple: false,
  );
  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    languageDetection: detectionConfig,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  final detected = result.detectedLanguages;

  // Deal with the "nothing detected" case first so the main path stays flat.
  if (detected == null || detected.isEmpty) {
    print('No language detected');
    return;
  }
  print('Primary language: ${detected.first}');
}
```

View File

@@ -0,0 +1,36 @@
```dart title="Dart"
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `multilingual.pdf` with multi-language detection enabled and
/// prints every entry of `detectedLanguages`, or a fallback message when that
/// list is null or empty.
Future<void> main() async {
  final detectionConfig = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.3,
    detectMultiple: true,
  );
  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    languageDetection: detectionConfig,
  );

  final result = await KreuzbergBridge.extractFile(
    'multilingual.pdf',
    null,
    extractionConfig,
  );
  final detected = result.detectedLanguages;

  if (detected != null && detected.isNotEmpty) {
    // Header line with the count, then one line per detected language.
    print('Detected ${detected.length} language(s):');
    for (final language in detected) {
      print(' - $language');
    }
  } else {
    print('No languages detected');
  }
}
```

View File

@@ -0,0 +1,38 @@
```dart title="Dart"
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` with default settings and prints each metadata
/// field that is present, followed by every entry of `metadata.additional`.
Future<void> main() async {
  final result = await KreuzbergBridge.extractFile('document.pdf', null);
  final metadata = result.metadata;

  // Each field is copied into a local before its null check. Dart does not
  // promote public fields of another object, so the local is what lets the
  // value be used without a `!` assertion.
  final title = metadata.title;
  if (title != null) {
    print('Title: $title');
  }

  final subject = metadata.subject;
  if (subject != null) {
    print('Subject: $subject');
  }

  final authors = metadata.authors;
  if (authors != null) {
    print('Authors: ${authors.join(', ')}');
  }

  final keywords = metadata.keywords;
  if (keywords != null) {
    print('Keywords: ${keywords.join(', ')}');
  }

  final language = metadata.language;
  if (language != null) {
    print('Language: $language');
  }

  final createdAt = metadata.createdAt;
  if (createdAt != null) {
    print('Created: $createdAt');
  }

  final modifiedAt = metadata.modifiedAt;
  if (modifiedAt != null) {
    print('Modified: $modifiedAt');
  }

  final extractionDurationMs = metadata.extractionDurationMs;
  if (extractionDurationMs != null) {
    print('Extraction took: $extractionDurationMs ms');
  }

  // `additional` is iterated unconditionally; it is not null-checked here.
  for (final entry in metadata.additional.entries) {
    print('Additional[${entry.key}]: ${entry.value}');
  }
}
```

View File

@@ -0,0 +1,31 @@
```dart title="Dart"
import 'dart:convert';

import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` with default settings and prints the byte range
/// and a short text preview for up to the first three page boundaries.
///
/// Prints a notice and returns early when the result has no page structure
/// or no page boundaries.
Future<void> main() async {
  final result = await KreuzbergBridge.extractFile('document.pdf', null);
  final pages = result.metadata.pages;
  if (pages == null) {
    print('No page structure available');
    return;
  }
  final boundaries = pages.boundaries;
  if (boundaries == null || boundaries.isEmpty) {
    print('No page boundaries available');
    return;
  }

  // `byteStart`/`byteEnd` are byte offsets, while `String.substring` indexes
  // UTF-16 code units. The two only agree for pure-ASCII text, so the slice
  // is taken on the encoded bytes and decoded back to a string.
  // NOTE(review): assumes the offsets refer to the UTF-8 encoding of
  // `result.content` — confirm against the PageBoundary documentation.
  final contentBytes = utf8.encode(result.content);

  for (final boundary in boundaries.take(3)) {
    final start = boundary.byteStart.toInt();
    final end = boundary.byteEnd.toInt();
    // `allowMalformed` keeps the preview printable even if an offset happens
    // to fall inside a multi-byte character.
    final pageText = utf8.decode(
      contentBytes.sublist(start, end),
      allowMalformed: true,
    );
    final previewEnd = pageText.length < 100 ? pageText.length : 100;
    print('Page ${boundary.pageNumber}:');
    print(' Byte range: $start-$end');
    print(' Preview: ${pageText.substring(0, previewEnd)}...');
  }
}
```

View File

@@ -0,0 +1,38 @@
```dart title="Dart"
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` with per-page extraction enabled and prints, for
/// each page, its number, content length, table count, and image count.
///
/// Prints a notice instead when the result carries no per-page content.
Future<void> main() async {
  final pageSettings = PageConfig(
    extractPages: true,
    insertPageMarkers: false,
    markerFormat: '<!-- page {page} -->',
  );
  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    pages: pageSettings,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  final extractedPages = result.pages;

  if (extractedPages != null) {
    for (final page in extractedPages) {
      print('Page ${page.pageNumber}:');
      print(' Content: ${page.content.length} chars');
      print(' Tables: ${page.tables.length}');
      print(' Images: ${page.images.length}');
    }
  } else {
    print('No per-page content available');
  }
}
```

View File

@@ -0,0 +1,20 @@
```dart title="Dart"
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` with default settings and prints every table in
/// the result: a summary line, its Markdown rendering, each entry of `cells`,
/// and the bounding box when one is present.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);

  for (final table in extraction.tables) {
    print('Table on page ${table.pageNumber} with ${table.cells.length} rows');
    print(table.markdown);
    for (final row in table.cells) {
      print(row);
    }

    // The bounding box is optional; move on to the next table without it.
    if (table.boundingBox == null) {
      continue;
    }
    print('Bounding box: ${table.boundingBox}');
  }
}
```

View File

@@ -0,0 +1,92 @@
```dart title="Dart"
import 'package:kreuzberg/kreuzberg.dart';
/// An immutable record bundling an identifier, an embedding vector, the text
/// content, and a free-form metadata map.
class VectorRecord {
  /// Creates a record; all four fields are required.
  const VectorRecord({
    required this.id,
    required this.embedding,
    required this.content,
    required this.metadata,
  });

  /// Identifier of this record.
  final String id;

  /// The embedding vector stored with this record.
  final List<double> embedding;

  /// The text content stored with this record.
  final String content;

  /// Additional key/value data attached to this record.
  final Map<String, Object?> metadata;
}
/// Prints a one-line summary for every record that has a non-empty embedding.
///
/// Records whose embedding is empty are skipped. Nothing is persisted: the
/// function only prints.
void storeInVectorDatabase(List<VectorRecord> records) {
  for (final record in records) {
    if (record.embedding.isNotEmpty) {
      print(
        'Storing ${record.id}: ${record.content.length} chars, '
        '${record.embedding.length} dims',
      );
    }
  }
}
/// Extracts [documentPath] with chunking and embedding configured, and turns
/// every returned chunk into a [VectorRecord].
///
/// Record ids have the form `<documentId>_chunk_<index>`. A chunk whose
/// embedding is null produces a record with an empty embedding list, and a
/// null chunk list is treated as empty. The records are handed to
/// [storeInVectorDatabase] and then returned.
Future<List<VectorRecord>> extractAndVectorize(
  String documentPath,
  String documentId,
) async {
  final embeddingConfig = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
  );
  final chunkingConfig = ChunkingConfig(
    maxCharacters: 512,
    overlap: 50,
    trim: true,
    chunkerType: ChunkerType.text,
    embedding: embeddingConfig,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
  );
  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
    chunking: chunkingConfig,
  );

  final result = await KreuzbergBridge.extractFile(
    documentPath,
    null,
    extractionConfig,
  );
  final chunks = result.chunks ?? const <Chunk>[];

  // One record per chunk, in chunk order; the position doubles as the id
  // suffix and as the `chunk_index` metadata value.
  final records = <VectorRecord>[
    for (var index = 0; index < chunks.length; index++)
      VectorRecord(
        id: '${documentId}_chunk_$index',
        content: chunks[index].content,
        embedding: chunks[index].embedding?.toList() ?? const <double>[],
        metadata: {
          'document_id': documentId,
          'chunk_index': index,
          'content_length': chunks[index].content.length,
        },
      ),
  ];

  storeInVectorDatabase(records);
  return records;
}
/// Entry point: runs [extractAndVectorize] on `document.pdf` with the
/// document id `doc-1` and discards the returned records.
Future<void> main() async {
  const documentPath = 'document.pdf';
  const documentId = 'doc-1';
  await extractAndVectorize(documentPath, documentId);
}
```