This commit is contained in:
64
docs/snippets/dart/config/advanced_config.md
Normal file
64
docs/snippets/dart/config/advanced_config.md
Normal file
@@ -0,0 +1,64 @@
|
||||
```dart title="Dart"
|
||||
import 'package:flutter_rust_bridge/flutter_rust_bridge.dart' show Int64List;
|
||||
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with OCR, chunking (including embeddings),
/// language detection, keyword extraction, token reduction and
/// post-processing all configured explicitly, then prints a short summary.
Future<void> main() async {
  // OCR via the 'tesseract' backend for 'eng', without auto-rotation.
  const ocrSettings = OcrConfig(
    enabled: true,
    backend: 'tesseract',
    language: 'eng',
    autoRotate: false,
  );

  // Embedding options that are attached to the chunking settings below.
  const embeddingSettings = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
  );

  const chunkingSettings = ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    trim: true,
    chunkerType: ChunkerType.text,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
    embedding: embeddingSettings,
  );

  const languageSettings = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  );

  // Not const: the n-gram range is created from a list at runtime.
  final keywordSettings = KeywordConfig(
    algorithm: KeywordAlgorithm.yake,
    maxKeywords: 10,
    minScore: 0.1,
    ngramRange: Int64List.fromList(<int>[1, 3]),
    language: 'en',
  );

  const reductionSettings = TokenReductionOptions(
    mode: 'moderate',
    preserveImportantWords: true,
  );

  final extractionConfig = ExtractionConfig(
    // General switches.
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    // Feature-specific settings assembled above.
    ocr: ocrSettings,
    chunking: chunkingSettings,
    languageDetection: languageSettings,
    keywords: keywordSettings,
    tokenReduction: reductionSettings,
    postprocessor: const PostProcessorConfig(enabled: true),
    // Output shape.
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    maxArchiveDepth: 3,
  );

  // The second positional argument is left as null.
  // NOTE(review): presumably the MIME type hint; confirm against the binding.
  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  print('Content: ${result.content}');
  if (result.detectedLanguages != null) {
    print('Languages: ${result.detectedLanguages}');
  }

  // Fall back to an empty list when the result carries no chunks.
  final chunks = result.chunks ?? const [];
  print('Chunks: ${chunks.length}');
}
|
||||
```
|
||||
32
docs/snippets/dart/config/chunking_config.md
Normal file
32
docs/snippets/dart/config/chunking_config.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with chunking configured, then prints the number
/// of chunks and the content length of each one.
Future<void> main() async {
  // Character-based sizing with maxCharacters 1000 and an overlap of 200.
  const chunkingSettings = ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    trim: true,
    chunkerType: ChunkerType.text,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: chunkingSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // Treat a missing chunk list as empty so the loop below is a no-op.
  final chunks = result.chunks ?? const [];
  print('Chunks: ${chunks.length}');
  for (final chunk in chunks) {
    print('Length: ${chunk.content.length}');
  }
}
|
||||
```
|
||||
20
docs/snippets/dart/config/config_basic.md
Normal file
20
docs/snippets/dart/config/config_basic.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` using a basic [ExtractionConfig] and prints the
/// extracted content.
Future<void> main() async {
  final basicConfig = ExtractionConfig(
    // Output shape: unified result, plain output format.
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    // Processing switches.
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    maxArchiveDepth: 3,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    basicConfig,
  );
  print(result.content);
}
|
||||
```
|
||||
22
docs/snippets/dart/config/config_discover.md
Normal file
22
docs/snippets/dart/config/config_discover.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with an [ExtractionConfig] built in code.
///
/// The Dart bindings do not expose config-file discovery, so the
/// configuration is constructed here and handed explicitly to
/// `KreuzbergBridge.extractFile`.
Future<void> main() async {
  final inCodeConfig = ExtractionConfig(
    // Output shape.
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    // Processing switches.
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    maxArchiveDepth: 3,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    inCodeConfig,
  );
  print(result.content);
}
|
||||
```
|
||||
27
docs/snippets/dart/config/config_ocr.md
Normal file
27
docs/snippets/dart/config/config_ocr.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `scanned.pdf` with OCR configured and prints the content length
/// and the number of tables in the result.
Future<void> main() async {
  // OCR via the 'tesseract' backend for 'eng', without auto-rotation.
  const ocrSettings = OcrConfig(
    enabled: true,
    backend: 'tesseract',
    language: 'eng',
    autoRotate: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    ocr: ocrSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'scanned.pdf',
    null,
    extractionConfig,
  );

  print('Content length: ${result.content.length}');
  print('Tables detected: ${result.tables.length}');
}
|
||||
```
|
||||
56
docs/snippets/dart/config/config_programmatic.md
Normal file
56
docs/snippets/dart/config/config_programmatic.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with OCR (including a fully spelled-out
/// [TesseractConfig]) and chunking configured in code, then prints the
/// content length.
Future<void> main() async {
  // Every Tesseract option is listed explicitly rather than left implicit.
  const tesseractSettings = TesseractConfig(
    language: 'eng+deu',
    psm: 6,
    outputFormat: 'text',
    oem: 3,
    minConfidence: 0.0,
    enableTableDetection: false,
    tableMinConfidence: 0.5,
    tableColumnThreshold: 20,
    tableRowThresholdRatio: 0.5,
    useCache: true,
    classifyUsePreAdaptedTemplates: false,
    languageModelNgramOn: false,
    tesseditDontBlkrejGoodWds: false,
    tesseditDontRowrejGoodWds: false,
    tesseditEnableDictCorrection: false,
    tesseditCharWhitelist: '',
    tesseditCharBlacklist: '',
    tesseditUsePrimaryParamsModel: false,
    textordSpaceSizeIsVariable: false,
    thresholdingMethod: false,
  );

  // The OCR language matches the one given to the Tesseract settings.
  const ocrSettings = OcrConfig(
    enabled: true,
    backend: 'tesseract',
    language: 'eng+deu',
    autoRotate: false,
    tesseractConfig: tesseractSettings,
  );

  const chunkingSettings = ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    trim: true,
    chunkerType: ChunkerType.text,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    ocr: ocrSettings,
    chunking: chunkingSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  print('Content length: ${result.content.length}');
}
|
||||
```
|
||||
23
docs/snippets/dart/config/document_structure_config.md
Normal file
23
docs/snippets/dart/config/document_structure_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with `includeDocumentStructure` switched on and
/// prints the node count of the returned document, if there is one.
Future<void> main() async {
  final structureConfig = ExtractionConfig(
    // The flag this snippet is about.
    includeDocumentStructure: true,
    // Remaining options.
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    useLayoutForMarkdown: false,
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    maxArchiveDepth: 3,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    structureConfig,
  );

  // Copy to a local so the null check promotes the type.
  final document = result.document;
  if (document != null) {
    print('Document nodes: ${document.nodes.length}');
  }
}
|
||||
```
|
||||
29
docs/snippets/dart/config/element_based_output.md
Normal file
29
docs/snippets/dart/config/element_based_output.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` in the element-based result format and prints
/// each element's type followed by at most the first 100 characters of its
/// text.
Future<void> main() async {
  final elementConfig = ExtractionConfig(
    // The option this snippet is about.
    resultFormat: ResultFormat.elementBased,
    // Remaining options.
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    maxArchiveDepth: 3,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    elementConfig,
  );

  // A missing element list is treated as empty.
  final elements = result.elements ?? const [];
  for (final element in elements) {
    print('Type: ${element.elementType}');

    // Clamp the end index so short texts are not over-indexed.
    final text = element.text;
    final previewEnd = text.length < 100 ? text.length : 100;
    final preview = text.substring(0, previewEnd);

    print('Text: $preview');
    print('---');
  }
}
|
||||
```
|
||||
35
docs/snippets/dart/config/embedding_config.md
Normal file
35
docs/snippets/dart/config/embedding_config.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with chunking and an embedding configuration
/// attached to it, then prints the number of chunks returned.
Future<void> main() async {
  // Uses the 'balanced' preset model with a batch size of 16.
  const embeddingSettings = EmbeddingConfig(
    model: EmbeddingModelType.preset(name: 'balanced'),
    normalize: true,
    batchSize: 16,
    showDownloadProgress: true,
  );

  // The embedding settings are nested inside the chunking settings.
  const chunkingSettings = ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    trim: true,
    chunkerType: ChunkerType.text,
    sizing: ChunkSizing.characters(),
    prependHeadingContext: false,
    embedding: embeddingSettings,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    chunking: chunkingSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  final chunks = result.chunks ?? const [];
  print('Chunks with embeddings: ${chunks.length}');
}
|
||||
```
|
||||
25
docs/snippets/dart/config/html_output.md
Normal file
25
docs/snippets/dart/config/html_output.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with the HTML output format and an
/// [HtmlOutputConfig], then prints the resulting content.
Future<void> main() async {
  // GitHub theme, 'kb-' class prefix, CSS embedded.
  const htmlSettings = HtmlOutputConfig(
    theme: HtmlTheme.gitHub,
    classPrefix: 'kb-',
    embedCss: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    htmlOutput: htmlSettings,
    resultFormat: ResultFormat.unified,
    // HTML rather than the plain format used by the other snippets.
    outputFormat: OutputFormat.html(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  print(result.content);
}
|
||||
```
|
||||
29
docs/snippets/dart/config/keyword_extraction_config.md
Normal file
29
docs/snippets/dart/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```dart title="Dart"
|
||||
import 'package:flutter_rust_bridge/flutter_rust_bridge.dart' show Int64List;
|
||||
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with keyword extraction configured and prints
/// the extracted keywords.
Future<void> main() async {
  // Not const: the n-gram range is created from a list at runtime.
  final keywordSettings = KeywordConfig(
    algorithm: KeywordAlgorithm.yake,
    maxKeywords: 10,
    minScore: 0.1,
    ngramRange: Int64List.fromList(<int>[1, 3]),
    language: 'en',
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    keywords: keywordSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
    maxArchiveDepth: 3,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  print('Keywords: ${result.extractedKeywords}');
}
|
||||
```
|
||||
25
docs/snippets/dart/config/language_detection_config.md
Normal file
25
docs/snippets/dart/config/language_detection_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with language detection configured and prints
/// the detected languages.
Future<void> main() async {
  // minConfidence of 0.8 with detectMultiple switched on.
  const languageSettings = LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    languageDetection: languageSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  print('Detected languages: ${result.detectedLanguages}');
}
|
||||
```
|
||||
31
docs/snippets/dart/config/ocr_dpi_config.md
Normal file
31
docs/snippets/dart/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with image extraction and DPI options configured
/// and prints how many images the result contains.
Future<void> main() async {
  // targetDpi 300, with autoAdjustDpi on and minDpi/maxDpi of 150/600.
  const imageSettings = ImageExtractionConfig(
    extractImages: true,
    targetDpi: 300,
    maxImageDimension: 4096,
    injectPlaceholders: false,
    autoAdjustDpi: true,
    minDpi: 150,
    maxDpi: 600,
    classify: false,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    images: imageSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // A missing image list is treated as empty.
  final images = result.images ?? const [];
  print('Extracted images: ${images.length}');
}
|
||||
```
|
||||
32
docs/snippets/dart/config/pdf_config.md
Normal file
32
docs/snippets/dart/config/pdf_config.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `encrypted.pdf` with PDF-specific options, including a password
/// list and hierarchy settings, and prints the title from the metadata.
Future<void> main() async {
  const hierarchySettings = HierarchyConfig(
    enabled: true,
    kClusters: 4,
    includeBbox: false,
  );

  // A single candidate password is supplied for the input file.
  const pdfSettings = PdfConfig(
    extractImages: true,
    passwords: <String>['password123'],
    extractMetadata: true,
    extractAnnotations: false,
    allowSingleColumnTables: false,
    hierarchy: hierarchySettings,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    pdfOptions: pdfSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'encrypted.pdf',
    null,
    extractionConfig,
  );
  print('Title: ${result.metadata.title}');
}
|
||||
```
|
||||
33
docs/snippets/dart/config/pdf_hierarchy_config.md
Normal file
33
docs/snippets/dart/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with PDF hierarchy settings and prints how many
/// of the returned pages have a non-null hierarchy.
Future<void> main() async {
  // kClusters of 5, bounding boxes included, ocrCoverageThreshold of 0.8.
  const hierarchySettings = HierarchyConfig(
    enabled: true,
    kClusters: 5,
    includeBbox: true,
    ocrCoverageThreshold: 0.8,
  );

  const pdfSettings = PdfConfig(
    extractImages: false,
    extractMetadata: true,
    extractAnnotations: false,
    allowSingleColumnTables: false,
    hierarchy: hierarchySettings,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    pdfOptions: pdfSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );

  // A missing page list is treated as empty.
  final pages = result.pages ?? const [];
  print('Pages with hierarchy: ${pages.where((p) => p.hierarchy != null).length}');
}
|
||||
```
|
||||
27
docs/snippets/dart/config/postprocessor_config.md
Normal file
27
docs/snippets/dart/config/postprocessor_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with two named post-processors enabled and
/// prints the resulting content.
Future<void> main() async {
  // Only the processors named here are listed as enabled.
  const postprocessorSettings = PostProcessorConfig(
    enabled: true,
    enabledProcessors: <String>[
      'whitespace_normalizer',
      'unicode_normalizer',
    ],
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    postprocessor: postprocessorSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  print('Processed content: ${result.content}');
}
|
||||
```
|
||||
21
docs/snippets/dart/config/quality_processing_config.md
Normal file
21
docs/snippets/dart/config/quality_processing_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with `enableQualityProcessing` switched on and
/// prints the quality score and the number of processing warnings.
Future<void> main() async {
  final qualityConfig = ExtractionConfig(
    // The flag this snippet is about.
    enableQualityProcessing: true,
    // Remaining options.
    useCache: true,
    forceOcr: false,
    disableOcr: false,
    maxArchiveDepth: 3,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    qualityConfig,
  );

  print('Quality score: ${result.qualityScore}');
  print('Warnings: ${result.processingWarnings.length}');
}
|
||||
```
|
||||
48
docs/snippets/dart/config/tesseract_config.md
Normal file
48
docs/snippets/dart/config/tesseract_config.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `scanned.pdf` using OCR with a fully spelled-out
/// [TesseractConfig] and prints the recognised text.
Future<void> main() async {
  // Every Tesseract option is listed explicitly rather than left implicit.
  const tesseractSettings = TesseractConfig(
    language: 'eng+deu',
    psm: 6,
    outputFormat: 'text',
    oem: 3,
    minConfidence: 0.0,
    enableTableDetection: false,
    tableMinConfidence: 0.5,
    tableColumnThreshold: 20,
    tableRowThresholdRatio: 0.5,
    useCache: true,
    classifyUsePreAdaptedTemplates: false,
    languageModelNgramOn: false,
    tesseditDontBlkrejGoodWds: false,
    tesseditDontRowrejGoodWds: false,
    tesseditEnableDictCorrection: false,
    tesseditCharWhitelist: '',
    tesseditCharBlacklist: '',
    tesseditUsePrimaryParamsModel: false,
    textordSpaceSizeIsVariable: false,
    thresholdingMethod: false,
  );

  // The OCR language matches the one given to the Tesseract settings.
  const ocrSettings = OcrConfig(
    enabled: true,
    backend: 'tesseract',
    language: 'eng+deu',
    autoRotate: false,
    tesseractConfig: tesseractSettings,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    ocr: ocrSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'scanned.pdf',
    null,
    extractionConfig,
  );
  print('OCR text: ${result.content}');
}
|
||||
```
|
||||
24
docs/snippets/dart/config/token_reduction_config.md
Normal file
24
docs/snippets/dart/config/token_reduction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```dart title="Dart"
|
||||
import 'package:kreuzberg/kreuzberg.dart';
|
||||
|
||||
/// Extracts `document.pdf` with token reduction configured and prints the
/// resulting content.
Future<void> main() async {
  // 'moderate' mode with preserveImportantWords switched on.
  const reductionSettings = TokenReductionOptions(
    mode: 'moderate',
    preserveImportantWords: true,
  );

  final extractionConfig = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    tokenReduction: reductionSettings,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile(
    'document.pdf',
    null,
    extractionConfig,
  );
  print('Reduced content: ${result.content}');
}
|
||||
```
|
||||
Reference in New Issue
Block a user