498 lines
10 KiB
Python
498 lines
10 KiB
Python
|
|
# This file is auto-generated by alef — DO NOT EDIT.
|
||
|
|
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||
|
|
# To regenerate: alef generate
|
||
|
|
# To verify freshness: alef verify --exit-code
|
||
|
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
"""Public API for _kreuzberg.
|
||
|
|
|
||
|
|
Version: 5.0.0-rc.3
|
||
|
|
"""
|
||
|
|
|
||
|
|
from ._kreuzberg import (
|
||
|
|
AnnotationKind,
|
||
|
|
ArchiveEntry,
|
||
|
|
BatchBytesItem,
|
||
|
|
BatchFileItem,
|
||
|
|
BBox,
|
||
|
|
BlockType,
|
||
|
|
CacheStats,
|
||
|
|
CellChange,
|
||
|
|
Chunk,
|
||
|
|
ChunkerType,
|
||
|
|
ChunkMetadata,
|
||
|
|
ChunkSizing,
|
||
|
|
ChunkType,
|
||
|
|
CodeContentMode,
|
||
|
|
ContentLayer,
|
||
|
|
ContributorRole,
|
||
|
|
DbfFieldInfo,
|
||
|
|
DetectionResult,
|
||
|
|
DetectResponse,
|
||
|
|
DiffHunk,
|
||
|
|
DiffLine,
|
||
|
|
DjotContent,
|
||
|
|
DjotImage,
|
||
|
|
DjotLink,
|
||
|
|
DocumentNode,
|
||
|
|
DocumentRelationship,
|
||
|
|
DocumentRevision,
|
||
|
|
Element,
|
||
|
|
ElementMetadata,
|
||
|
|
ElementType,
|
||
|
|
EmailAttachment,
|
||
|
|
EmailExtractionResult,
|
||
|
|
EmbeddedChanges,
|
||
|
|
EmbeddedDiff,
|
||
|
|
EmbeddedFile,
|
||
|
|
EmbeddingModelType,
|
||
|
|
EmbeddingPreset,
|
||
|
|
ErrorMetadata,
|
||
|
|
ExcelSheet,
|
||
|
|
ExcelWorkbook,
|
||
|
|
ExecutionProviderType,
|
||
|
|
ExtractedImage,
|
||
|
|
ExtractedUri,
|
||
|
|
ExtractionDiff,
|
||
|
|
ExtractionMethod,
|
||
|
|
Footnote,
|
||
|
|
FormatMetadata,
|
||
|
|
FormattedBlock,
|
||
|
|
GridCell,
|
||
|
|
HeaderMetadata,
|
||
|
|
HeadingContext,
|
||
|
|
HeadingLevel,
|
||
|
|
HierarchicalBlock,
|
||
|
|
HtmlTheme,
|
||
|
|
ImageKind,
|
||
|
|
ImageMetadataType,
|
||
|
|
ImagePreprocessingMetadata,
|
||
|
|
ImageType,
|
||
|
|
InlineElement,
|
||
|
|
InlineType,
|
||
|
|
Keyword,
|
||
|
|
KeywordAlgorithm,
|
||
|
|
LayoutClass,
|
||
|
|
LayoutDetection,
|
||
|
|
LinkMetadata,
|
||
|
|
LinkType,
|
||
|
|
ListType,
|
||
|
|
ModelPaths,
|
||
|
|
NodeContent,
|
||
|
|
OcrBackendType,
|
||
|
|
OcrBoundingGeometry,
|
||
|
|
OcrElementLevel,
|
||
|
|
OcrExtractionResult,
|
||
|
|
OcrPipelineConfig,
|
||
|
|
OcrPipelineStage,
|
||
|
|
OcrRotation,
|
||
|
|
OcrTable,
|
||
|
|
OcrTableBoundingBox,
|
||
|
|
OrientationResult,
|
||
|
|
OutputFormat,
|
||
|
|
PaddleLanguage,
|
||
|
|
PageBoundary,
|
||
|
|
PageContent,
|
||
|
|
PageHierarchy,
|
||
|
|
PageInfo,
|
||
|
|
PageStructure,
|
||
|
|
PageUnitType,
|
||
|
|
PdfAnnotation,
|
||
|
|
PdfAnnotationType,
|
||
|
|
PptxExtractionResult,
|
||
|
|
ProcessingStage,
|
||
|
|
ProcessingWarning,
|
||
|
|
PSMMode,
|
||
|
|
RecognizedTable,
|
||
|
|
ReductionLevel,
|
||
|
|
RelationshipKind,
|
||
|
|
ResultFormat,
|
||
|
|
RevisionAnchor,
|
||
|
|
RevisionKind,
|
||
|
|
StructuredData,
|
||
|
|
StructuredDataResult,
|
||
|
|
StructuredDataType,
|
||
|
|
StructuredExtractionConfig,
|
||
|
|
SupportedFormat,
|
||
|
|
TableDiff,
|
||
|
|
TableModel,
|
||
|
|
TextAnnotation,
|
||
|
|
TextDirection,
|
||
|
|
TextExtractionResult,
|
||
|
|
UriKind,
|
||
|
|
XmlExtractionResult,
|
||
|
|
YearRange,
|
||
|
|
)
|
||
|
|
from .api import (
|
||
|
|
batch_extract_bytes,
|
||
|
|
batch_extract_bytes_sync,
|
||
|
|
batch_extract_files,
|
||
|
|
batch_extract_files_sync,
|
||
|
|
clear_document_extractors,
|
||
|
|
clear_embedding_backends,
|
||
|
|
clear_ocr_backends,
|
||
|
|
clear_post_processors,
|
||
|
|
clear_renderers,
|
||
|
|
clear_validators,
|
||
|
|
compare,
|
||
|
|
detect_mime_type,
|
||
|
|
detect_mime_type_from_bytes,
|
||
|
|
embed_texts,
|
||
|
|
embed_texts_async,
|
||
|
|
extract_bytes,
|
||
|
|
extract_bytes_sync,
|
||
|
|
extract_file,
|
||
|
|
extract_file_sync,
|
||
|
|
get_embedding_preset,
|
||
|
|
get_extensions_for_mime,
|
||
|
|
list_document_extractors,
|
||
|
|
list_embedding_backends,
|
||
|
|
list_embedding_presets,
|
||
|
|
list_ocr_backends,
|
||
|
|
list_post_processors,
|
||
|
|
list_renderers,
|
||
|
|
list_validators,
|
||
|
|
register_document_extractor,
|
||
|
|
register_embedding_backend,
|
||
|
|
register_ocr_backend,
|
||
|
|
register_post_processor,
|
||
|
|
register_renderer,
|
||
|
|
register_validator,
|
||
|
|
render_pdf_page_to_png,
|
||
|
|
unregister_document_extractor,
|
||
|
|
unregister_embedding_backend,
|
||
|
|
unregister_ocr_backend,
|
||
|
|
unregister_post_processor,
|
||
|
|
unregister_renderer,
|
||
|
|
unregister_validator,
|
||
|
|
)
|
||
|
|
from .exceptions import (
|
||
|
|
CacheError,
|
||
|
|
CancelledError,
|
||
|
|
EmbeddingError,
|
||
|
|
ImageProcessingError,
|
||
|
|
IoError,
|
||
|
|
KreuzbergError,
|
||
|
|
KreuzbergTimeoutError,
|
||
|
|
LockPoisonedError,
|
||
|
|
MissingDependencyError,
|
||
|
|
OcrError,
|
||
|
|
OtherError,
|
||
|
|
ParsingError,
|
||
|
|
PluginError,
|
||
|
|
SecurityError,
|
||
|
|
SerializationError,
|
||
|
|
UnsupportedFormatError,
|
||
|
|
ValidationError,
|
||
|
|
)
|
||
|
|
from .options import (
|
||
|
|
AccelerationConfig,
|
||
|
|
ArchiveMetadata,
|
||
|
|
BibtexMetadata,
|
||
|
|
BoundingBox,
|
||
|
|
ChunkingConfig,
|
||
|
|
CitationMetadata,
|
||
|
|
ContentFilterConfig,
|
||
|
|
CoreProperties,
|
||
|
|
CsvMetadata,
|
||
|
|
DbfMetadata,
|
||
|
|
DiffOptions,
|
||
|
|
DocumentStructure,
|
||
|
|
DocxAppProperties,
|
||
|
|
DocxMetadata,
|
||
|
|
EmailConfig,
|
||
|
|
EmailMetadata,
|
||
|
|
EmbeddingConfig,
|
||
|
|
EpubMetadata,
|
||
|
|
ExcelMetadata,
|
||
|
|
ExtractionConfig,
|
||
|
|
ExtractionResult,
|
||
|
|
FictionBookMetadata,
|
||
|
|
FileExtractionConfig,
|
||
|
|
HierarchyConfig,
|
||
|
|
HtmlMetadata,
|
||
|
|
HtmlOutputConfig,
|
||
|
|
ImageExtractionConfig,
|
||
|
|
ImageMetadata,
|
||
|
|
ImagePreprocessingConfig,
|
||
|
|
JatsMetadata,
|
||
|
|
KeywordConfig,
|
||
|
|
LanguageDetectionConfig,
|
||
|
|
LayoutDetectionConfig,
|
||
|
|
LayoutRegion,
|
||
|
|
LlmConfig,
|
||
|
|
LlmUsage,
|
||
|
|
Metadata,
|
||
|
|
OcrConfidence,
|
||
|
|
OcrConfig,
|
||
|
|
OcrElement,
|
||
|
|
OcrElementConfig,
|
||
|
|
OcrMetadata,
|
||
|
|
OcrQualityThresholds,
|
||
|
|
PaddleOcrConfig,
|
||
|
|
PageConfig,
|
||
|
|
PdfConfig,
|
||
|
|
PdfMetadata,
|
||
|
|
PostProcessorConfig,
|
||
|
|
PptxAppProperties,
|
||
|
|
PptxMetadata,
|
||
|
|
PstMetadata,
|
||
|
|
RakeParams,
|
||
|
|
RevisionDelta,
|
||
|
|
SecurityLimits,
|
||
|
|
ServerConfig,
|
||
|
|
Table,
|
||
|
|
TableCell,
|
||
|
|
TableGrid,
|
||
|
|
TesseractConfig,
|
||
|
|
TextMetadata,
|
||
|
|
TokenReductionConfig,
|
||
|
|
TokenReductionOptions,
|
||
|
|
TreeSitterConfig,
|
||
|
|
TreeSitterProcessConfig,
|
||
|
|
XlsxAppProperties,
|
||
|
|
XmlMetadata,
|
||
|
|
YakeParams,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Public names of this package: exactly the names bound by the
# `from ._kreuzberg / .api / .exceptions / .options import (...)` statements
# earlier in this module. Kept as a plain literal list of strings.
#
# NOTE(review): the file header marks this module as generated
# ("To regenerate: alef generate"), so lasting changes to this list
# presumably belong in the generator input rather than here.
__all__ = [
    # Capitalised names (imported from ._kreuzberg, .exceptions and .options),
    # in code-point order, so e.g. "BBox" precedes "BatchBytesItem".
    "AccelerationConfig",
    "AnnotationKind",
    "ArchiveEntry",
    "ArchiveMetadata",
    "BBox",
    "BatchBytesItem",
    "BatchFileItem",
    "BibtexMetadata",
    "BlockType",
    "BoundingBox",
    "CacheError",
    "CacheStats",
    "CancelledError",
    "CellChange",
    "Chunk",
    "ChunkMetadata",
    "ChunkSizing",
    "ChunkType",
    "ChunkerType",
    "ChunkingConfig",
    "CitationMetadata",
    "CodeContentMode",
    "ContentFilterConfig",
    "ContentLayer",
    "ContributorRole",
    "CoreProperties",
    "CsvMetadata",
    "DbfFieldInfo",
    "DbfMetadata",
    "DetectResponse",
    "DetectionResult",
    "DiffHunk",
    "DiffLine",
    "DiffOptions",
    "DjotContent",
    "DjotImage",
    "DjotLink",
    "DocumentNode",
    "DocumentRelationship",
    "DocumentRevision",
    "DocumentStructure",
    "DocxAppProperties",
    "DocxMetadata",
    "Element",
    "ElementMetadata",
    "ElementType",
    "EmailAttachment",
    "EmailConfig",
    "EmailExtractionResult",
    "EmailMetadata",
    "EmbeddedChanges",
    "EmbeddedDiff",
    "EmbeddedFile",
    "EmbeddingConfig",
    "EmbeddingError",
    "EmbeddingModelType",
    "EmbeddingPreset",
    "EpubMetadata",
    "ErrorMetadata",
    "ExcelMetadata",
    "ExcelSheet",
    "ExcelWorkbook",
    "ExecutionProviderType",
    "ExtractedImage",
    "ExtractedUri",
    "ExtractionConfig",
    "ExtractionDiff",
    "ExtractionMethod",
    "ExtractionResult",
    "FictionBookMetadata",
    "FileExtractionConfig",
    "Footnote",
    "FormatMetadata",
    "FormattedBlock",
    "GridCell",
    "HeaderMetadata",
    "HeadingContext",
    "HeadingLevel",
    "HierarchicalBlock",
    "HierarchyConfig",
    "HtmlMetadata",
    "HtmlOutputConfig",
    "HtmlTheme",
    "ImageExtractionConfig",
    "ImageKind",
    "ImageMetadata",
    "ImageMetadataType",
    "ImagePreprocessingConfig",
    "ImagePreprocessingMetadata",
    "ImageProcessingError",
    "ImageType",
    "InlineElement",
    "InlineType",
    "IoError",
    "JatsMetadata",
    "Keyword",
    "KeywordAlgorithm",
    "KeywordConfig",
    "KreuzbergError",
    "KreuzbergTimeoutError",
    "LanguageDetectionConfig",
    "LayoutClass",
    "LayoutDetection",
    "LayoutDetectionConfig",
    "LayoutRegion",
    "LinkMetadata",
    "LinkType",
    "ListType",
    "LlmConfig",
    "LlmUsage",
    "LockPoisonedError",
    "Metadata",
    "MissingDependencyError",
    "ModelPaths",
    "NodeContent",
    "OcrBackendType",
    "OcrBoundingGeometry",
    "OcrConfidence",
    "OcrConfig",
    "OcrElement",
    "OcrElementConfig",
    "OcrElementLevel",
    "OcrError",
    "OcrExtractionResult",
    "OcrMetadata",
    "OcrPipelineConfig",
    "OcrPipelineStage",
    "OcrQualityThresholds",
    "OcrRotation",
    "OcrTable",
    "OcrTableBoundingBox",
    "OrientationResult",
    "OtherError",
    "OutputFormat",
    "PSMMode",
    "PaddleLanguage",
    "PaddleOcrConfig",
    "PageBoundary",
    "PageConfig",
    "PageContent",
    "PageHierarchy",
    "PageInfo",
    "PageStructure",
    "PageUnitType",
    "ParsingError",
    "PdfAnnotation",
    "PdfAnnotationType",
    "PdfConfig",
    "PdfMetadata",
    "PluginError",
    "PostProcessorConfig",
    "PptxAppProperties",
    "PptxExtractionResult",
    "PptxMetadata",
    "ProcessingStage",
    "ProcessingWarning",
    "PstMetadata",
    "RakeParams",
    "RecognizedTable",
    "ReductionLevel",
    "RelationshipKind",
    "ResultFormat",
    "RevisionAnchor",
    "RevisionDelta",
    "RevisionKind",
    "SecurityError",
    "SecurityLimits",
    "SerializationError",
    "ServerConfig",
    "StructuredData",
    "StructuredDataResult",
    "StructuredDataType",
    "StructuredExtractionConfig",
    "SupportedFormat",
    "Table",
    "TableCell",
    "TableDiff",
    "TableGrid",
    "TableModel",
    "TesseractConfig",
    "TextAnnotation",
    "TextDirection",
    "TextExtractionResult",
    "TextMetadata",
    "TokenReductionConfig",
    "TokenReductionOptions",
    "TreeSitterConfig",
    "TreeSitterProcessConfig",
    "UnsupportedFormatError",
    "UriKind",
    "ValidationError",
    "XlsxAppProperties",
    "XmlExtractionResult",
    "XmlMetadata",
    "YakeParams",
    "YearRange",
    # Lower-case names (all imported from .api), in code-point order.
    "batch_extract_bytes",
    "batch_extract_bytes_sync",
    "batch_extract_files",
    "batch_extract_files_sync",
    "clear_document_extractors",
    "clear_embedding_backends",
    "clear_ocr_backends",
    "clear_post_processors",
    "clear_renderers",
    "clear_validators",
    "compare",
    "detect_mime_type",
    "detect_mime_type_from_bytes",
    "embed_texts",
    "embed_texts_async",
    "extract_bytes",
    "extract_bytes_sync",
    "extract_file",
    "extract_file_sync",
    "get_embedding_preset",
    "get_extensions_for_mime",
    "list_document_extractors",
    "list_embedding_backends",
    "list_embedding_presets",
    "list_ocr_backends",
    "list_post_processors",
    "list_renderers",
    "list_validators",
    "register_document_extractor",
    "register_embedding_backend",
    "register_ocr_backend",
    "register_post_processor",
    "register_renderer",
    "register_validator",
    "render_pdf_page_to_png",
    "unregister_document_extractor",
    "unregister_embedding_backend",
    "unregister_ocr_backend",
    "unregister_post_processor",
    "unregister_renderer",
    "unregister_validator",
]
|
||
|
|
|
||
|
|
# Package version string; same value as the "Version:" line in the module
# docstring at the top of this file. It is not listed in __all__.
__version__ = "5.0.0-rc.3"
|