From c1ffae6c03f47768711fd93d2780c0957f38400d Mon Sep 17 00:00:00 2001 From: khustup2 Date: Thu, 19 Dec 2024 20:30:46 +0000 Subject: [PATCH] v4.1.1 Release --- python/deeplake/__init__.py | 46 ++-- python/deeplake/__init__.pyi | 405 +++++++++++++++++++++++++++++++--- python/deeplake/core.pyi | 57 +++-- python/deeplake/tql.pyi | 4 +- python/deeplake/types.pyi | 416 ++++++++++++++++++++++++----------- 5 files changed, 721 insertions(+), 207 deletions(-) diff --git a/python/deeplake/__init__.py b/python/deeplake/__init__.py index 40a82c9082..5684486de2 100644 --- a/python/deeplake/__init__.py +++ b/python/deeplake/__init__.py @@ -1,5 +1,5 @@ import os -from typing import Callable, Any, Dict +from typing import Callable, Any, Dict, Optional try: from tqdm import tqdm as progress_bar @@ -14,7 +14,7 @@ def progress_bar(iterable, *args, **kwargs): import deeplake from ._deeplake import * -__version__ = "4.1.0" +__version__ = "4.1.1" __all__ = [ "__version__", @@ -180,7 +180,12 @@ def empty(*args, **kwargs): ) -def convert(src: str, dst: str, dst_creds: Dict[str, str] = None): +def convert( + src: str, + dst: str, + dst_creds: Optional[Dict[str, str]] = None, + token: Optional[str] = None, +) -> None: """ Copies the v3 dataset at src into a new dataset in the new v4 format. 
""" @@ -192,41 +197,30 @@ def get_raw_columns(source): return [ col.name for col in source.schema.columns - if not col.dtype.is_link - and col.dtype.kind - in { + if not col.dtype.is_link and col.dtype.kind in { deeplake.types.TypeKind.Image, deeplake.types.TypeKind.SegmentMask, - deeplake.types.TypeKind.BinaryMask, + deeplake.types.TypeKind.BinaryMask } ] - def transfer_non_link_data(source, dest, batch_size): - dl = deeplake._deeplake._Prefetcher( - source, - batch_size=batch_size, - adaptive=True, - raw_columns=set(get_raw_columns(source)), - ) + def transfer_non_link_data(source, dest): + dl = deeplake._deeplake._Prefetcher(source, raw_columns=set(get_raw_columns(source))) for counter, batch in enumerate(progress_bar(dl), start=1): dest.append(batch) if counter % 100 == 0: commit_data(dest) commit_data(dest, "Final commit of non-link data") - def transfer_with_links(source, dest, links, column_names, batch_size): + def transfer_with_links(source, dest, links, column_names): iterable_cols = [col for col in column_names if col not in links] link_sample_info = {link: source[link]._links_info() for link in links} dest.set_creds_key(link_sample_info[links[0]]["key"]) pref_ds = source.query(f"SELECT {','.join(iterable_cols)}") - dl = deeplake._deeplake._Prefetcher( - pref_ds, - batch_size=batch_size, - adaptive=True, - raw_columns=set(get_raw_columns(source)), - ) + dl = deeplake._deeplake._Prefetcher(pref_ds, raw_columns=set(get_raw_columns(source))) for counter, batch in enumerate(progress_bar(dl), start=1): + batch_size = len(batch[iterable_cols[0]]) for link in links: link_data = link_sample_info[link]["data"] start_index = (counter - 1) * batch_size @@ -238,8 +232,8 @@ def transfer_with_links(source, dest, links, column_names, batch_size): commit_data(dest) commit_data(dest, "Final commit of linked data") - source_ds = deeplake.query(f'select * from "{src}"') - dest_ds = deeplake.like(source_ds, dst, dst_creds) + source_ds = deeplake.query(f'select * from 
"{src}"', token=token) + dest_ds = deeplake.like(source_ds, dst, dst_creds, token=token) commit_data(dest_ds, "Created dataset") column_names = [col.name for col in source_ds.schema.columns] @@ -248,13 +242,11 @@ def transfer_with_links(source, dest, links, column_names, batch_size): for col in source_ds.schema.columns if source_ds.schema[col.name].dtype.is_link ] - batch_size = 10000 - print(f"Transferring {len(source_ds)} rows to {dst}...") if not links: - transfer_non_link_data(source_ds, dest_ds, batch_size) + transfer_non_link_data(source_ds, dest_ds) else: - transfer_with_links(source_ds, dest_ds, links, column_names, batch_size) + transfer_with_links(source_ds, dest_ds, links, column_names) for column in column_names: meta = dict(source_ds[column].metadata) diff --git a/python/deeplake/__init__.pyi b/python/deeplake/__init__.pyi index 541eef2078..7fb934332a 100644 --- a/python/deeplake/__init__.pyi +++ b/python/deeplake/__init__.pyi @@ -259,35 +259,158 @@ class Metadata(ReadOnlyMetadata): def query(query: str, token: str | None = None) -> DatasetView: """ - Executes the given TQL query and returns a DatasetView. + Executes a TQL (Tensor Query Language) query and returns a filtered DatasetView. + + TQL provides SQL-like querying capabilities specifically designed for ML datasets, allowing you + to filter, sort, and select data based on various criteria including vector similarity. - Compared to [deeplake.Dataset.query][], this version of query can join multiple datasets together - or query a single dataset without opening it first. + Args: + query: A TQL query string. The query can: + - Filter rows using WHERE clauses + - Sort results using ORDER BY + - Select specific columns using SELECT + - Perform vector similarity search using BM25_SIMILARITY + - Join multiple datasets + token: Optional Activeloop token for authentication. Not required if using environment + credentials. + + Returns: + DatasetView: A view containing the query results. 
The view can be: + - Used directly for ML training + - Further filtered with additional queries + - Converted to PyTorch/TensorFlow dataloaders + - Materialized into a new dataset Examples: + Basic filtering: ```python - r = deeplake.query("select * from \\"al://my_org/dataset\\" where id > 30") + # Select images with high confidence labels + view = deeplake.query(f'SELECT * FROM "{ds_path}" WHERE confidence > 0.9') + + # Get samples from specific classes + cats = deeplake.query(f'SELECT * FROM "{ds_path}" WHERE label IN (\'cat\', \'kitten\')') + ``` + + Text similarity search: + ```python + # Find semantically similar text using BM25 + similar = deeplake.query(f''' + SELECT * FROM "{ds_path}" + ORDER BY BM25_SIMILARITY(text_column, 'query text') DESC + LIMIT 100 + ''') + ``` + + Vector similarity search: + ```python + # Find nearest neighbor embeddings + neighbors = deeplake.query(f''' + SELECT * FROM "{ds_path}" + ORDER BY COSINE_SIMILARITY(embedding, ARRAY[0.1, 0.2, ...]) DESC + LIMIT 10 + ''') + ``` + + Joins across datasets: + ```python + # Join images with their metadata + results = deeplake.query(f''' + SELECT i.image, m.label, m.bbox + FROM "{image_ds_path}" AS i + JOIN "{metadata_ds_path}" AS m ON i.id = m.image_id + WHERE m.verified = true + ''') ``` - """ + Using with ML frameworks: + ```python + # Filter dataset and create PyTorch dataloader + train_data = deeplake.query("SELECT * FROM dataset WHERE split = 'train'") + train_loader = train_data.pytorch().dataloader(batch_size=32) + ``` + """ ... def query_async(query: str, token: str | None = None) -> Future: """ - Asynchronously executes the given TQL query and returns a Future that will resolve into DatasetView. + Asynchronously executes a TQL (Tensor Query Language) query and returns a Future that will resolve into DatasetView. 
+ + TQL provides SQL-like querying capabilities specifically designed for ML datasets, allowing you + to filter, sort, and select data based on various criteria including vector similarity. + + Args: + query: A TQL query string. The query can: + - Filter rows using WHERE clauses + - Sort results using ORDER BY + - Select specific columns using SELECT + - Perform vector similarity search using BM25_SIMILARITY + - Join multiple datasets + token: Optional Activeloop token for authentication. Not required if using environment + credentials. + + Returns: + Future: A Future object that resolves to a DatasetView. The resulting view can be: + - Used directly for ML training + - Further filtered with additional queries + - Converted to PyTorch/TensorFlow dataloaders + - Materialized into a new dataset Examples: + Basic filtering with await: ```python - future = deeplake.query_async("select * where category == 'active'") - result = future.result() - for row in result: - print("Id is: ", row["id"]) + # Select images with high confidence labels + view = await deeplake.query_async(f'SELECT * FROM "{ds_path}" WHERE confidence > 0.9') + + # Get samples from specific classes + cats = await deeplake.query_async(f'SELECT * FROM "{ds_path}" WHERE label IN (\'cat\', \'kitten\')') + ``` - # or use the Future in an await expression - future = deeplake.query_async("select * where category == 'active'") - result = await future - for row in result: - print("Id is: ", row["id"]) + Text similarity search with Future.result(): + ```python + # Find semantically similar text using BM25 + future = deeplake.query_async(f''' + SELECT * FROM "{ds_path}" + ORDER BY BM25_SIMILARITY(text_column, 'query text') DESC + LIMIT 100 + ''') + similar = future.result() # Blocks until query completes + ``` + + Vector similarity search: + ```python + # Find nearest neighbor embeddings + neighbors = await deeplake.query_async(f''' + SELECT * FROM "{ds_path}" + ORDER BY COSINE_SIMILARITY(embedding, ARRAY[0.1, 0.2, 
...]) DESC + LIMIT 10 + ''') + ``` + + Joins across datasets: + ```python + # Join images with their metadata + results = await deeplake.query_async(f''' + SELECT i.image, m.label, m.bbox + FROM "{image_ds_path}" AS i + JOIN "{metadata_ds_path}" AS m ON i.id = m.image_id + WHERE m.verified = true + ''') + ``` + + Using with ML frameworks: + ```python + # Filter dataset and create PyTorch dataloader + future = deeplake.query_async(f'SELECT * FROM "{ds_path}" WHERE split = \'train\'') + train_data = future.result() + train_loader = train_data.pytorch().dataloader(batch_size=32) + ``` + + Non-blocking check: + ```python + # Check if query is complete without blocking + future = deeplake.query_async(f'SELECT * FROM "{ds_path}"') + if future.is_completed(): + results = future.result() ``` """ ... @@ -503,25 +626,246 @@ class ColumnDefinitionView: class ColumnView: """ - Provides access to a column in a dataset. + Provides read-only access to a column in a dataset. ColumnView is designed for efficient + data access in ML workflows, supporting both synchronous and asynchronous operations. 
+ + The ColumnView class allows you to: + - Access column data using integer indices, slices, or lists of indices + - Retrieve data asynchronously for better performance in ML pipelines + - Access column metadata and properties + - Get information about linked data if the column contains references + + Examples: + Load image data from a column for training + ```python + # Access a single image + image = dataset["images"][0] + + # Load a batch of images + batch = dataset["images"][0:32] + + # Async load for better performance + images_future = dataset["images"].get_async(slice(0, 32)) + images = images_future.result() + ``` + + Access embeddings for similarity search + ```python + # Get all embeddings + embeddings = dataset["embeddings"][:] + + # Get specific embeddings by indices + selected = dataset["embeddings"][[1, 5, 10]] + ``` + + Check column properties + ```python + # Get column name + name = dataset["images"].name + + # Access metadata + if "mean" in dataset["images"].metadata: + mean = dataset["images"].metadata["mean"] + ``` """ - def __getitem__(self, index: int | slice | list | tuple) -> typing.Any: ... - def get_async(self, index: int | slice | list | tuple) -> Future: ... - def __len__(self) -> int: ... + def __getitem__(self, index: int | slice | list | tuple) -> typing.Any: + """ + Retrieve data from the column at the specified index or range. + + Parameters: + index: Can be: + - int: Single item index + - slice: Range of indices (e.g., 0:10) + - list/tuple: Multiple specific indices + + Returns: + The data at the specified index/indices. Type depends on the column's data type. + + Examples: + ```python + # Get single item + image = column[0] + + # Get range + batch = column[0:32] + + # Get specific indices + items = column[[1, 5, 10]] + ``` + """ + ... + + def get_async(self, index: int | slice | list | tuple) -> Future: + """ + Asynchronously retrieve data from the column. Useful for large datasets or when + loading multiple items in ML pipelines. 
+ + Parameters: + index: Can be: + - int: Single item index + - slice: Range of indices + - list/tuple: Multiple specific indices + + Returns: + Future: A Future object that resolves to the requested data. + + Examples: + ```python + # Async batch load + future = column.get_async(slice(0, 32)) + batch = future.result() + + # Using with async/await + batch = await column.get_async(slice(0, 32)) + ``` + """ + ... + + def __len__(self) -> int: + """ + Get the number of items in the column. + + Returns: + int: Number of items in the column. + """ + ... + def __str__(self) -> str: ... - def _links_info(self) -> dict: ... + + def _links_info(self) -> dict: + """ + Get information about linked data if this column contains references to other datasets. + + Internal method used primarily for debugging and advanced operations. + + Returns: + dict: Information about linked data. + """ + ... + @property - def metadata(self) -> ReadOnlyMetadata: ... + def metadata(self) -> ReadOnlyMetadata: + """ + Access the column's metadata. Useful for storing statistics, preprocessing parameters, + or other information about the column data. + + Examples: + ```python + # Access preprocessing parameters + mean = column.metadata["mean"] + std = column.metadata["std"] + + # Check available metadata + for key in column.metadata.keys(): + print(f"{key}: {column.metadata[key]}") + ``` + """ + ... + @property - def name(self) -> str: ... + def name(self) -> str: + """ + Get the name of the column. + + Returns: + str: The column name. + """ + ... + class Column(ColumnView): - def __setitem__(self, index: int | slice, value: typing.Any) -> None: ... - def set_async(self, index: int | slice, value: typing.Any) -> FutureVoid: ... + """ + Provides read-write access to a column in a dataset. Column extends ColumnView with + methods for modifying data, making it suitable for dataset creation and updates in + ML workflows. 
+ + The Column class allows you to: + - Read and write data using integer indices, slices, or lists of indices + - Modify data asynchronously for better performance + - Access and modify column metadata + - Handle various data types common in ML: images, embeddings, labels, etc. + + Examples: + Update training labels + ```python + # Update single label + dataset["labels"][0] = 1 + + # Update batch of labels + dataset["labels"][0:32] = new_labels + + # Async update for better performance + future = dataset["labels"].set_async(slice(0, 32), new_labels) + future.wait() + ``` + + Store image embeddings + ```python + # Generate and store embeddings + embeddings = model.encode(images) + dataset["embeddings"][0:len(embeddings)] = embeddings + ``` + + Manage column metadata + ```python + # Store preprocessing parameters + dataset["images"].metadata["mean"] = [0.485, 0.456, 0.406] + dataset["images"].metadata["std"] = [0.229, 0.224, 0.225] + ``` + """ + + def __setitem__(self, index: int | slice, value: typing.Any) -> None: + """ + Set data in the column at the specified index or range. + + Parameters: + index: Can be: + - int: Single item index + - slice: Range of indices (e.g., 0:10) + value: The data to store. Must match the column's data type. + + Examples: + ```python + # Update single item + column[0] = new_image + + # Update range + column[0:32] = new_batch + ``` + """ + ... + + def set_async(self, index: int | slice, value: typing.Any) -> FutureVoid: + """ + Asynchronously set data in the column. Useful for large updates or when + modifying multiple items in ML pipelines. + + Parameters: + index: Can be: + - int: Single item index + - slice: Range of indices + value: The data to store. Must match the column's data type. + + Returns: + FutureVoid: A FutureVoid that completes when the update is finished. 
+ + Examples: + ```python + # Async batch update + future = column.set_async(slice(0, 32), new_batch) + future.wait() + + # Using with async/await + await column.set_async(slice(0, 32), new_batch) + ``` + """ + ... + @property def metadata(self) -> Metadata: ... + class Version: """ An atomic change within [deeplake.Dataset][]'s history @@ -855,7 +1199,7 @@ class DatasetView: ds.add_column("id", int) ds.add_column("name", str) ds.append({"id": [1,2,3], "name": ["Mary", "Joe", "Bill"]}) - + row = ds[1] print("Id:", row["id"], "Name:", row["name"]) # Output: 2 Name: Joe rows = ds[1:2] @@ -986,7 +1330,7 @@ class DatasetView: Examples: ```python from torch.utils.data import DataLoader - + ds = deeplake.open("path/to/dataset") dataloader = DataLoader(ds.pytorch(), batch_size=60, shuffle=True, num_workers=10) @@ -1015,6 +1359,7 @@ class DatasetView: """ ... + class Dataset(DatasetView): """ Datasets are the primary data structure used in DeepLake. They are used to store and manage data for searching, training, evaluation. @@ -1122,9 +1467,7 @@ class Dataset(DatasetView): """ ... 
- def __getitem__( - self, input: int | slice | list | tuple | str - ) -> Row | RowRange | Column: + def __getitem__(self, input: int | slice | list | tuple | str) -> Row | RowRange | Column: """ Returns a subset of data from the Dataset @@ -1867,7 +2210,7 @@ def create( ```python import deeplake from deeplake import types - + # Create a dataset in your local filesystem: ds = deeplake.create("directory_path") ds.add_column("id", types.Int32()) @@ -1936,7 +2279,7 @@ def create_async( ```python import deeplake from deeplake import types - + # Asynchronously create a dataset in your local filesystem: ds = await deeplake.create_async("directory_path") await ds.add_column("id", types.Int32()) diff --git a/python/deeplake/core.pyi b/python/deeplake/core.pyi index 6d8facfbb5..c082460cd5 100644 --- a/python/deeplake/core.pyi +++ b/python/deeplake/core.pyi @@ -8,24 +8,53 @@ import typing __all__ = ["Dict", "IndexMapping64", "MemoryBuffer"] + class Dict: - def __getstate__(self: dict) -> dict: ... - def __setstate__(self: dict, arg0: dict) -> None: ... - def __eq__(self: dict, other: dict | dict) -> bool: ... - def __getitem__(self: dict, key: str) -> typing.Any: ... - def __len__(self: dict) -> int: ... - def __ne__(self: dict, other: dict | dict) -> bool: ... - def __str__(self: dict) -> str: ... - def items(self: dict) -> list: ... - def keys(self: dict) -> list[str]: ... - def to_dict(self: dict) -> dict: ... + def __getstate__(self: dict) -> dict: + ... + + def __setstate__(self: dict, arg0: dict) -> None: + ... + + def __eq__(self: dict, other: dict | dict) -> bool: + ... + + def __getitem__(self: dict, key: str) -> typing.Any: + ... + + def __len__(self: dict) -> int: + ... + + def __ne__(self: dict, other: dict | dict) -> bool: + ... + + def __str__(self: dict) -> str: + ... + + def items(self: dict) -> list: + ... + + def keys(self: dict) -> list[str]: + ... + + def to_dict(self: dict) -> dict: + ... 
+ class IndexMapping64: def __getitem__(self, index: int) -> int: ... - def __getstate__(self) -> tuple: ... - def __iter__(self) -> typing.Iterator[int]: ... - def __len__(self) -> int: ... - def __setstate__(self, arg0: tuple) -> None: ... + + def __getstate__(self) -> tuple: + ... + + def __iter__(self) -> typing.Iterator[int]: + ... + + def __len__(self) -> int: + ... + + def __setstate__(self, arg0: tuple) -> None: + ... class MemoryBuffer: def __buffer__(self, flags): diff --git a/python/deeplake/tql.pyi b/python/deeplake/tql.pyi index 42af0b12c7..97f0d1e43c 100644 --- a/python/deeplake/tql.pyi +++ b/python/deeplake/tql.pyi @@ -20,9 +20,9 @@ def register_function(function: typing.Callable) -> None: ```python def next_number(a): return a + 1 - + deeplake.tql.register_function(next_number) - + r = ds.query("SELECT * WHERE next_number(column_name) > 10") ``` """ diff --git a/python/deeplake/types.pyi b/python/deeplake/types.pyi index 1b46326859..896b198fc9 100644 --- a/python/deeplake/types.pyi +++ b/python/deeplake/types.pyi @@ -38,14 +38,19 @@ __all__ = [ ] class QuantizationType: - Binary: typing.ClassVar[QuantizationType] """ - Stores a binary quantized representation of the original embedding in the index rather than the a full copy of the embedding. - - This slightly decreases accuracy of searches, while significantly improving query time. + Enumeration of available quantization types for embeddings. + + Members: + Binary: + Stores a binary quantized representation of the original embedding in the index + rather than a full copy of the embedding. This slightly decreases accuracy of + searches, while significantly improving query time. """ + Binary: typing.ClassVar[QuantizationType] __members__: typing.ClassVar[dict[str, QuantizationType]] + def __eq__(self, other: typing.Any) -> bool: ... def __getstate__(self) -> int: ... def __hash__(self) -> int: ... @@ -56,20 +61,41 @@ class QuantizationType: def __repr__(self) -> str: ... 
def __setstate__(self, state: int) -> None: ... def __str__(self) -> str: ... + @property - def name(self) -> str: ... + def name(self) -> str: + """ + Returns: + str: The name of the quantization type. + """ + ... + @property - def value(self) -> int: ... + def value(self) -> int: + """ + Returns: + int: The integer value of the quantization type. + """ + ... Binary: QuantizationType +""" +Binary quantization type for embeddings. + +This slightly decreases accuracy of searches while significantly improving query time +by storing a binary quantized representation instead of the full embedding. +""" class TextIndexType: """ - Members: + Enumeration of available text indexing types. - Inverted - - BM25 + Members: + Inverted: + A text index that supports keyword lookup. Can be used with ``CONTAINS(column, 'wanted_value')``. + BM25: + A BM25-based index of text data. Can be used with ``BM25_SIMILARITY(column, 'search text')`` + in a TQL ``ORDER BY`` clause. """ BM25: typing.ClassVar[TextIndexType] @@ -86,14 +112,28 @@ class TextIndexType: def __repr__(self) -> str: ... def __setstate__(self, state: int) -> None: ... def __str__(self) -> str: ... + @property - def name(self) -> str: ... + def name(self) -> str: + """ + Returns: + str: The name of the text index type. + """ + ... + @property - def value(self) -> int: ... + def value(self) -> int: + """ + Returns: + int: The integer value of the text index type. + """ + ... class DataType: """ The base class all specific types extend from. + + This class provides the foundation for all data types in the deeplake. """ def __eq__(self, other: DataType) -> bool: ... @@ -101,67 +141,106 @@ class DataType: def __str__(self) -> str: ... class Type: - """ """ + """ + Base class for all complex data types in the deeplake. + + This class extends DataType to provide additional functionality for complex types + like images, embeddings, and sequences. + """ def __str__(self) -> str: ... def __eq__(self, other: Type) -> bool: ... 
def __ne__(self, other: Type) -> bool: ... - @property(readonly=True) - def data_type(self) -> DataType: ... - @property(readonly=True) - # Temporary workaround. Need to remove `deeplake._deeplake` from the return type. - def default_format(self) -> deeplake._deeplake.formats.DataFormat: ... + + @property + def data_type(self) -> DataType: + """ + Returns: + DataType: The underlying data type of this type. + """ + ... + + @property + def default_format(self) -> deeplake._deeplake.formats.DataFormat: + """ + Returns: + DataFormat: The default format used for this type. + """ + ... + @property def id(self) -> str: """ - The id (name) of the data type + Returns: + str: The id (name) of the data type. """ ... @property - def is_sequence(self) -> bool: ... + def is_sequence(self) -> bool: + """ + Returns: + bool: True if this type is a sequence, False otherwise. + """ + ... + @property - def is_link(self) -> bool: ... + def is_link(self) -> bool: + """ + Returns: + bool: True if this type is a link, False otherwise. + """ + ... + @property - def is_image(self) -> bool: ... + def is_image(self) -> bool: + """ + Returns: + bool: True if this type is an image, False otherwise. + """ + ... + @property - def is_segment_mask(self) -> bool: ... + def is_segment_mask(self) -> bool: + """ + Returns: + bool: True if this type is a segment mask, False otherwise. + """ + ... + @property - def kind(self) -> TypeKind: ... + def kind(self) -> TypeKind: + """ + Returns: + TypeKind: The kind of this type. + """ + ... + @property def shape(self) -> list[int] | None: """ - The shape of the data type if applicable. Otherwise none + Returns: + list[int] | None: The shape of the data type if applicable, otherwise None. """ ... class TypeKind: """ - Members: - - Generic - - Text - - Dict - - Embedding - - Sequence - - Image - - BoundingBox - - BinaryMask + Enumeration of all available type kinds in the deeplake. 
- SegmentMask - - Polygon - - ClassLabel - - Link + Members: + Generic: Generic data type + Text: Text data type + Dict: Dictionary data type + Embedding: Embedding data type + Sequence: Sequence data type + Image: Image data type + BoundingBox: Bounding box data type + BinaryMask: Binary mask data type + SegmentMask: Segmentation mask data type + Polygon: Polygon data type + ClassLabel: Class label data type + Link: Link data type """ BinaryMask: typing.ClassVar[TypeKind] @@ -177,6 +256,7 @@ class TypeKind: Sequence: typing.ClassVar[TypeKind] Text: typing.ClassVar[TypeKind] __members__: typing.ClassVar[dict[str, TypeKind]] + def __eq__(self, other: typing.Any) -> bool: ... def __getstate__(self) -> int: ... def __hash__(self) -> int: ... @@ -187,99 +267,132 @@ class TypeKind: def __repr__(self) -> str: ... def __setstate__(self, state: int) -> None: ... def __str__(self) -> str: ... + @property - def name(self) -> str: ... + def name(self) -> str: + """ + Returns: + str: The name of the type kind. + """ + ... + @property - def value(self) -> int: ... + def value(self) -> int: + """ + Returns: + int: The integer value of the type kind. + """ + ... @typing.overload def Array(dtype: DataType | str, dimensions: int) -> DataType: ... @typing.overload def Array(dtype: DataType | str, shape: list[int]) -> DataType: ... + def Array(dtype: DataType | str, dimensions: int, shape: list[int]) -> DataType: """ - A generic array of data. + Creates a generic array of data. Parameters: - dtype: The datatype of values in the array - dimensions: The number of dimensions/axies in the array. Unlike specifying `shape`, there is no constraint on the size of each dimension. - shape: Constrain the size of each dimension in the array + dtype: DataType | str + The datatype of values in the array + dimensions: int + The number of dimensions/axes in the array. Unlike specifying ``shape``, + there is no constraint on the size of each dimension. 
+ shape: list[int] + Constrain the size of each dimension in the array + + Returns: + DataType: A new array data type with the specified parameters. Examples: - ```python - # Create a three-dimensional array, where each dimension can have any number of elements - ds.add_column("col1", types.Array("int32", dimensions=3)) + Create a three-dimensional array, where each dimension can have any number of elements:: - # Create a three-dimensional array, where each dimension has a known size - ds.add_column("col2", types.Array(types.Float32(), shape=[50, 30, 768])) - ``` + ds.add_column("col1", types.Array("int32", dimensions=3)) + + Create a three-dimensional array, where each dimension has a known size:: + + ds.add_column("col2", types.Array(types.Float32(), shape=[50, 30, 768])) """ ... def Bool() -> DataType: """ - A boolean value + Creates a boolean value type. + + Returns: + DataType: A new boolean data type. Examples: - ```python - ds.add_column("col1", types.Bool) - ds.add_column("col2", "bool") - ``` + Create columns with boolean type:: + + ds.add_column("col1", types.Bool) + ds.add_column("col2", "bool") """ ... def Text(index_type: str | TextIndexType | None = None) -> Type: """ - Text data of arbitrary length. - - Options for index_type are: - - - [deeplake.types.Inverted][] - - [deeplake.types.BM25][] + Creates a text data type of arbitrary length. Parameters: - index_type: How to index the data in the column for faster searching. Default is `None` meaning "do not index" + index_type: str | TextIndexType | None + How to index the data in the column for faster searching. + Options are: + + - :class:`deeplake.types.Inverted` + - :class:`deeplake.types.BM25` + + Default is ``None`` meaning "do not index" + + Returns: + Type: A new text data type. 
Examples: - ```python - ds.add_column("col1", types.Text) - ds.add_column("col2", "text") - ds.add_column("col3", str) - ds.add_column("col4", types.Text(index_type=types.Inverted)) - ds.add_column("col4", types.Text(index_type=types.BM25)) - ``` + Create text columns with different configurations:: + + ds.add_column("col1", types.Text) + ds.add_column("col2", "text") + ds.add_column("col3", str) + ds.add_column("col4", types.Text(index_type=types.Inverted)) + ds.add_column("col4", types.Text(index_type=types.BM25)) """ ... BM25: TextIndexType """ -A [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) based index of text data. +A BM25-based index of text data. -This index can be used with `BM25_SIMILARITY(column, 'search text')` in a TQL `ORDER BY` clause. +This index can be used with ``BM25_SIMILARITY(column, 'search text')`` in a TQL ``ORDER BY`` clause. + +See Also: + `BM25 Algorithm <https://en.wikipedia.org/wiki/Okapi_BM25>`_ """ Inverted: TextIndexType """ A text index that supports keyword lookup. -This index can be used with `CONTAINS(column, 'wanted_value')`. +This index can be used with ``CONTAINS(column, 'wanted_value')``. """ def Dict() -> Type: """ - Supports storing arbitrary key/value pairs in each row. + Creates a type that supports storing arbitrary key/value pairs in each row. + + Returns: + Type: A new dictionary data type. - See [deeplake.types.Struct][] for a type that supports defining allowed keys. + See Also: + :func:`deeplake.types.Struct` for a type that supports defining allowed keys. Examples: - ```python - ds.add_column("col1", types.Dict) + Create and use a dictionary column:: - ds.append([{"col1", {"a": 1, "b": 2}}]) - ds.append([{"col1", {"b": 3, "c": 4}}]) - ``` + ds.add_column("col1", types.Dict) + ds.append([{"col1": {"a": 1, "b": 2}}]) + ds.append([{"col1": {"b": 3, "c": 4}}]) """ - ... def Embedding( @@ -288,102 +401,135 @@ def Embedding( quantization: QuantizationType | None = None, ) -> Type: """ - A single-dimensional embedding of a given length. 
See [deeplake.types.Array][] for a multidimensional array. + Creates a single-dimensional embedding of a given length. Parameters: - size: The size of the embedding - dtype: The datatype of the embedding. Defaults to float32 - quantization: How to compress the embeddings in the index. Default uses no compression, but can be set to [deeplake.types.QuantizationType.Binary][] + size: int | None + The size of the embedding + dtype: DataType | str + The datatype of the embedding. Defaults to float32 + quantization: QuantizationType | None + How to compress the embeddings in the index. Default uses no compression, + but can be set to :class:`deeplake.types.QuantizationType.Binary` + + Returns: + Type: A new embedding data type. + + See Also: + :func:`deeplake.types.Array` for a multidimensional array. Examples: - ```python - ds.add_column("col1", types.Embedding(768)) - ds.add_column("col2", types.Embedding(768, quantization=types.QuantizationType.Binary)) - ``` + Create embedding columns:: + + ds.add_column("col1", types.Embedding(768)) + ds.add_column("col2", types.Embedding(768, quantization=types.QuantizationType.Binary)) """ ... def Float32() -> DataType: """ - A 32-bit float value + Creates a 32-bit float value type. + + Returns: + DataType: A new 32-bit float data type. Examples: - ```python - ds.add_column("col1", types.Float) - ``` + Create a column with 32-bit float type:: + + ds.add_column("col1", types.Float32) """ ... def Float64() -> DataType: """ - A 64-bit float value + Creates a 64-bit float value type. + + Returns: + DataType: A new 64-bit float data type. Examples: - ```python - ds.add_column("col1", types.Float64) - ``` + Create a column with 64-bit float type:: + + ds.add_column("col1", types.Float64) """ ... def Int16() -> DataType: """ - A 16-bit integer value + Creates a 16-bit integer value type. + + Returns: + DataType: A new 16-bit integer data type. 
Examples: - ```python - ds.add_column("col1", types.Int16) - ``` + Create a column with 16-bit integer type:: + + ds.add_column("col1", types.Int16) """ ... def Int32() -> DataType: """ - A 32-bit integer value + Creates a 32-bit integer value type. + + Returns: + DataType: A new 32-bit integer data type. Examples: - ```python - ds.add_column("col1", types.Int32) - ``` + Create a column with 32-bit integer type:: + + ds.add_column("col1", types.Int32) """ ... def Int64() -> DataType: """ - A 64-bit integer value + Creates a 64-bit integer value type. + + Returns: + DataType: A new 64-bit integer data type. Examples: - ```python - ds.add_column("col1", types.Int64) - ``` + Create a column with 64-bit integer type:: + + ds.add_column("col1", types.Int64) """ ... def Int8() -> DataType: """ - An 8-bit integer value + Creates an 8-bit integer value type. + + Returns: + DataType: A new 8-bit integer data type. Examples: - ```python - ds.add_column("col1", types.Int8) - ``` + Create a column with 8-bit integer type:: + + ds.add_column("col1", types.Int8) """ ... def Sequence(nested_type: DataType | str | Type) -> Type: """ - A sequence is a list of other data types, where there is a order to the values in the list. + Creates a sequence type that represents an ordered list of other data types. - For example, a video can be stored as a sequence of images to better capture the time-based ordering of the images rather than simply storing them as an Array + A sequence maintains the order of its values, making it suitable for time-series + data like videos (sequences of images). Parameters: - nested_type: The data type of the values in the sequence. Can be any data type, not just primitive types. + nested_type: DataType | str | Type + The data type of the values in the sequence. Can be any data type, + not just primitive types. + + Returns: + Type: A new sequence data type. 
Examples: - ```python - ds.add_column("col1", types.Sequence(types.Image(sample_compression="jpeg"))) - ``` + Create a sequence of images:: + + ds.add_column("col1", types.Sequence(types.Image(sample_compression="jpeg"))) """ - ... def Image(dtype: DataType | str = "uint8", sample_compression: str = "png") -> Type: """ @@ -427,8 +573,12 @@ def Link(type: Type) -> Type: """ ... -def Polygon() -> Type: ... -def ClassLabel(dtype: DataType | str) -> Type: ... +def Polygon() -> Type: + ... + +def ClassLabel(dtype: DataType | str) -> Type: + ... + def BoundingBox( dtype: DataType | str = "float32", format: str | None = None, @@ -507,7 +657,7 @@ def Struct(fields: dict[str, DataType | str]) -> DataType: "field1": types.Int16(), "field2": types.Text(), })) - + ds.append([{"col1": {"field1": 3, "field2": "a"}}]) print(ds[0]["col1"]["field1"]) # Output: 3 ```