community: Improved deeplake.py init documentation #17549

Merged Feb 21, 2024 (2 commits)
Changes from 1 commit
32 changes: 13 additions & 19 deletions libs/community/langchain_community/vectorstores/deeplake.py
@@ -60,7 +60,7 @@
         embedding: Optional[Embeddings] = None,
         embedding_function: Optional[Embeddings] = None,
         read_only: bool = False,
-        ingestion_batch_size: int = 1000,
+        ingestion_batch_size: int = 1024,
         num_workers: int = 0,
         verbose: bool = True,
         exec_option: Optional[str] = None,
@@ -85,8 +85,12 @@
             ... )

         Args:
-            dataset_path (str): Path to existing dataset or where to create
-                a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
+            dataset_path (str): The full path for storing to the Deep Lake Vector Store. It can be:
+                - a Deep Lake cloud path of the form ``hub://org_id/dataset_name``. Requires registration with Deep Lake.
+                - an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
+                - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
+                - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
+                Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.

Check failure (GitHub Actions / cd libs/community / make lint #3.11): Ruff (E501) line too long in langchain_community/vectorstores/deeplake.py at lines 88-92 (99, 121, 161, 125, 188 > 88).

             token (str, optional): Activeloop token, for fetching credentials
                 to the dataset at path if it is a Deep Lake dataset.
                 Tokens are normally autogenerated. Optional.
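For context on the four dataset_path forms the new docstring enumerates, here is a minimal sketch (not part of this diff) of how each would be passed to the constructor, assuming langchain-community and deeplake are installed; FakeEmbeddings and all paths are placeholders:

```python
# Sketch: exercising the dataset_path forms documented above.
# All paths and credentials are placeholders, not real resources.
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import DeepLake

embedding = FakeEmbeddings(size=128)  # stand-in for a real embedding model

# Local file system path: persisted on disk.
local_store = DeepLake(dataset_path="./my_dataset", embedding=embedding)

# In-memory path: nothing is persisted; intended for tests only.
mem_store = DeepLake(dataset_path="mem://my_dataset", embedding=embedding)

# Deep Lake cloud path: requires an Activeloop account and token.
# cloud_store = DeepLake(
#     dataset_path="hub://org_id/dataset_name", embedding=embedding
# )

# S3 path: credentials come from the environment or the creds argument.
# s3_store = DeepLake(
#     dataset_path="s3://bucketname/path/to/dataset",
#     embedding=embedding,
#     creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
# )
```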
@@ -98,26 +102,16 @@
             read_only (bool): Open dataset in read-only mode. Default is False.
             ingestion_batch_size (int): During data ingestion, data is divided
                 into batches. Batch size is the size of each batch.
-                Default is 1000.
+                Default is 1024.
             num_workers (int): Number of workers to use during data ingestion.
                 Default is 0.
             verbose (bool): Print dataset summary after each operation.
                 Default is True.
-            exec_option (str, optional): DeepLakeVectorStore supports 3 ways to perform
-                searching - "python", "compute_engine", "tensor_db" and auto.
-                Default is None.
-                - ``auto``- Selects the best execution method based on the storage
-                    location of the Vector Store. It is the default option.
-                - ``python`` - Pure-python implementation that runs on the client.
-                    WARNING: using this with big datasets can lead to memory
-                    issues. Data can be stored anywhere.
-                - ``compute_engine`` - C++ implementation of the Deep Lake Compute
-                    Engine that runs on the client. Can be used for any data stored in
-                    or connected to Deep Lake. Not for in-memory or local datasets.
-                - ``tensor_db`` - Hosted Managed Tensor Database that is
-                    responsible for storage and query execution. Only for data stored in
-                    the Deep Lake Managed Database. Use runtime = {"db_engine": True}
-                    during dataset creation.
+            exec_option (str, optional): Default method for search execution. It could be either ``"auto"``, ``"python"``, ``"compute_engine"`` or ``"tensor_db"``. Defaults to ``"auto"``. If None, it's set to "auto".
+                - ``auto``- Selects the best execution method based on the storage location of the Vector Store. It is the default option.
+                - ``python`` - Pure-python implementation that runs on the client and can be used for data stored anywhere. WARNING: using this option with big datasets is discouraged because it can lead to memory issues.
+                - ``compute_engine`` - Performant C++ implementation of the Deep Lake Compute Engine that runs on the client and can be used for any data stored in or connected to Deep Lake. It cannot be used with in-memory or local datasets.
+                - ``tensor_db`` - Performant and fully-hosted Managed Tensor Database that is responsible for storage and query execution. Only available for data stored in the Deep Lake Managed Database. Store datasets in this database by specifying runtime = {"tensor_db": True} during dataset creation.

Check failure (GitHub Actions / cd libs/community / make lint #3.11): Ruff (E501) line too long in langchain_community/vectorstores/deeplake.py at lines 110-114 (216, 138, 221, 242, 305 > 88).

             runtime (Dict, optional): Parameters for creating the Vector Store in
                 Deep Lake's Managed Tensor Database. Not applicable when loading an
                 existing Vector Store. To create a Vector Store in the Managed Tensor
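Since exec_option and runtime interact (per the docstring, ``tensor_db`` only works for datasets created in the Managed Tensor Database), here is a short hedged sketch, not part of this diff, of the execution-backend and ingestion parameters the PR documents; the hub:// path and dataset names are placeholders:

```python
# Sketch: choosing a search execution backend, per the docstring above.
# exec_option="python" works for data stored anywhere but is discouraged
# for large datasets; "tensor_db" requires runtime={"tensor_db": True}
# at dataset creation time.
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import DeepLake

embedding = FakeEmbeddings(size=128)  # stand-in for a real embedding model

# Pure-python search on a local dataset; ingestion tuned with the
# documented ingestion_batch_size (default 1024) and num_workers (default 0).
store = DeepLake(
    dataset_path="./my_dataset",
    embedding=embedding,
    exec_option="python",
    ingestion_batch_size=1024,
    num_workers=0,
)

# Managed Tensor Database: create with runtime={"tensor_db": True} and
# query with exec_option="tensor_db" (placeholder org/dataset path).
# managed_store = DeepLake(
#     dataset_path="hub://org_id/dataset_name",
#     embedding=embedding,
#     runtime={"tensor_db": True},
#     exec_option="tensor_db",
# )
```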