
community: Fix AstraDBCache docstrings #17802

Merged: 2 commits, Feb 20, 2024
Changes from 1 commit
105 changes: 41 additions & 64 deletions libs/community/langchain_community/cache.py
@@ -1374,6 +1374,22 @@ class AstraDBCache(BaseCache):
- prompt, a string
- llm_string, a deterministic str representation of the model parameters.
(needed to prevent same-prompt-different-model collisions)

Collaborator:

Hi @cbornet, we follow the Google style guide for documentation: https://google.github.io/styleguide/pyguide.html#384-classes

Class-level documentation is for documenting public attributes. I suspect that none of these are meant to be used as public attributes, so documenting them only on __init__ makes the most sense to me.

What's the reason for making this change?
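
(For illustration, not part of the thread: a minimal sketch, with hypothetical names, of the Google-style split described above, where the class docstring documents public attributes and constructor arguments are documented on __init__.)

    from typing import Optional

    class ExampleStore:
        """Simple in-memory key-value store.

        Attributes:
            items: the underlying dict, exposed to callers as a public attribute.
        """

        def __init__(self, initial: Optional[dict] = None) -> None:
            """Create the store.

            Args:
                initial: optional mapping used to seed the store.
            """
            # Copy the seed mapping so callers cannot mutate it from outside.
            self.items = dict(initial or {})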

Collaborator Author @cbornet, Feb 20, 2024:

Well, Sphinx's default setup doesn't document __init__ (and vice versa, depending on autoclass_content), so it's common practice to put the args in the class docstring.
If we look at https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google (see the ExampleError and ExampleClass classes), both are allowed:

        The __init__ method may be documented in either the class level
        docstring, or as a docstring on the __init__ method itself.

        Either form is acceptable, but the two should not be mixed. Choose one
        convention to document the __init__ method and be consistent with it.

Also, PyCharm will use the class doc if it doesn't find an __init__ doc. It's then nice to put a lot of info into the class docstring and get that info when hovering over a class instantiation.

That said, I see that the LangChain setup does document the __init__ method separately and copies its content into the class doc (autoclass_content = "both").
So probably the best approach is to put the most useful info into __init__ and a very basic headline (or nothing) in the class. This way, both PyCharm and the LangChain docs will be happy.
I'll update the PRs.
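
(For illustration, not from the PR: a minimal sketch, with a hypothetical class name, of the compromise described above: a short headline in the class docstring and the detailed Args on __init__, which Sphinx's autoclass_content = "both" then merges into a single rendered class doc.)

    from typing import Optional

    class ExampleCache:
        """Cache backed by an Astra DB collection."""  # short headline only

        def __init__(self, collection_name: str, token: Optional[str] = None) -> None:
            """
            Args:
                collection_name: name of the Astra DB collection to create/use.
                token: API token for Astra DB usage.
            """
            # A real implementation would create/connect to the collection here.
            self.collection_name = collection_name
            self.token = token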

Collaborator Author @cbornet:

done. PTAL.

Collaborator @eyurtsev, Feb 20, 2024:

For those following from home: https://en.wiktionary.org/wiki/PTAL

Args:
collection_name: name of the Astra DB collection to create/use.
token: API token for Astra DB usage.
api_endpoint: full URL to the API endpoint,
such as `https://<DB-ID>-us-east1.apps.astra.datastax.com`.
astra_db_client: *alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AstraDB' instance.
async_astra_db_client: *alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AsyncAstraDB' instance.
namespace: namespace (aka keyspace) where the
collection is created. Defaults to the database's "default namespace".
setup_mode: mode used to create the Astra DB collection (SYNC, ASYNC or OFF).
pre_delete_collection: whether to delete the collection
before creating it. If False and the collection already exists,
the collection will be used as is.
"""

@staticmethod
@@ -1392,27 +1408,6 @@ def __init__(
pre_delete_collection: bool = False,
setup_mode: SetupMode = SetupMode.SYNC,
):
"""
Create an AstraDB cache using a collection for storage.

Args (only keyword-arguments accepted):
collection_name (str): name of the Astra DB collection to create/use.
token (Optional[str]): API token for Astra DB usage.
api_endpoint (Optional[str]): full URL to the API endpoint,
such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
astra_db_client (Optional[AstraDB]):
*alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AstraDB' instance.
async_astra_db_client (Optional[AsyncAstraDB]):
*alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AsyncAstraDB' instance.
namespace (Optional[str]): namespace (aka keyspace) where the
collection is created. Defaults to the database's "default namespace".
pre_delete_collection (bool): whether to delete and re-create the
collection. Defaults to False.
async_setup (bool): whether to create the collection asynchronously.
Enable only if there is a running asyncio event loop. Defaults to False.
"""
self.astra_env = _AstraDBCollectionEnvironment(
collection_name=collection_name,
token=token,
@@ -1427,7 +1422,6 @@ def __init__(
self.async_collection = self.astra_env.async_collection

def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
"""Look up based on prompt and llm_string."""
self.astra_env.ensure_db_setup()
doc_id = self._make_id(prompt, llm_string)
item = self.collection.find_one(
@@ -1441,7 +1435,6 @@ def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
return _loads_generations(item["body_blob"]) if item is not None else None

async def alookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
"""Look up based on prompt and llm_string."""
await self.astra_env.aensure_db_setup()
doc_id = self._make_id(prompt, llm_string)
item = (
@@ -1457,7 +1450,6 @@ async def alookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYP
return _loads_generations(item["body_blob"]) if item is not None else None

def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:
"""Update cache based on prompt and llm_string."""
self.astra_env.ensure_db_setup()
doc_id = self._make_id(prompt, llm_string)
blob = _dumps_generations(return_val)
@@ -1471,7 +1463,6 @@ def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> N
async def aupdate(
self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE
) -> None:
"""Update cache based on prompt and llm_string."""
await self.astra_env.aensure_db_setup()
doc_id = self._make_id(prompt, llm_string)
blob = _dumps_generations(return_val)
@@ -1523,12 +1514,10 @@ async def adelete(self, prompt: str, llm_string: str) -> None:
await self.async_collection.delete_one(doc_id)

def clear(self, **kwargs: Any) -> None:
"""Clear cache. This is for all LLMs at once."""
self.astra_env.ensure_db_setup()
self.collection.clear()

async def aclear(self, **kwargs: Any) -> None:
"""Clear cache. This is for all LLMs at once."""
await self.astra_env.aensure_db_setup()
await self.async_collection.clear()

@@ -1583,8 +1572,31 @@ class AstraDBSemanticCache(BaseCache):
cached values from several LLMs, so the LLM's 'llm_string' is stored
in the document metadata.

You can choose the preferred similarity (or use the API default) --
remember the threshold might require metric-dependent tuning.
You can choose the preferred similarity (or use the API default).
The default score threshold is tuned to the default metric.
Tune it carefully yourself if switching to another distance metric.

Args:
collection_name: name of the Astra DB collection to create/use.
token: API token for Astra DB usage.
api_endpoint: full URL to the API endpoint,
such as `https://<DB-ID>-us-east1.apps.astra.datastax.com`.
astra_db_client: *alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AstraDB' instance.
async_astra_db_client: *alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AsyncAstraDB' instance.
namespace: namespace (aka keyspace) where the
collection is created. Defaults to the database's "default namespace".
setup_mode: mode used to create the Astra DB collection (SYNC, ASYNC or OFF).
pre_delete_collection: whether to delete the collection
before creating it. If False and the collection already exists,
the collection will be used as is.
embedding: Embedding provider for semantic
encoding and search.
metric: the function to use for evaluating similarity of text embeddings.
Defaults to 'cosine' (alternatives: 'euclidean', 'dot_product')
similarity_threshold: the minimum similarity
for accepting a (semantic-search) match.
"""

def __init__(
@@ -1602,35 +1614,6 @@ def __init__(
metric: Optional[str] = None,
similarity_threshold: float = ASTRA_DB_SEMANTIC_CACHE_DEFAULT_THRESHOLD,
):
"""
Initialize the cache with all relevant parameters.
Args:

collection_name (str): name of the Astra DB collection to create/use.
token (Optional[str]): API token for Astra DB usage.
api_endpoint (Optional[str]): full URL to the API endpoint,
such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
astra_db_client (Optional[AstraDB]): *alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AstraDB' instance.
async_astra_db_client (Optional[AsyncAstraDB]):
*alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AsyncAstraDB' instance.
namespace (Optional[str]): namespace (aka keyspace) where the
collection is created. Defaults to the database's "default namespace".
setup_mode (SetupMode): mode used to create the collection in the DB
(SYNC, ASYNC or OFF). Defaults to SYNC.
pre_delete_collection (bool): whether to delete and re-create the
collection. Defaults to False.
embedding (Embedding): Embedding provider for semantic
encoding and search.
metric: the function to use for evaluating similarity of text embeddings.
Defaults to 'cosine' (alternatives: 'euclidean', 'dot_product')
similarity_threshold (float, optional): the minimum similarity
for accepting a (semantic-search) match.

The default score threshold is tuned to the default metric.
Tune it carefully yourself if switching to another distance metric.
"""
self.embedding = embedding
self.metric = metric
self.similarity_threshold = similarity_threshold
@@ -1685,7 +1668,6 @@ def _make_id(prompt: str, llm_string: str) -> str:
return f"{_hash(prompt)}#{_hash(llm_string)}"

def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:
"""Update cache based on prompt and llm_string."""
self.astra_env.ensure_db_setup()
doc_id = self._make_id(prompt, llm_string)
llm_string_hash = _hash(llm_string)
@@ -1704,7 +1686,6 @@ def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> N
async def aupdate(
self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE
) -> None:
"""Update cache based on prompt and llm_string."""
await self.astra_env.aensure_db_setup()
doc_id = self._make_id(prompt, llm_string)
llm_string_hash = _hash(llm_string)
@@ -1721,15 +1702,13 @@ async def aupdate(
)

def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
"""Look up based on prompt and llm_string."""
hit_with_id = self.lookup_with_id(prompt, llm_string)
if hit_with_id is not None:
return hit_with_id[1]
else:
return None

async def alookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
"""Look up based on prompt and llm_string."""
hit_with_id = await self.alookup_with_id(prompt, llm_string)
if hit_with_id is not None:
return hit_with_id[1]
@@ -1835,11 +1814,9 @@ async def adelete_by_document_id(self, document_id: str) -> None:
await self.async_collection.delete_one(document_id)

def clear(self, **kwargs: Any) -> None:
"""Clear the *whole* semantic cache."""
self.astra_env.ensure_db_setup()
self.collection.clear()

async def aclear(self, **kwargs: Any) -> None:
"""Clear the *whole* semantic cache."""
await self.astra_env.aensure_db_setup()
await self.async_collection.clear()
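
(Usage sketch for context, based only on the constructor arguments documented in this diff; the collection name is hypothetical and the token and endpoint values are placeholders.)

    from langchain.globals import set_llm_cache
    from langchain_community.cache import AstraDBCache

    # Register the Astra DB-backed cache as the global LLM cache.
    set_llm_cache(
        AstraDBCache(
            collection_name="langchain_llm_cache",  # hypothetical collection name
            token="AstraCS:...",                    # placeholder Astra DB token
            api_endpoint="https://<DB-ID>-us-east1.apps.astra.datastax.com",
        )
    )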