Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

List vector ids by prefix #307

Merged
merged 4 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
47 changes: 47 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,53 @@ update_response = index.update(
)
```

## List vectors

The `list` and `list_paginated` methods can be used to list vector ids matching a particular id prefix.
With clever assignment of vector ids, this can be used to help model hierarchical relationships between
different vectors such as when there are embeddings for multiple chunks or fragments related to the
same document.

The `list` method returns a generator that handles pagination on your behalf.

```python
from pinecone import Pinecone

pc = Pinecone(api_key='xxx')
index = pc.Index(host='hosturl')

# To iterate over all result pages using a generator function
namespace = 'foo-namespace'
for ids in index.list(prefix='pref', limit=3, namespace=namespace):
    print(ids) # ['pref1', 'pref2', 'pref3']

    # Now you can pass this id array to other methods, such as fetch or delete.
    vectors = index.fetch(ids=ids, namespace=namespace)
```

There is also an option to fetch each page of results yourself with `list_paginated`.

```python
from pinecone import Pinecone

pc = Pinecone(api_key='xxx')
index = pc.Index(host='hosturl')

# For manual control over pagination
results = index.list_paginated(
prefix='pref',
limit=3,
namespace='foo',
pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
)
print(results.namespace) # 'foo'
print([v.id for v in results.vectors]) # ['pref1', 'pref2', 'pref3']
print(results.pagination.next) # 'eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
print(results.usage) # { 'read_units': 1 }
```

# Collections

## Create collection

The following example creates the collection `example-collection` from
Expand Down
81 changes: 79 additions & 2 deletions pinecone/data/index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from tqdm.autonotebook import tqdm

from collections.abc import Iterable
from typing import Union, List, Tuple, Optional, Dict, Any

from pinecone.config import ConfigBuilder
Expand All @@ -22,9 +21,10 @@
DeleteRequest,
UpdateRequest,
DescribeIndexStatsRequest,
ListResponse
)
from pinecone.core.client.api.data_plane_api import DataPlaneApi
from ..utils import get_user_agent, fix_tuple_length
from ..utils import get_user_agent
from .vector_factory import VectorFactory

__all__ = [
Expand Down Expand Up @@ -502,6 +502,83 @@ def describe_index_stats(
),
**{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS},
)

@validate_and_convert_errors
def list_paginated(
    self,
    prefix: Optional[str] = None,
    limit: Optional[int] = None,
    pagination_token: Optional[str] = None,
    namespace: Optional[str] = None,
    **kwargs
) -> ListResponse:
    """
    Retrieve one page of vector ids that share an id prefix within a single namespace.

    The response contains the matching ids together with pagination information; pass the
    returned token back in as `pagination_token` to request the following page. The ids can
    then be handed to fetch or delete operations, depending on your use case.

    If you would rather not manage pagination tokens yourself, use the `list` method instead.

    Examples:
        >>> results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace')
        >>> [v.id for v in results.vectors]
        ['99', '990', '991', '992', '993']
        >>> results.pagination.next
        eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9
        >>> next_results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace', pagination_token=results.pagination.next)

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
        pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned
                                          in the response if additional results are available. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]

    Returns: ListResponse object which contains the list of ids, the namespace name, pagination information, and usage showing the number of read_units consumed.
    """
    candidate_args = [
        ("prefix", prefix),
        ("limit", limit),
        ("namespace", namespace),
        ("pagination_token", pagination_token),
    ]
    # The helper decides which of the optional arguments are actually sent.
    request_args = self._parse_non_empty_args(candidate_args)
    return self._vector_api.list(**request_args, **kwargs)

@validate_and_convert_errors
def list(self, **kwargs):
    """
    Iterate over every page of vector ids matching an id prefix, handling pagination tokens
    on your behalf.

    Accepts all of the same arguments as `list_paginated` and returns a generator. Each item
    yielded is the list of matching vector ids from one page of results; pages that contain
    no ids are skipped.

    Examples:
        >>> for ids in index.list(prefix='99', limit=5, namespace='my_namespace'):
        ...     print(ids)
        ['99', '990', '991', '992', '993']
        ['994', '995', '996', '997', '998']
        ['999']

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids to return in each page. If unspecified, the server will use a default value. [optional]
        pagination_token (Optional[str]): A token identifying the page to start from. Subsequent tokens are
                                          taken from each response automatically. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]

    Yields:
        A list of vector id strings for each non-empty page of results.
    """
    done = False
    while not done:
        results = self.list_paginated(**kwargs)
        if len(results.vectors) > 0:
            yield [v.id for v in results.vectors]

        # Continue only when a usable token came back. An empty or missing token would be
        # treated as "no token" on the next request and restart the listing from the first
        # page, so it must end the iteration instead.
        if results.pagination and results.pagination.next:
            kwargs["pagination_token"] = results.pagination.next
        else:
            done = True

@staticmethod
def _parse_non_empty_args(args: List[Tuple[str, Any]]) -> Dict[str, Any]:
Expand Down
99 changes: 98 additions & 1 deletion pinecone/grpc/index_grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
QueryResponse,
DescribeIndexStatsResponse,
)
from pinecone.models.list_response import (
ListResponse as SimpleListResponse,
Pagination
)
from pinecone.core.grpc.protos.vector_service_pb2 import (
Vector as GRPCVector,
QueryVector as GRPCQueryVector,
Expand All @@ -22,6 +26,8 @@
QueryRequest,
FetchRequest,
UpdateRequest,
ListRequest,
ListResponse,
DescribeIndexStatsRequest,
DeleteResponse,
UpdateResponse,
Expand All @@ -41,7 +47,6 @@ class SparseVectorTypedDict(TypedDict):
indices: List[int]
values: List[float]


class GRPCIndex(GRPCIndexBase):
"""A client for interacting with a Pinecone index via GRPC API."""

Expand Down Expand Up @@ -429,6 +434,98 @@ def update(
else:
return self._wrap_grpc_call(self.stub.Update, request, timeout=timeout)

def list_paginated(
    self,
    prefix: Optional[str] = None,
    limit: Optional[int] = None,
    pagination_token: Optional[str] = None,
    namespace: Optional[str] = None,
    **kwargs
) -> SimpleListResponse:
    """
    The list_paginated operation finds vectors based on an id prefix within a single namespace.
    It returns matching ids in a paginated form, with a pagination token to fetch the next page of results.
    This id list can then be passed to fetch or delete operations, depending on your use case.

    Consider using the `list` method to avoid having to handle pagination tokens manually.

    Examples:
        >>> results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace')
        >>> [v.id for v in results.vectors]
        ['99', '990', '991', '992', '993']
        >>> results.pagination.next
        eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9
        >>> next_results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace', pagination_token=results.pagination.next)

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
        pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned
                                          in the response if additional results are available. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
        timeout: Optional keyword argument forwarded to the gRPC call rather than to the request message. [optional]

    Returns: SimpleListResponse object which contains the namespace name, the matching vectors, and pagination
             information. `pagination` is None when the response carries no next-page token.
    """
    args_dict = self._parse_non_empty_args(
        [
            ("prefix", prefix),
            ("limit", limit),
            ("namespace", namespace),
            ("pagination_token", pagination_token),
        ]
    )
    # `timeout` configures the call itself, so it has to be taken out of kwargs *before*
    # the remaining kwargs are used to populate the ListRequest message.
    timeout = kwargs.pop("timeout", None)
    request = ListRequest(**args_dict, **kwargs)
    response = self._wrap_grpc_call(self.stub.List, request, timeout=timeout)

    # Normalise "no further pages" to None so callers can simply test `pagination`.
    if response.pagination and response.pagination.next != '':
        pagination = Pagination(next=response.pagination.next)
    else:
        pagination = None

    return SimpleListResponse(
        namespace=response.namespace,
        vectors=response.vectors,
        pagination=pagination,
    )

def list(self, **kwargs):
    """
    Iterate over every page of vector ids matching an id prefix, handling pagination tokens
    on your behalf.

    Accepts all of the same arguments as `list_paginated` and returns a generator. Each item
    yielded is the list of matching vector ids from one page of results; pages that contain
    no ids are skipped. Any exception raised by `list_paginated` propagates to the caller.

    Examples:
        >>> for ids in index.list(prefix='99', limit=5, namespace='my_namespace'):
        ...     print(ids)
        ['99', '990', '991', '992', '993']
        ['994', '995', '996', '997', '998']
        ['999']

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids to return in each page. If unspecified, the server will use a default value. [optional]
        pagination_token (Optional[str]): A token identifying the page to start from. Subsequent tokens are
                                          taken from each response automatically. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]

    Yields:
        A list of vector id strings for each non-empty page of results.
    """
    done = False
    while not done:
        results = self.list_paginated(**kwargs)

        if len(results.vectors) > 0:
            yield [v.id for v in results.vectors]

        # Keep going only while the previous page supplied a non-empty next-page token.
        if results.pagination and results.pagination.next:
            kwargs["pagination_token"] = results.pagination.next
        else:
            done = True

def describe_index_stats(
self, filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, **kwargs
) -> DescribeIndexStatsResponse:
Expand Down
9 changes: 9 additions & 0 deletions pinecone/models/list_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import NamedTuple, Optional, List

class Pagination(NamedTuple):
    """Pagination details attached to a page of list results."""

    # Token used to request the page that follows this one.
    next: str


class ListResponse(NamedTuple):
    """Plain container for one page of list results."""

    # Name of the namespace the results belong to.
    namespace: str
    # Entries returned for this page.
    vectors: List
    # Next-page information, or None when there is none.
    pagination: Optional[Pagination]
30 changes: 20 additions & 10 deletions tests/integration/data/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
import json
from ..helpers import get_environment_var, random_string
from .seed import setup_data
from .seed import setup_data, setup_list_data

# Test matrix needs to consider the following dimensions:
# - pod vs serverless
Expand Down Expand Up @@ -41,14 +41,18 @@ def spec():

@pytest.fixture(scope='session')
def index_name():
    """Session-scoped index name: 'dataplane-' followed by `random_string(20)`."""
    # NOTE(review): the original kept a commented-out `return 'dataplane-lol'` here,
    # presumably a fixed name for local debugging — confirm before dropping it.
    suffix = random_string(20)
    return 'dataplane-' + suffix

@pytest.fixture(scope='session')
def namespace():
    """Session-scoped namespace name produced by `random_string(10)`."""
    # NOTE(review): the original kept a commented-out `return 'banana'` here,
    # presumably a fixed name for local debugging — confirm before dropping it.
    generated = random_string(10)
    return generated

@pytest.fixture(scope='session')
def list_namespace():
    """Session-scoped name, produced by `random_string(10)`, for the namespace holding list-test data."""
    # NOTE(review): the original kept a commented-out `return 'list-banana'` here,
    # presumably a fixed name for local debugging — confirm before dropping it.
    generated = random_string(10)
    return generated

@pytest.fixture(scope='session')
def idx(client, index_name, index_host):
return client.Index(name=index_name, host=index_host)
Expand All @@ -57,27 +61,33 @@ def idx(client, index_name, index_host):
def index_host(index_name, metric, spec):
pc = build_client()
print('Creating index with name: ' + index_name)
pc.create_index(
name=index_name,
dimension=2,
metric=metric,
spec=spec
)
if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
dimension=2,
metric=metric,
spec=spec
)
description = pc.describe_index(name=index_name)
yield description.host

print('Deleting index with name: ' + index_name)
pc.delete_index(index_name, -1)

@pytest.fixture(scope='session', autouse=True)
def seed_data(idx, namespace, index_host):
def seed_data(idx, namespace, index_host, list_namespace):
print('Seeding data in host ' + index_host)

print('Seeding list data in namespace "' + list_namespace + '"')
setup_list_data(idx, list_namespace, True)

print('Seeding data in namespace "' + namespace + '"')
setup_data(idx, namespace, False)

print('Seeding data in namespace ""')
setup_data(idx, '', True)

print('Waiting a bit more to ensure freshness')
time.sleep(60)
time.sleep(120)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm assuming this doubling of the sleep time here is intentional due to freshness concerns around the larger number of records.


yield
12 changes: 12 additions & 0 deletions tests/integration/data/seed.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,15 @@ def setup_data(idx, target_namespace, wait):

if wait:
poll_fetch_for_ids_in_namespace(idx, ids=['1', '2', '3', '4', '5', '6', '7', '8', '9'], namespace=target_namespace)

def setup_list_data(idx, target_namespace, wait):
    """
    Seed `target_namespace` with vectors whose ids are the strings '0' through '999'.

    The vectors are upserted in batches of 50, each with values from `embedding_values(2)`,
    to provide enough records for testing list pagination.

    Args:
        idx: Index client exposing `upsert`.
        target_namespace: Namespace the vectors are written to.
        wait: When truthy, poll for id '999' in the namespace before returning.
    """
    batch_size = 50
    for batch_start in range(0, 1000, batch_size):
        batch = [
            (str(batch_start + offset), embedding_values(2))
            for offset in range(batch_size)
        ]
        idx.upsert(vectors=batch, namespace=target_namespace)

    if not wait:
        return
    # '999' is the highest id written above.
    poll_fetch_for_ids_in_namespace(idx, ids=['999'], namespace=target_namespace)