Skip to content

Commit 2ca1eb9

Browse files
authored
List vector ids by prefix (#307)
## Problem Need to implement the new data plane list endpoint. ## Solution - Update generated code. Pull in list endpoint. All changes under `pinecone/core` are generated from spec files and can be ignored for this review. - Implement changes for list endpoint in: - `pinecone/data/index.py` main implementation - `pinecone/grpc/index_grpc.py` main implementation - `tests/integration/data/conftest.py` adjustments to test setup, mainly to generate a new namespace to hold vectors for list testing data. - `tests/integration/data/seed.py` to upsert a larger number of vectors, so I would have enough data to page through - `tests/integration/data/test_list.py` - `tests/integration/data/test_list_errors.py` ## Open questions - Do we expect to ever return more data than just `{'id': '1'}` in the vectors array? For convenience the `list()` method is currently implemented as a generator function that abstracts the pagination steps and yields a flat list of id values. For a use case where you were going to immediately fetch those ids, this seems ideal. But would be limiting if we ever wanted to return more than just ids here. 
## Usage Install the dev client version: `pip install "pinecone-client[grpc]"==3.1.0.dev1` ### REST ```python from pinecone import Pinecone pc = Pinecone(api_key='xxx') index = pc.Index(host='hosturl') # To iterate over all result pages using a generator function for ids in index.list(prefix='pref', limit=3, namespace='foo'): print(ids) # ['pref1', 'pref2', 'pref3'] # For manual control over pagination results = index.list_paginated( prefix='pref', limit=3, namespace='foo', pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ==' ) print(results.namespace) print([v.id for v in results.vectors]) print(results.pagination.next) print(results.usage) ``` ### GRPC ```python from pinecone.grpc import PineconeGRPC pc = PineconeGRPC(api_key='xxx') index = pc.Index(host='hosturl') # To iterate over all result pages using a generator function for ids in index.list(prefix='pref', limit=3, namespace='foo'): print(ids) # ['pref1', 'pref2', 'pref3'] # For manual control over pagination results = index.list_paginated( prefix='pref', limit=3, namespace='foo', pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ==' ) print(results.namespace) print([v.id for v in results.vectors]) print(results.pagination.next) print(results.usage) ``` ## Type of Change - [x] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) ## Testing Try out the dev version. ``` pip install "pinecone-client[grpc]"==3.1.0.dev1 ```
1 parent 1235fbd commit 2ca1eb9

File tree

8 files changed

+394
-13
lines changed

8 files changed

+394
-13
lines changed

README.md

+47
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,53 @@ update_response = index.update(
339339
)
340340
```
341341

342+
## List vectors
343+
344+
The `list` and `list_paginated` methods can be used to list vector ids matching a particular id prefix.
345+
With clever assignment of vector ids, this can be used to help model hierarchical relationships between
346+
different vectors such as when there are embeddings for multiple chunks or fragments related to the
347+
same document.
348+
349+
The `list` method returns a generator that handles pagination on your behalf.
350+
351+
```python
352+
from pinecone import Pinecone
353+
354+
pc = Pinecone(api_key='xxx')
355+
index = pc.Index(host='hosturl')
356+
357+
# To iterate over all result pages using a generator function
358+
namespace = 'foo-namespace'
359+
for ids in index.list(prefix='pref', limit=3, namespace=namespace):
360+
print(ids) # ['pref1', 'pref2', 'pref3']
361+
362+
# Now you can pass this id array to other methods, such as fetch or delete.
363+
vectors = index.fetch(ids=ids, namespace=namespace)
364+
```
365+
366+
There is also an option to fetch each page of results yourself with `list_paginated`.
367+
368+
```python
369+
from pinecone import Pinecone
370+
371+
pc = Pinecone(api_key='xxx')
372+
index = pc.Index(host='hosturl')
373+
374+
# For manual control over pagination
375+
results = index.list_paginated(
376+
prefix='pref',
377+
limit=3,
378+
namespace='foo',
379+
pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
380+
)
381+
print(results.namespace) # 'foo'
382+
print([v.id for v in results.vectors]) # ['pref1', 'pref2', 'pref3']
383+
print(results.pagination.next) # 'eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
384+
print(results.usage) # { 'read_units': 1 }
385+
```
386+
387+
# Collections
388+
342389
## Create collection
343390

344391
The following example creates the collection `example-collection` from

pinecone/data/index.py

+79-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from tqdm.autonotebook import tqdm
22

3-
from collections.abc import Iterable
43
from typing import Union, List, Tuple, Optional, Dict, Any
54

65
from pinecone.config import ConfigBuilder
@@ -22,9 +21,10 @@
2221
DeleteRequest,
2322
UpdateRequest,
2423
DescribeIndexStatsRequest,
24+
ListResponse
2525
)
2626
from pinecone.core.client.api.data_plane_api import DataPlaneApi
27-
from ..utils import get_user_agent, fix_tuple_length
27+
from ..utils import get_user_agent
2828
from .vector_factory import VectorFactory
2929

3030
__all__ = [
@@ -502,6 +502,83 @@ def describe_index_stats(
502502
),
503503
**{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS},
504504
)
505+
506+
@validate_and_convert_errors
def list_paginated(
    self,
    prefix: Optional[str] = None,
    limit: Optional[int] = None,
    pagination_token: Optional[str] = None,
    namespace: Optional[str] = None,
    **kwargs
) -> ListResponse:
    """
    Fetch one page of vector ids that share an id prefix within a single namespace.

    The response carries the matching ids plus a pagination token that can be passed back in
    to request the following page. The ids can then be handed to fetch or delete operations.

    Prefer the `list` method if you do not want to manage pagination tokens yourself.

    Examples:
        >>> results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace')
        >>> [v.id for v in results.vectors]
        ['99', '990', '991', '992', '993']
        >>> results.pagination.next
        'eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9'
        >>> next_results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace', pagination_token=results.pagination.next)

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
        pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned
                                          in the response if additional results are available. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
        **kwargs: Forwarded unchanged to the underlying data plane API call.

    Returns: ListResponse object which contains the list of ids, the namespace name, pagination information, and usage showing the number of read_units consumed.
    """
    candidate_args = [
        ("prefix", prefix),
        ("limit", limit),
        ("namespace", namespace),
        ("pagination_token", pagination_token),
    ]
    # NOTE(review): presumably drops the entries whose value is None -- confirm against
    # _parse_non_empty_args.
    request_args = self._parse_non_empty_args(candidate_args)
    response = self._vector_api.list(**request_args, **kwargs)
    return response
549+
550+
@validate_and_convert_errors
def list(self, **kwargs):
    """
    The list operation accepts all of the same arguments as list_paginated, and returns a generator that yields
    a list of the matching vector ids in each page of results. It automatically handles pagination tokens on your
    behalf.

    Examples:
        >>> for ids in index.list(prefix='99', limit=5, namespace='my_namespace'):
        >>>     print(ids)
        ['99', '990', '991', '992', '993']
        ['994', '995', '996', '997', '998']
        ['999']

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids in each yielded page (it is passed to every
                               list_paginated call, so it does not cap the overall total). If unspecified,
                               the server will use a default value. [optional]
        pagination_token (Optional[str]): A token identifying the page to start from. Subsequent tokens
                                          are taken from each response automatically. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]

    Yields: A list of id strings for each non-empty page of results.
    """
    done = False
    while not done:
        results = self.list_paginated(**kwargs)
        if len(results.vectors) > 0:
            yield [v.id for v in results.vectors]

        # Continue only when a usable token came back. A pagination object with an
        # empty `next` would otherwise restart from the first page and never finish.
        if results.pagination and results.pagination.next:
            kwargs.update({"pagination_token": results.pagination.next})
        else:
            done = True
505582

506583
@staticmethod
507584
def _parse_non_empty_args(args: List[Tuple[str, Any]]) -> Dict[str, Any]:

pinecone/grpc/index_grpc.py

+98-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
QueryResponse,
1414
DescribeIndexStatsResponse,
1515
)
16+
from pinecone.models.list_response import (
17+
ListResponse as SimpleListResponse,
18+
Pagination
19+
)
1620
from pinecone.core.grpc.protos.vector_service_pb2 import (
1721
Vector as GRPCVector,
1822
QueryVector as GRPCQueryVector,
@@ -22,6 +26,8 @@
2226
QueryRequest,
2327
FetchRequest,
2428
UpdateRequest,
29+
ListRequest,
30+
ListResponse,
2531
DescribeIndexStatsRequest,
2632
DeleteResponse,
2733
UpdateResponse,
@@ -41,7 +47,6 @@ class SparseVectorTypedDict(TypedDict):
4147
indices: List[int]
4248
values: List[float]
4349

44-
4550
class GRPCIndex(GRPCIndexBase):
4651
"""A client for interacting with a Pinecone index via GRPC API."""
4752

@@ -429,6 +434,98 @@ def update(
429434
else:
430435
return self._wrap_grpc_call(self.stub.Update, request, timeout=timeout)
431436

437+
def list_paginated(
    self,
    prefix: Optional[str] = None,
    limit: Optional[int] = None,
    pagination_token: Optional[str] = None,
    namespace: Optional[str] = None,
    **kwargs
) -> SimpleListResponse:
    """
    The list_paginated operation finds vectors based on an id prefix within a single namespace.
    It returns matching ids in a paginated form, with a pagination token to fetch the next page of results.
    This id list can then be passed to fetch or delete operations, depending on your use case.

    Consider using the `list` method to avoid having to handle pagination tokens manually.

    Examples:
        >>> results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace')
        >>> [v.id for v in results.vectors]
        ['99', '990', '991', '992', '993']
        >>> results.pagination.next
        'eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9'
        >>> next_results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace', pagination_token=results.pagination.next)

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
        pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned
                                          in the response if additional results are available. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
        timeout (optional, keyword only): Passed to the gRPC call wrapper rather than to the request message.

    Returns: SimpleListResponse object which contains the list of ids, the namespace name, and pagination
             information (`pagination` is None when there is no further page).
    """
    # Take the call-level timeout out of kwargs before the request is built. The remaining
    # kwargs are passed to ListRequest, and `timeout` is a call option, not a request field.
    timeout = kwargs.pop("timeout", None)
    args_dict = self._parse_non_empty_args(
        [
            ("prefix", prefix),
            ("limit", limit),
            ("namespace", namespace),
            ("pagination_token", pagination_token),
        ]
    )
    request = ListRequest(**args_dict, **kwargs)
    response = self._wrap_grpc_call(self.stub.List, request, timeout=timeout)

    # Treat an empty `next` token as "no more pages" and expose that as None.
    if response.pagination and response.pagination.next != '':
        pagination = Pagination(next=response.pagination.next)
    else:
        pagination = None

    return SimpleListResponse(
        namespace=response.namespace,
        vectors=response.vectors,
        pagination=pagination,
    )
492+
493+
def list(self, **kwargs):
    """
    The list operation accepts all of the same arguments as list_paginated, and returns a generator that yields
    a list of the matching vector ids in each page of results. It automatically handles pagination tokens on your
    behalf.

    Examples:
        >>> for ids in index.list(prefix='99', limit=5, namespace='my_namespace'):
        >>>     print(ids)
        ['99', '990', '991', '992', '993']
        ['994', '995', '996', '997', '998']
        ['999']

    Args:
        prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will
                                be used with the effect of listing all ids in a namespace [optional]
        limit (Optional[int]): The maximum number of ids in each yielded page (it is passed to every
                               list_paginated call, so it does not cap the overall total). If unspecified,
                               the server will use a default value. [optional]
        pagination_token (Optional[str]): A token identifying the page to start from. Subsequent tokens
                                          are taken from each response automatically. [optional]
        namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]

    Yields: A list of id strings for each non-empty page of results.
    """
    done = False
    while not done:
        # Errors from list_paginated propagate to the caller unchanged; no handler is needed.
        results = self.list_paginated(**kwargs)

        if len(results.vectors) > 0:
            yield [v.id for v in results.vectors]

        # Stop as soon as there is no token for a following page.
        if results.pagination and results.pagination.next:
            kwargs.update({"pagination_token": results.pagination.next})
        else:
            done = True
528+
432529
def describe_index_stats(
433530
self, filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, **kwargs
434531
) -> DescribeIndexStatsResponse:

pinecone/models/list_response.py

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from typing import NamedTuple, Optional, List
2+
3+
class Pagination(NamedTuple):
4+
next: str
5+
6+
class ListResponse(NamedTuple):
7+
namespace: str
8+
vectors: List
9+
pagination: Optional[Pagination]

tests/integration/data/conftest.py

+20-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import time
44
import json
55
from ..helpers import get_environment_var, random_string
6-
from .seed import setup_data
6+
from .seed import setup_data, setup_list_data
77

88
# Test matrix needs to consider the following dimensions:
99
# - pod vs serverless
@@ -41,14 +41,18 @@ def spec():
4141

4242
@pytest.fixture(scope='session')
4343
def index_name():
44-
# return 'dataplane-lol'
4544
return 'dataplane-' + random_string(20)
4645

4746
@pytest.fixture(scope='session')
4847
def namespace():
4948
# return 'banana'
5049
return random_string(10)
5150

51+
@pytest.fixture(scope='session')
52+
def list_namespace():
53+
# return 'list-banana'
54+
return random_string(10)
55+
5256
@pytest.fixture(scope='session')
5357
def idx(client, index_name, index_host):
5458
return client.Index(name=index_name, host=index_host)
@@ -57,27 +61,33 @@ def idx(client, index_name, index_host):
5761
def index_host(index_name, metric, spec):
5862
pc = build_client()
5963
print('Creating index with name: ' + index_name)
60-
pc.create_index(
61-
name=index_name,
62-
dimension=2,
63-
metric=metric,
64-
spec=spec
65-
)
64+
if index_name not in pc.list_indexes().names():
65+
pc.create_index(
66+
name=index_name,
67+
dimension=2,
68+
metric=metric,
69+
spec=spec
70+
)
6671
description = pc.describe_index(name=index_name)
6772
yield description.host
73+
6874
print('Deleting index with name: ' + index_name)
6975
pc.delete_index(index_name, -1)
7076

7177
@pytest.fixture(scope='session', autouse=True)
72-
def seed_data(idx, namespace, index_host):
78+
def seed_data(idx, namespace, index_host, list_namespace):
7379
print('Seeding data in host ' + index_host)
7480

81+
print('Seeding list data in namespace "' + list_namespace + '"')
82+
setup_list_data(idx, list_namespace, True)
83+
7584
print('Seeding data in namespace "' + namespace + '"')
7685
setup_data(idx, namespace, False)
7786

7887
print('Seeding data in namespace ""')
7988
setup_data(idx, '', True)
8089

8190
print('Waiting a bit more to ensure freshness')
82-
time.sleep(60)
91+
time.sleep(120)
92+
8393
yield

tests/integration/data/seed.py

+12
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,15 @@ def setup_data(idx, target_namespace, wait):
3232

3333
if wait:
3434
poll_fetch_for_ids_in_namespace(idx, ids=['1', '2', '3', '4', '5', '6', '7', '8', '9'], namespace=target_namespace)
35+
36+
def setup_list_data(idx, target_namespace, wait, count=1000, batch_size=50):
    """
    Upsert `count` vectors with ids '0' .. str(count - 1) for testing list pagination.

    Args:
        idx: Index client used to upsert (and, when waiting, to poll).
        target_namespace: Namespace that receives the vectors.
        wait: When truthy, poll until the last upserted id can be fetched.
        count: Total number of vectors to upsert. Defaults to 1000.
        batch_size: Number of vectors sent per upsert call. Defaults to 50.
    """
    for start in range(0, count, batch_size):
        # Clamp the final batch so no ids beyond `count` are written.
        stop = min(start + batch_size, count)
        idx.upsert(
            vectors=[(str(i), embedding_values(2)) for i in range(start, stop)],
            namespace=target_namespace,
        )

    if wait and count > 0:
        # The highest id is written in the last batch, so it is used as the readiness sentinel.
        poll_fetch_for_ids_in_namespace(idx, ids=[str(count - 1)], namespace=target_namespace)

0 commit comments

Comments
 (0)