List vector ids by prefix (#307)

jhamon · web-flow · commit 2ca1eb99b9f5 · 2024-02-23T18:04:24.000-05:00
## Problem

Need to implement the new data plane list endpoint.

## Solution

- Update generated code. Pull in list endpoint. All changes under
`pinecone/core` are generated from spec files and can be ignored for
this review.
- Implement changes for list endpoint in:
  - `pinecone/data/index.py` main implementation
  - `pinecone/grpc/index_grpc.py` main implementation
  - `tests/integration/data/conftest.py` adjustments to test setup, mainly
    to generate a new namespace to hold vectors for list testing data.
  - `tests/integration/data/seed.py` to upsert a larger number of vectors,
    so I would have enough data to page through
  - `tests/integration/data/test_list.py`
  - `tests/integration/data/test_list_errors.py`

## Open questions

- Do we expect to ever return more data than just `{'id': '1'}` in the
vectors array? For convenience the `list()` method is currently
implemented as a generator function that abstracts the pagination steps
and yields a flat list of id values. For a use case where you were going
to immediately fetch those ids, this seems ideal, but it would be limiting
if we ever wanted to return more than just ids here.
 
## Usage

Install the dev client version: `pip install "pinecone-client[grpc]"==3.1.0.dev1`

### REST

```python
from pinecone import Pinecone

pc = Pinecone(api_key='xxx')
index = pc.Index(host='hosturl')

# To iterate over all result pages using a generator function
for ids in index.list(prefix='pref', limit=3, namespace='foo'):
    print(ids)  # ['pref1', 'pref2', 'pref3']

# For manual control over pagination
results = index.list_paginated(
    prefix='pref', 
    limit=3, 
    namespace='foo', 
    pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
)
print(results.namespace)
print([v.id for v in results.vectors])
print(results.pagination.next)
print(results.usage)
```

### GRPC

```python
from pinecone.grpc import PineconeGRPC

pc = PineconeGRPC(api_key='xxx')
index = pc.Index(host='hosturl')

# To iterate over all result pages using a generator function
for ids in index.list(prefix='pref', limit=3, namespace='foo'):
    print(ids)  # ['pref1', 'pref2', 'pref3']

# For manual control over pagination
results = index.list_paginated(
    prefix='pref', 
    limit=3, 
    namespace='foo', 
    pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
)
print(results.namespace)
print([v.id for v in results.vectors])
print(results.pagination.next)
print(results.usage)
```

## Type of Change

- [x] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to not work as expected)

## Testing

Try out the dev version.

```
pip install "pinecone-client[grpc]"==3.1.0.dev1
```
diff --git a/README.md b/README.md
@@ -339,6 +339,53 @@ update_response = index.update(
 )
 ```
 
+## List vectors
+
+The `list` and `list_paginated` methods can be used to list vector ids matching a particular id prefix. 
+With clever assignment of vector ids, this can be used to help model hierarchical relationships between
+different vectors such as when there are embeddings for multiple chunks or fragments related to the 
+same document.
+
+The `list` method returns a generator that handles pagination on your behalf.
+
+```python
+from pinecone import Pinecone
+
+pc = Pinecone(api_key='xxx')
+index = pc.Index(host='hosturl')
+
+# To iterate over all result pages using a generator function
+namespace = 'foo-namespace'
+for ids in index.list(prefix='pref', limit=3, namespace=namespace):
+    print(ids) # ['pref1', 'pref2', 'pref3']
+
+    # Now you can pass this id array to other methods, such as fetch or delete.
+    vectors = index.fetch(ids=ids, namespace=namespace)
+```
+
+There is also an option to fetch each page of results yourself with `list_paginated`.
+
+```python
+from pinecone import Pinecone
+
+pc = Pinecone(api_key='xxx')
+index = pc.Index(host='hosturl')
+
+# For manual control over pagination
+results = index.list_paginated(
+    prefix='pref',
+    limit=3,
+    namespace='foo',
+    pagination_token='eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
+)
+print(results.namespace) # 'foo'
+print([v.id for v in results.vectors]) # ['pref1', 'pref2', 'pref3']
+print(results.pagination.next) # 'eyJza2lwX3Bhc3QiOiI5IiwicHJlZml4IjpudWxsfQ=='
+print(results.usage) # { 'read_units': 1 }
+```
+
+# Collections
+
 ## Create collection
 
 The following example creates the collection `example-collection` from
diff --git a/pinecone/data/index.py b/pinecone/data/index.py
@@ -1,6 +1,5 @@
 from tqdm.autonotebook import tqdm
 
-from collections.abc import Iterable
 from typing import Union, List, Tuple, Optional, Dict, Any
 
 from pinecone.config import ConfigBuilder
@@ -22,9 +21,10 @@
     DeleteRequest,
     UpdateRequest,
     DescribeIndexStatsRequest,
+    ListResponse
 )
 from pinecone.core.client.api.data_plane_api import DataPlaneApi
-from ..utils import get_user_agent, fix_tuple_length
+from ..utils import get_user_agent
 from .vector_factory import VectorFactory
 
 __all__ = [
@@ -502,6 +502,83 @@ def describe_index_stats(
             ),
             **{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS},
         )
+    
+    @validate_and_convert_errors
+    def list_paginated(
+        self,
+        prefix: Optional[str] = None,
+        limit: Optional[int] = None,
+        pagination_token: Optional[str] = None,
+        namespace: Optional[str] = None,
+        **kwargs
+    ) ->  ListResponse:
+        """
+        The list_paginated operation finds vectors based on an id prefix within a single namespace.
+        It returns matching ids in a paginated form, with a pagination token to fetch the next page of results.
+        This id list can then be passed to fetch or delete operations, depending on your use case.
+        
+        Consider using the `list` method to avoid having to handle pagination tokens manually.
+
+        Examples:
+            >>> results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace')
+            >>> [v.id for v in results.vectors]
+            ['99', '990', '991', '992', '993']
+            >>> results.pagination.next
+            eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9
+            >>> next_results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace', pagination_token=results.pagination.next)
+
+        Args:
+            prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will 
+                                    be used with the effect of listing all ids in a namespace [optional]
+            limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
+            pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned 
+                in the response if additional results are available. [optional]
+            namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
+        
+        Returns: ListResponse object which contains the list of ids, the namespace name, pagination information, and usage showing the number of read_units consumed.
+        """
+        args_dict = self._parse_non_empty_args(
+            [
+                ("prefix", prefix),
+                ("limit", limit),
+                ("namespace", namespace),
+                ("pagination_token", pagination_token),
+            ]
+        )
+        return self._vector_api.list(**args_dict, **kwargs)
+
+    @validate_and_convert_errors
+    def list(self, **kwargs):
+        """
+        The list operation accepts all of the same arguments as list_paginated, and returns a generator that yields
+        a list of the matching vector ids in each page of results. It automatically handles pagination tokens on your
+        behalf.
+
+        Examples:
+            >>> for ids in index.list(prefix='99', limit=5, namespace='my_namespace'):
+            >>>     print(ids)
+            ['99', '990', '991', '992', '993']
+            ['994', '995', '996', '997', '998']
+            ['999']
+
+        Args:
+            prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will 
+                                    be used with the effect of listing all ids in a namespace [optional]
+            limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
+            pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned 
+                in the response if additional results are available. [optional]
+            namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
+        """
+        done = False
+        while not done:
+            results = self.list_paginated(**kwargs)
+            if len(results.vectors) > 0:
+                yield [v.id for v in results.vectors]
+            
+            if results.pagination:
+                kwargs.update({"pagination_token": results.pagination.next})
+            else:
+                done = True
 
     @staticmethod
     def _parse_non_empty_args(args: List[Tuple[str, Any]]) -> Dict[str, Any]:
diff --git a/pinecone/grpc/index_grpc.py b/pinecone/grpc/index_grpc.py
@@ -13,6 +13,10 @@
     QueryResponse,
     DescribeIndexStatsResponse,
 )
+from pinecone.models.list_response import (
+    ListResponse as SimpleListResponse,
+    Pagination
+)
 from pinecone.core.grpc.protos.vector_service_pb2 import (
     Vector as GRPCVector,
     QueryVector as GRPCQueryVector,
@@ -22,6 +26,8 @@
     QueryRequest,
     FetchRequest,
     UpdateRequest,
+    ListRequest,
+    ListResponse,
     DescribeIndexStatsRequest,
     DeleteResponse,
     UpdateResponse,
@@ -41,7 +47,6 @@ class SparseVectorTypedDict(TypedDict):
     indices: List[int]
     values: List[float]
 
-
 class GRPCIndex(GRPCIndexBase):
     """A client for interacting with a Pinecone index via GRPC API."""
 
@@ -429,6 +434,98 @@ def update(
         else:
             return self._wrap_grpc_call(self.stub.Update, request, timeout=timeout)
 
+    def list_paginated(
+            self,
+            prefix: Optional[str] = None,
+            limit: Optional[int] = None,
+            pagination_token: Optional[str] = None,
+            namespace: Optional[str] = None,
+            **kwargs
+        ) -> SimpleListResponse:
+        """
+        The list_paginated operation finds vectors based on an id prefix within a single namespace.
+        It returns matching ids in a paginated form, with a pagination token to fetch the next page of results.
+        This id list can then be passed to fetch or delete operations, depending on your use case.
+        
+        Consider using the `list` method to avoid having to handle pagination tokens manually.
+
+        Examples:
+            >>> results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace')
+            >>> [v.id for v in results.vectors]
+            ['99', '990', '991', '992', '993']
+            >>> results.pagination.next
+            eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9
+            >>> next_results = index.list_paginated(prefix='99', limit=5, namespace='my_namespace', pagination_token=results.pagination.next)
+
+        Args:
+            prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will 
+                                    be used with the effect of listing all ids in a namespace [optional]
+            limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
+            pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned 
+                in the response if additional results are available. [optional]
+            namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
+        
+        Returns: SimpleListResponse object which contains the list of ids, the namespace name, and pagination information (pagination is None when the response carries no next-page token).
+        """
+        args_dict = self._parse_non_empty_args(
+            [
+                ("prefix", prefix),
+                ("limit", limit),
+                ("namespace", namespace),
+                ("pagination_token", pagination_token),
+            ]
+        )
+        timeout = kwargs.pop("timeout", None)  # pop first so `timeout` is not forwarded into the ListRequest constructor below
+        request = ListRequest(**args_dict, **kwargs)
+        response = self._wrap_grpc_call(self.stub.List, request, timeout=timeout)
+        
+        if response.pagination and response.pagination.next != '':
+            pagination = Pagination(next=response.pagination.next)
+        else:
+            pagination = None
+        
+        return SimpleListResponse(
+            namespace=response.namespace,
+            vectors=response.vectors,
+            pagination=pagination,
+        )
+    
+    def list(self, **kwargs):
+        """
+        The list operation accepts all of the same arguments as list_paginated, and returns a generator that yields
+        a list of the matching vector ids in each page of results. It automatically handles pagination tokens on your
+        behalf.
+
+        Examples:
+            >>> for ids in index.list(prefix='99', limit=5, namespace='my_namespace'):
+            >>>     print(ids)
+            ['99', '990', '991', '992', '993']
+            ['994', '995', '996', '997', '998']
+            ['999']
+
+        Args:
+            prefix (Optional[str]): The id prefix to match. If unspecified, an empty string prefix will 
+                                    be used with the effect of listing all ids in a namespace [optional]
+            limit (Optional[int]): The maximum number of ids to return. If unspecified, the server will use a default value. [optional]
+            pagination_token (Optional[str]): A token needed to fetch the next page of results. This token is returned 
+                in the response if additional results are available. [optional]
+            namespace (Optional[str]): The namespace to fetch vectors from. If not specified, the default namespace is used. [optional]
+        """
+        done = False
+        while not done:
+            try:
+                results = self.list_paginated(**kwargs)
+            except Exception as e:
+                raise e
+            
+            if len(results.vectors) > 0:
+                yield [v.id for v in results.vectors]
+            
+            if results.pagination and results.pagination.next:
+                kwargs.update({"pagination_token": results.pagination.next})
+            else:
+                done = True
+
     def describe_index_stats(
         self, filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, **kwargs
     ) -> DescribeIndexStatsResponse:
diff --git a/pinecone/models/list_response.py b/pinecone/models/list_response.py
@@ -0,0 +1,9 @@
+from typing import NamedTuple, Optional, List
+
+class Pagination(NamedTuple):
+    next: str
+
+class ListResponse(NamedTuple):
+    namespace: str
+    vectors: List
+    pagination: Optional[Pagination]
diff --git a/tests/integration/data/conftest.py b/tests/integration/data/conftest.py
@@ -3,7 +3,7 @@
 import time
 import json
 from ..helpers import get_environment_var, random_string
-from .seed import setup_data
+from .seed import setup_data, setup_list_data
 
 # Test matrix needs to consider the following dimensions:
 # - pod vs serverless
@@ -41,14 +41,18 @@ def spec():
 
 @pytest.fixture(scope='session')
 def index_name():
-    # return 'dataplane-lol'
     return 'dataplane-' + random_string(20)
     
 @pytest.fixture(scope='session')
 def namespace():
     # return 'banana'
     return random_string(10)
 
+@pytest.fixture(scope='session')
+def list_namespace():
+    # return 'list-banana'
+    return random_string(10)
+
 @pytest.fixture(scope='session')
 def idx(client, index_name, index_host):
     return client.Index(name=index_name, host=index_host)
@@ -57,27 +61,33 @@ def idx(client, index_name, index_host):
 def index_host(index_name, metric, spec):
     pc = build_client()
     print('Creating index with name: ' + index_name)
-    pc.create_index(
-        name=index_name, 
-        dimension=2, 
-        metric=metric, 
-        spec=spec
-    )
+    if index_name not in pc.list_indexes().names():
+        pc.create_index(
+            name=index_name, 
+            dimension=2, 
+            metric=metric, 
+            spec=spec
+        )
     description = pc.describe_index(name=index_name)
     yield description.host
+
     print('Deleting index with name: ' + index_name)
     pc.delete_index(index_name, -1)
 
 @pytest.fixture(scope='session', autouse=True)
-def seed_data(idx, namespace, index_host):
+def seed_data(idx, namespace, index_host, list_namespace):
     print('Seeding data in host ' + index_host)
 
+    print('Seeding list data in namespace "' + list_namespace + '"')
+    setup_list_data(idx, list_namespace, True)
+
     print('Seeding data in namespace "' + namespace + '"')
     setup_data(idx, namespace, False)
 
     print('Seeding data in namespace ""')
     setup_data(idx, '', True)
 
     print('Waiting a bit more to ensure freshness')
-    time.sleep(60)
+    time.sleep(120)
+
     yield
diff --git a/tests/integration/data/seed.py b/tests/integration/data/seed.py
@@ -32,3 +32,15 @@ def setup_data(idx, target_namespace, wait):
 
     if wait:
         poll_fetch_for_ids_in_namespace(idx, ids=['1', '2', '3', '4', '5', '6', '7', '8', '9'], namespace=target_namespace)
+
+def setup_list_data(idx, target_namespace, wait):
+    # Upsert a bunch more stuff for testing list pagination
+    for i in range(0, 1000, 50):
+        idx.upsert(vectors=[
+                (str(i+d), embedding_values(2)) for d in range(50)
+            ], 
+            namespace=target_namespace
+        )
+    
+    if wait:
+        poll_fetch_for_ids_in_namespace(idx, ids=['999'], namespace=target_namespace)
diff --git a/tests/integration/data/test_list.py b/tests/integration/data/test_list.py
diff --git a/tests/integration/data/test_list_errors.py b/tests/integration/data/test_list_errors.py