diff --git a/examples/vector_search.py b/examples/vector_search.py index cad4103..29e8b96 100644 --- a/examples/vector_search.py +++ b/examples/vector_search.py @@ -10,7 +10,7 @@ from sentence_transformers import SentenceTransformer from coherence import NamedMap, Session -from coherence.ai import FloatVector, QueryResult, SimilaritySearch, Vectors +from coherence.ai import FloatVector, HnswIndex, QueryResult, SimilaritySearch, Vectors from coherence.extractor import Extractors, ValueExtractor from coherence.filter import Filter, Filters @@ -100,6 +100,18 @@ The SimilaritySearch aggregator is used to perform a Knn vector search on a cache in the same way that normal Coherence aggregators are used. + +HNSW Indexing +============= + +Coherence includes an implementation of the HNSW index that can be used to +speed up searches. The hierarchical navigable small world (HNSW) algorithm is +a graph-based approximate nearest neighbor search technique. + +An index is added to a cache in Coherence by calling the add_index method on +the cache. In this example, an HnswIndex is created with a ValueExtractor that +will extract the vector field from the cache value and an int parameter that +specifies the number of dimensions the vector has. """ @@ -226,6 +238,11 @@ async def do_run() -> None: # NamedMap as a parameter movies_repo = MovieRepository(movie_db) + # To speed up the search, an HNSW index is added to the cache using the + # VALUE_EXTRACTOR for the full plot vector and the dimensions of the + # vector. 
+ await movie_db.add_index(HnswIndex(MovieRepository.VALUE_EXTRACTOR, MovieRepository.EMBEDDING_DIMENSIONS)) + # All of the movies data from filename MOVIE_JSON_FILENAME is # processed and loaded into the movies_repo await movies_repo.load(MOVIE_JSON_FILENAME) diff --git a/src/coherence/ai.py b/src/coherence/ai.py index 3b62ff2..3f308af 100644 --- a/src/coherence/ai.py +++ b/src/coherence/ai.py @@ -392,7 +392,7 @@ class HnswIndex(AbstractEvolvable): def __init__( self, extractor: Union[ValueExtractor[T, E], str], - dimensions: int, + dimension: int, space_name: str = DEFAULT_SPACE_NAME, max_elements: int = DEFAULT_MAX_ELEMENTS, m: int = DEFAULT_M, @@ -404,7 +404,7 @@ def __init__( Creates an instance of HnswIndex class. :param extractor: The ValueExtractor to use to extract the Vector. - :param dimensions: The number of dimensions in the vector. + :param dimension: The number of dimensions in the vector. :param space_name: The index space name. :param max_elements: The maximum number of elements the index can contain. :param m: The number of bidirectional links created for every new element during construction. 
@@ -415,7 +415,7 @@ def __init__( super().__init__() self.extractor = extractor - self.dimensions = dimensions + self.dimension = dimension self.spaceName = space_name if space_name else "" self.maxElements = max_elements self.m = m diff --git a/tests/unit/test_serialization.py b/tests/unit/test_serialization.py index ba27977..e90b788 100644 --- a/tests/unit/test_serialization.py +++ b/tests/unit/test_serialization.py @@ -263,7 +263,7 @@ def test_HnswIndex_serialization() -> None: assert ser == ( b'\x15{"@class": "coherence.hnsw.HnswIndex", "dataVersion": 0, ' b'"binFuture": null, "extractor": {"@class": "extractor.UniversalExtractor", ' - b'"name": "foo", "params": null}, "dimensions": 384, "spaceName": "COSINE", ' + b'"name": "foo", "params": null}, "dimension": 384, "spaceName": "COSINE", ' b'"maxElements": 4096, "m": 16, "efConstruction": 200, "efSearch": 50, ' b'"randomSeed": 100}' )