Skip to content

Commit

Permalink
Custom query for faster hash lookups (#99)
Browse files Browse the repository at this point in the history
Implementation based on this mailing list discussion: https://lists.gt.net/lucene/java-user/439760
Also modified the tests to check for deterministic results.
  • Loading branch information
alexklibisz authored Jun 29, 2020
1 parent cce765f commit 74815f2
Show file tree
Hide file tree
Showing 20 changed files with 410 additions and 288 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ object ContinuousBenchmark extends App {
NearestNeighborsQuery.Exact(field, Similarity.L2),
Mapping.L2Lsh(randomDenseFloats.dims, 400, 1, 3),
Seq(
Query(NearestNeighborsQuery.L2Lsh(field, 1000), k),
Query(NearestNeighborsQuery.L2Lsh(field, 1300, useMLTQuery = true), k)
Query(NearestNeighborsQuery.L2Lsh(field, 1000), k)
)
),
// Angular exact, LSH
Expand All @@ -34,8 +33,7 @@ object ContinuousBenchmark extends App {
NearestNeighborsQuery.Exact(field, Similarity.Angular),
Mapping.AngularLsh(randomDenseFloats.dims, 400, 1),
Seq(
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k),
Query(NearestNeighborsQuery.AngularLsh(field, 1300, useMLTQuery = true), k),
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k)
)
),
// Jaccard exact, sparse indexed, LSH
Expand All @@ -45,8 +43,7 @@ object ContinuousBenchmark extends App {
NearestNeighborsQuery.Exact(field, Similarity.Jaccard),
Mapping.JaccardLsh(randomSparseBools.dims, 400, 1),
Seq(
Query(NearestNeighborsQuery.JaccardLsh(field, 1000), k),
Query(NearestNeighborsQuery.JaccardLsh(field, 1300, useMLTQuery = true), k)
Query(NearestNeighborsQuery.JaccardLsh(field, 1000), k)
)
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ object Execute extends App {
} yield ()
}
_ <- eknnClient.execute(refreshIndex(trainIndexName, testIndexName))
_ <- eknnClient.execute(forceMerge(trainIndexName, testIndexName))
_ <- eknnClient.execute(forceMerge(trainIndexName, testIndexName).maxSegments(1))
} yield ()
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,10 @@ package object benchmarks {
)

private def algorithmName(m: Mapping, q: NearestNeighborsQuery): String = m match {
case _: SparseBool => s"Exact"
case _: DenseFloat => "Exact"
case _: SparseIndexed => "Sparse indexed"
case _: JaccardLsh | _: HammingLsh | _: AngularLsh | _: L2Lsh =>
q match {
case lsh: NearestNeighborsQuery.LshQuery if lsh.useMLTQuery => "LSH w/ MLT"
case _ => "LSH"
}
case _: SparseBool => s"Exact"
case _: DenseFloat => "Exact"
case _: SparseIndexed => "Sparse indexed"
case _: JaccardLsh | _: HammingLsh | _: AngularLsh | _: L2Lsh => "LSH"
}

def apply(benchmarkResult: BenchmarkResult): AggregateResult = {
Expand Down
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
- Re-implemented LSH and sparse-indexed queries using an optimized custom Lucene query based on the [TermInSetQuery](https://lucene.apache.org/core/8_5_0/core/org/apache/lucene/search/TermInSetQuery.html).
This is 3-5x faster on LSH benchmarks.
- Updated L1 and L2 similarities such that they're bounded in [0,1].
---
- Added an option for LSH queries to use the more-like-this heuristics to pick a subset of LSH hashes to retrieve candidate vectors.
Expand Down
63 changes: 63 additions & 0 deletions core/src/main/java/com/klibisz/elastiknn/utils/ArrayUtils.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.klibisz.elastiknn.utils;

import java.util.Arrays;

/**
* Java implementations of some hot spots.
*/
Expand Down Expand Up @@ -33,4 +35,65 @@ public static int sortedIntersectionCount(final int [] xs, final int [] ys) {
return n;
}

/**
* Find the kth largest value in the given array.
* Swaps elements in the given array.
* Based on: https://github.com/bephrem1/backtobackswe, https://www.youtube.com/watch?v=hGK_5n81drs.
* Lucene also has an implementation: https://lucene.apache.org/core/8_0_0/core/org/apache/lucene/util/IntroSelector.html,
* but it's more abstract and was slower when I benchmarked it.
* @param arr The array.
* @param k The position.
* @return The index of the kth largest value.
*/
/**
 * Find the kth largest value in the given array using quickselect.
 * Swaps elements in the given array; use quickSelectCopy to leave the input unmodified.
 * Based on: https://github.com/bephrem1/backtobackswe, https://www.youtube.com/watch?v=hGK_5n81drs.
 * Lucene also has an implementation: https://lucene.apache.org/core/8_0_0/core/org/apache/lucene/util/IntroSelector.html,
 * but it's more abstract and was slower when I benchmarked it.
 * @param arr The array; must be non-empty.
 * @param k The position, 1-based: k = 1 selects the largest element, k = arr.length the smallest.
 * @return The kth largest value in the array (the value itself, not its index).
 * @throws IllegalArgumentException if arr is empty or k is outside [1, arr.length].
 */
public static int quickSelect(int[] arr, int k) {
    final int n = arr.length;
    // Fail fast on invalid input: the unguarded loop silently returns an arbitrary
    // element for out-of-range k and throws ArrayIndexOutOfBoundsException on an
    // empty array.
    if (n == 0 || k < 1 || k > n) {
        throw new IllegalArgumentException("k must be in [1, " + n + "] but got " + k);
    }
    // The kth largest element sits at index n - k once the array is sorted. Each
    // partition places the chosen pivot at its final sorted position, so iterate
    // into whichever side of the pivot contains index n - k.
    int left = 0;
    int right = n - 1;
    int pivotFinalIndex = 0;
    while (left <= right) {
        int pivotIndex = left + (right - left + 1) / 2; // middle of current window
        pivotFinalIndex = qsPartition(arr, left, right, pivotIndex);
        if (pivotFinalIndex == n - k) {
            break;
        } else if (pivotFinalIndex > n - k) {
            right = pivotFinalIndex - 1;
        } else {
            left = pivotFinalIndex + 1;
        }
    }
    return arr[pivotFinalIndex];
}

/**
 * Same as quickSelect, except makes a copy of the array so the original is unmodified.
 * @param arr The array; must be non-empty. It is copied, never mutated.
 * @param k The position, 1-based, as in quickSelect.
 * @return The kth largest value in the array.
 * @throws IllegalArgumentException if arr is empty or k is outside [1, arr.length].
 */
public static int quickSelectCopy(int[] arr, int k) {
    return quickSelect(Arrays.copyOf(arr, arr.length), k);
}

/**
 * Lomuto-style partition used by quickSelect: moves every element smaller than the
 * pivot value to the left of the pivot's final position and returns that position.
 * @param arr The array to partition (mutated in place).
 * @param left Inclusive lower bound of the window to partition.
 * @param right Inclusive upper bound of the window to partition.
 * @param pivotIndex Index within [left, right] of the element to use as pivot.
 * @return The pivot's final index after partitioning.
 */
private static int qsPartition(int[] arr, int left, int right, int pivotIndex) {
    int pivotValue = arr[pivotIndex];
    int lesserItemsTailIndex = left;
    qsSwap(arr, pivotIndex, right); // park the pivot at the end of the window
    for (int i = left; i < right; i++) {
        if (arr[i] < pivotValue) {
            qsSwap(arr, i, lesserItemsTailIndex);
            lesserItemsTailIndex++;
        }
    }
    qsSwap(arr, right, lesserItemsTailIndex); // move the pivot to its final spot
    return lesserItemsTailIndex;
}

/** Swap the elements at the two given indices (the indices may be equal). */
private static void qsSwap(int[] arr, int first, int second) {
    int temp = arr[first];
    arr[first] = arr[second];
    arr[second] = temp;
}


}
11 changes: 4 additions & 7 deletions core/src/main/scala/com/klibisz/elastiknn/api/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -110,27 +110,24 @@ package object api {

sealed trait LshQuery extends NearestNeighborsQuery {
def candidates: Int

@Experimental
def useMLTQuery: Boolean
}

final case class JaccardLsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class JaccardLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Jaccard
}

final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Hamming
}

final case class AngularLsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class AngularLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Angular
}

final case class L2Lsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class L2Lsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.L2
}
Expand Down
19 changes: 14 additions & 5 deletions core/src/main/scala/com/klibisz/elastiknn/models/LshFunction.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import com.klibisz.elastiknn.api.{Mapping, Vec}
import com.klibisz.elastiknn.storage
import com.klibisz.elastiknn.storage.StoredVec

import scala.annotation.tailrec
import scala.util.Random

sealed trait LshFunction[M <: Mapping, V <: Vec, S <: StoredVec] extends (V => Array[Int]) {
Expand Down Expand Up @@ -84,7 +85,14 @@ object LshFunction {

import mapping._
private val rng: Random = new Random(0)
private val sampledIndices: Array[Int] = (0 until bits).map(_ => rng.nextInt(dims)).sorted.toArray

// Sample `bits` distinct indices in [0, dims) without replacement (capped at `dims`
// when bits > dims). Sorting matters: the hashing loop below presumably walks these
// indices in ascending order against a sorted sparse vector — TODO confirm.
private val sampledIndices: Array[Int] = {
// Tail-recursive rejection sampling: retry on an already-seen index until the target
// count is reached. Deterministic because `rng` is seeded with a constant (0) above.
@tailrec
def sample(acc: Set[Int], i: Int): Set[Int] =
if (acc.size == bits.min(dims)) acc else if (acc(i)) sample(acc, rng.nextInt(dims)) else sample(acc + i, rng.nextInt(dims))
sample(Set.empty, rng.nextInt(dims)).toArray.sorted
}

override def apply(vec: Vec.SparseBool): Array[Int] = {
val hashes = new Array[Int](bits)
Expand Down Expand Up @@ -182,10 +190,10 @@ object LshFunction {
private val biases: Array[Float] = (0 until (bands * rows)).map(_ => rng.nextFloat() * width).toArray

override def apply(v: Vec.DenseFloat): Array[Int] = {
val bandHashes = new Array[Int](bands)
val bandHashes = collection.mutable.Set.empty[Int]
var ixBandHashes = 0
var ixHashVecs = 0
while (ixBandHashes < bandHashes.length) {
while (ixBandHashes < bands) {
var bandHash = ixBandHashes
var ixRows = 0
while (ixRows < rows) {
Expand All @@ -194,10 +202,11 @@ object LshFunction {
ixRows += 1
ixHashVecs += 1
}
bandHashes.update(ixBandHashes, bandHash)
bandHashes.add(bandHash)
ixBandHashes += 1
}
bandHashes
// TODO: figure out how to reduce number of duplicate hashes and just use an array allocated at the start of the function.
bandHashes.toArray
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ package com.klibisz.elastiknn

package object models {

private[models] val HASH_PRIME: Int = 2038074743
private[models] val HASH_PRIME: Int = 2038074743 // 2038074769

}
47 changes: 17 additions & 30 deletions docs/pages/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -491,24 +491,16 @@ They first retrieve approximate neighbors based on common hash terms and then co
The exact steps are as follows:

1. Hash the query vector using model parameters that were specified in the indexed vector's mapping.
2. Use the hash values to create an approximate query using one of two strategies:
- A [boolean query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html) with a _should_ clauses for every hash.
This is currently the default.
- A boolean query with _should match_ clauses for a subset of terms, with terms picked using heuristics from the
[More Like This query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html).
This is generally faster, but is still experimental. To use this strategy, set `useMLTQuery` to `true` in the query JSON.
3. Execute the approximate query to get vectors with matching hashes.
3. Pass Vectors with matching hashes to a scoring function which maintains a min-heap of size `candidates`.
The heap maintains the highest-scoring approximate neighbors.
If an indexed vector exceeds the lowest approximate score in the heap, compute its exact similarity and replace it in the heap.
Otherwise, it gets a score of 0.

If you set `candidates` to 0, the query skips all heap-related logic and exact similarity computations.
The score for each vector is the number of intersecting hash terms.

It seems reasonable to set `candidates` to 2x to 10x larger than the number of hits you want to return (Elasticsearch defaults to 10 hits).
For example, if you have 100,000 vectors in your index, you want the 10 nearest neighbors, and you set `candidates` to 100,
then your query will compute the exact similarity roughly 100 times per shard.
2. Use the hash values to construct and execute a query that finds other vectors with the same hash values.
The query is a modification of Lucene's [TermInSetQuery](https://lucene.apache.org/core/8_5_0/core/org/apache/lucene/search/TermInSetQuery.html).
3. Take the top vectors with the most matching hashes and compute their exact similarity to the query vector.
The `candidates` parameter controls the number of exact similarity computations.
Specifically, we compute exact similarity for the top _`candidates`_ candidate vectors in each segment.
As a reminder, each Elasticsearch index has >= 1 shards, and each shard has >= 1 segments.
That means if you set `"candidates": 200` for an index with 2 shards, each with 3 segments, then you'll compute the
exact similarity for `2 * 3 * 200 = 1200` vectors.
`candidates` must be set to a number greater than or equal to the number of Elasticsearch results you want to get.
Higher values generally mean higher recall and higher latency.

### Jaccard LSH Query

Expand All @@ -526,8 +518,7 @@ GET /my-index/_search
},
"model": "lsh", # 3
"similarity": "jaccard", # 4
"candidates": 50, # 5
"useMLTQuery": false # 6
"candidates": 50 # 5
}
}
}
Expand All @@ -539,8 +530,7 @@ GET /my-index/_search
|2|Query vector. Must be literal sparse bool or a pointer to an indexed sparse bool vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|

### Hamming LSH Query

Expand All @@ -558,8 +548,7 @@ GET /my-index/_search
},
"model": "lsh", # 3
"similarity": "hamming", # 4
"candidates": 50, # 5
"useMLTQuery": false # 6
"candidates": 50 # 5
}
}
}
Expand All @@ -571,7 +560,7 @@ GET /my-index/_search
|2|Query vector. Must be literal sparse bool or a pointer to an indexed sparse bool vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

### Angular LSH Query
Expand All @@ -589,8 +578,7 @@ GET /my-index/_search
},
"model": "lsh", # 3
"similarity": "angular", # 4
"candidates": 50, # 5
"useMLTQuery": false # 6
"candidates": 50 # 5
}
}
}
Expand All @@ -602,7 +590,7 @@ GET /my-index/_search
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

### L1 LSH Query
Expand All @@ -625,7 +613,6 @@ GET /my-index/_search
"model": "lsh", # 3
"similarity": "l2", # 4
"candidates": 50 # 5
"useMLTQuery": false # 6
}
}
}
Expand All @@ -637,7 +624,7 @@ GET /my-index/_search
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

## Mapping and Query Compatibility
Expand Down
1 change: 1 addition & 0 deletions plugin/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies {
implementation "org.scala-lang:scala-library:${scalaVersion}"
implementation "com.google.guava:guava:28.1-jre"
runtime "com.google.guava:guava:28.1-jre"
implementation 'com.carrotsearch:hppc:0.8.2'
}

esplugin {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import java.util

import com.klibisz.elastiknn.api.ElasticsearchCodec._
import com.klibisz.elastiknn.api.{ElasticsearchCodec, JavaJsonMap, Mapping, Vec}
import com.klibisz.elastiknn.query.{ExactQuery, LshQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.query.{ExactQuery, LshFunctionCache, LshQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.{ELASTIKNN_NAME, VectorDimensionException}
import io.circe.syntax._
import io.circe.{Json, JsonObject}
Expand Down Expand Up @@ -32,8 +32,8 @@ object VectorMapper {
mapping match {
case Mapping.SparseBool(_) => Try(ExactQuery.index(field, sorted))
case Mapping.SparseIndexed(_) => Try(SparseIndexedQuery.index(field, fieldType, sorted))
case m: Mapping.JaccardLsh => Try(LshQuery.index(field, fieldType, sorted, m))
case m: Mapping.HammingLsh => Try(LshQuery.index(field, fieldType, sorted, m))
case m: Mapping.JaccardLsh => Try(LshQuery.index(field, fieldType, sorted, LshFunctionCache.Jaccard(m)))
case m: Mapping.HammingLsh => Try(LshQuery.index(field, fieldType, sorted, LshFunctionCache.Hamming(m)))
case _ => Failure(incompatible(mapping, vec))
}
}
Expand All @@ -47,8 +47,8 @@ object VectorMapper {
else
mapping match {
case Mapping.DenseFloat(_) => Try(ExactQuery.index(field, vec))
case m: Mapping.AngularLsh => Try(LshQuery.index(field, fieldType, vec, m))
case m: Mapping.L2Lsh => Try(LshQuery.index(field, fieldType, vec, m))
case m: Mapping.AngularLsh => Try(LshQuery.index(field, fieldType, vec, LshFunctionCache.Angular(m)))
case m: Mapping.L2Lsh => Try(LshQuery.index(field, fieldType, vec, LshFunctionCache.L2(m)))
case _ => Failure(incompatible(mapping, vec))
}
}
Expand All @@ -65,6 +65,7 @@ object VectorMapper {
setBoost(1f)
setTokenized(false)
setIndexOptions(IndexOptions.DOCS)
setStoreTermVectors(false)

override def typeName(): String = typeName
override def clone(): FieldType = new FieldType(typeName)
Expand Down
Loading

0 comments on commit 74815f2

Please sign in to comment.