Skip to content

Commit

Permalink
Custom query for faster hash lookups (#99)
Browse files Browse the repository at this point in the history
Implementation based on this mailing list discussion: https://lists.gt.net/lucene/java-user/439760
Also modified the tests to check for deterministic results.
  • Loading branch information
alexklibisz authored Jun 29, 2020
1 parent cce765f commit 74815f2
Show file tree
Hide file tree
Showing 20 changed files with 410 additions and 288 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ object ContinuousBenchmark extends App {
NearestNeighborsQuery.Exact(field, Similarity.L2),
Mapping.L2Lsh(randomDenseFloats.dims, 400, 1, 3),
Seq(
Query(NearestNeighborsQuery.L2Lsh(field, 1000), k),
Query(NearestNeighborsQuery.L2Lsh(field, 1300, useMLTQuery = true), k)
Query(NearestNeighborsQuery.L2Lsh(field, 1000), k)
)
),
// Angular exact, LSH
Expand All @@ -34,8 +33,7 @@ object ContinuousBenchmark extends App {
NearestNeighborsQuery.Exact(field, Similarity.Angular),
Mapping.AngularLsh(randomDenseFloats.dims, 400, 1),
Seq(
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k),
Query(NearestNeighborsQuery.AngularLsh(field, 1300, useMLTQuery = true), k),
Query(NearestNeighborsQuery.AngularLsh(field, 1000), k)
)
),
// Jaccard exact, sparse indexed, LSH
Expand All @@ -45,8 +43,7 @@ object ContinuousBenchmark extends App {
NearestNeighborsQuery.Exact(field, Similarity.Jaccard),
Mapping.JaccardLsh(randomSparseBools.dims, 400, 1),
Seq(
Query(NearestNeighborsQuery.JaccardLsh(field, 1000), k),
Query(NearestNeighborsQuery.JaccardLsh(field, 1300, useMLTQuery = true), k)
Query(NearestNeighborsQuery.JaccardLsh(field, 1000), k)
)
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ object Execute extends App {
} yield ()
}
_ <- eknnClient.execute(refreshIndex(trainIndexName, testIndexName))
_ <- eknnClient.execute(forceMerge(trainIndexName, testIndexName))
_ <- eknnClient.execute(forceMerge(trainIndexName, testIndexName).maxSegments(1))
} yield ()
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,10 @@ package object benchmarks {
)

private def algorithmName(m: Mapping, q: NearestNeighborsQuery): String = m match {
case _: SparseBool => s"Exact"
case _: DenseFloat => "Exact"
case _: SparseIndexed => "Sparse indexed"
case _: JaccardLsh | _: HammingLsh | _: AngularLsh | _: L2Lsh =>
q match {
case lsh: NearestNeighborsQuery.LshQuery if lsh.useMLTQuery => "LSH w/ MLT"
case _ => "LSH"
}
case _: SparseBool => s"Exact"
case _: DenseFloat => "Exact"
case _: SparseIndexed => "Sparse indexed"
case _: JaccardLsh | _: HammingLsh | _: AngularLsh | _: L2Lsh => "LSH"
}

def apply(benchmarkResult: BenchmarkResult): AggregateResult = {
Expand Down
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
- Re-implemented LSH and sparse-indexed queries using an optimized custom Lucene query based on the [TermInSetQuery](https://lucene.apache.org/core/8_5_0/core/org/apache/lucene/search/TermInSetQuery.html).
This is 3-5x faster on LSH benchmarks.
- Updated L1 and L2 similarities such that they're bounded in [0,1].
---
- Added an option for LSH queries to use the more-like-this heuristics to pick a subset of LSH hashes to retrieve candidate vectors.
Expand Down
63 changes: 63 additions & 0 deletions core/src/main/java/com/klibisz/elastiknn/utils/ArrayUtils.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.klibisz.elastiknn.utils;

import java.util.Arrays;

/**
* Java implementations of some hot spots.
*/
Expand Down Expand Up @@ -33,4 +35,65 @@ public static int sortedIntersectionCount(final int [] xs, final int [] ys) {
return n;
}

/**
* Find the kth largest value in the given array.
* Swaps elements in the given array.
* Based on: https://github.com/bephrem1/backtobackswe, https://www.youtube.com/watch?v=hGK_5n81drs.
* Lucene also has an implementation: https://lucene.apache.org/core/8_0_0/core/org/apache/lucene/util/IntroSelector.html,
* but it's more abstract and was slower when I benchmarked it.
* @param arr The array.
* @param k The position.
* @return The index of the kth largest value.
*/
/**
 * Find the kth largest value in the given array using quickselect.
 * Swaps elements in the given array; use quickSelectCopy to leave the input unmodified.
 * Based on: https://github.com/bephrem1/backtobackswe, https://www.youtube.com/watch?v=hGK_5n81drs.
 * Lucene also has an implementation: https://lucene.apache.org/core/8_0_0/core/org/apache/lucene/util/IntroSelector.html,
 * but it's more abstract and was slower when I benchmarked it.
 * @param arr The array; must be non-empty.
 * @param k The position, 1-based: k = 1 selects the largest element, k = arr.length the smallest.
 * @return The kth largest value in the array (the value itself, not its index).
 * @throws IllegalArgumentException if arr is empty or k is outside [1, arr.length].
 */
public static int quickSelect(int[] arr, int k) {
    final int n = arr.length;
    // Fail fast on invalid input: the unguarded loop silently returns an arbitrary
    // element for out-of-range k and throws ArrayIndexOutOfBoundsException on an
    // empty array.
    if (n == 0 || k < 1 || k > n) {
        throw new IllegalArgumentException("k must be in [1, " + n + "] but got " + k);
    }
    // The kth largest element sits at index n - k once the array is sorted. Each
    // partition places the chosen pivot at its final sorted position, so iterate
    // into whichever side of the pivot contains index n - k.
    int left = 0;
    int right = n - 1;
    int pivotFinalIndex = 0;
    while (left <= right) {
        int pivotIndex = left + (right - left + 1) / 2; // middle of current window
        pivotFinalIndex = qsPartition(arr, left, right, pivotIndex);
        if (pivotFinalIndex == n - k) {
            break;
        } else if (pivotFinalIndex > n - k) {
            right = pivotFinalIndex - 1;
        } else {
            left = pivotFinalIndex + 1;
        }
    }
    return arr[pivotFinalIndex];
}

/**
 * Same as quickSelect, except makes a copy of the array so the original is unmodified.
 * @param arr The array; must be non-empty. It is copied, never mutated.
 * @param k The position, 1-based, as in quickSelect.
 * @return The kth largest value in the array.
 * @throws IllegalArgumentException if arr is empty or k is outside [1, arr.length].
 */
public static int quickSelectCopy(int[] arr, int k) {
    return quickSelect(Arrays.copyOf(arr, arr.length), k);
}

/**
 * Lomuto-style partition used by quickSelect: moves every element smaller than the
 * pivot value to the left of the pivot's final position and returns that position.
 * @param arr The array to partition (mutated in place).
 * @param left Inclusive lower bound of the window to partition.
 * @param right Inclusive upper bound of the window to partition.
 * @param pivotIndex Index within [left, right] of the element to use as pivot.
 * @return The pivot's final index after partitioning.
 */
private static int qsPartition(int[] arr, int left, int right, int pivotIndex) {
    int pivotValue = arr[pivotIndex];
    int lesserItemsTailIndex = left;
    qsSwap(arr, pivotIndex, right); // park the pivot at the end of the window
    for (int i = left; i < right; i++) {
        if (arr[i] < pivotValue) {
            qsSwap(arr, i, lesserItemsTailIndex);
            lesserItemsTailIndex++;
        }
    }
    qsSwap(arr, right, lesserItemsTailIndex); // move the pivot to its final spot
    return lesserItemsTailIndex;
}

/** Swap the elements at the two given indices (the indices may be equal). */
private static void qsSwap(int[] arr, int first, int second) {
    int temp = arr[first];
    arr[first] = arr[second];
    arr[second] = temp;
}


}
11 changes: 4 additions & 7 deletions core/src/main/scala/com/klibisz/elastiknn/api/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -110,27 +110,24 @@ package object api {

sealed trait LshQuery extends NearestNeighborsQuery {
def candidates: Int

@Experimental
def useMLTQuery: Boolean
}

final case class JaccardLsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class JaccardLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Jaccard
}

final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Hamming
}

final case class AngularLsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class AngularLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.Angular
}

final case class L2Lsh(field: String, candidates: Int, vec: Vec = Vec.Empty(), useMLTQuery: Boolean = false) extends LshQuery {
final case class L2Lsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends LshQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
override def similarity: Similarity = Similarity.L2
}
Expand Down
19 changes: 14 additions & 5 deletions core/src/main/scala/com/klibisz/elastiknn/models/LshFunction.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import com.klibisz.elastiknn.api.{Mapping, Vec}
import com.klibisz.elastiknn.storage
import com.klibisz.elastiknn.storage.StoredVec

import scala.annotation.tailrec
import scala.util.Random

sealed trait LshFunction[M <: Mapping, V <: Vec, S <: StoredVec] extends (V => Array[Int]) {
Expand Down Expand Up @@ -84,7 +85,14 @@ object LshFunction {

import mapping._
private val rng: Random = new Random(0)
private val sampledIndices: Array[Int] = (0 until bits).map(_ => rng.nextInt(dims)).sorted.toArray

// Sample `bits` distinct indices in [0, dims) without replacement (capped at `dims`
// when bits > dims). Sorting matters: the hashing loop below presumably walks these
// indices in ascending order against a sorted sparse vector — TODO confirm.
private val sampledIndices: Array[Int] = {
// Tail-recursive rejection sampling: retry on an already-seen index until the target
// count is reached. Deterministic because `rng` is seeded with a constant (0) above.
@tailrec
def sample(acc: Set[Int], i: Int): Set[Int] =
if (acc.size == bits.min(dims)) acc else if (acc(i)) sample(acc, rng.nextInt(dims)) else sample(acc + i, rng.nextInt(dims))
sample(Set.empty, rng.nextInt(dims)).toArray.sorted
}

override def apply(vec: Vec.SparseBool): Array[Int] = {
val hashes = new Array[Int](bits)
Expand Down Expand Up @@ -182,10 +190,10 @@ object LshFunction {
private val biases: Array[Float] = (0 until (bands * rows)).map(_ => rng.nextFloat() * width).toArray

override def apply(v: Vec.DenseFloat): Array[Int] = {
val bandHashes = new Array[Int](bands)
val bandHashes = collection.mutable.Set.empty[Int]
var ixBandHashes = 0
var ixHashVecs = 0
while (ixBandHashes < bandHashes.length) {
while (ixBandHashes < bands) {
var bandHash = ixBandHashes
var ixRows = 0
while (ixRows < rows) {
Expand All @@ -194,10 +202,11 @@ object LshFunction {
ixRows += 1
ixHashVecs += 1
}
bandHashes.update(ixBandHashes, bandHash)
bandHashes.add(bandHash)
ixBandHashes += 1
}
bandHashes
// TODO: figure out how to reduce number of duplicate hashes and just use an array allocated at the start of the function.
bandHashes.toArray
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ package com.klibisz.elastiknn

package object models {

private[models] val HASH_PRIME: Int = 2038074743
private[models] val HASH_PRIME: Int = 2038074743 // 2038074769

}
47 changes: 17 additions & 30 deletions docs/pages/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -491,24 +491,16 @@ They first retrieve approximate neighbors based on common hash terms and then co
The exact steps are as follows:

1. Hash the query vector using model parameters that were specified in the indexed vector's mapping.
2. Use the hash values to create an approximate query using one of two strategies:
- A [boolean query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html) with a _should_ clauses for every hash.
This is currently the default.
- A boolean query with _should match_ clauses for a subset of terms, with terms picked using heuristics from the
[More Like This query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html).
This is generally faster, but is still experimental. To use this strategy, set `useMLTQuery` to `true` in the query JSON.
3. Execute the approximate query to get vectors with matching hashes.
3. Pass Vectors with matching hashes to a scoring function which maintains a min-heap of size `candidates`.
The heap maintains the highest-scoring approximate neighbors.
If an indexed vector exceeds the lowest approximate score in the heap, compute its exact similarity and replace it in the heap.
Otherwise, it gets a score of 0.

If you set `candidates` to 0, the query skips all heap-related logic and exact similarity computations.
The score for each vector is the number of intersecting hash terms.

It seems reasonable to set `candidates` to 2x to 10x larger than the number of hits you want to return (Elasticsearch defaults to 10 hits).
For example, if you have 100,000 vectors in your index, you want the 10 nearest neighbors, and you set `candidates` to 100,
then your query will compute the exact similarity roughly 100 times per shard.
2. Use the hash values to construct and execute a query that finds other vectors with the same hash values.
The query is a modification of Lucene's [TermInSetQuery](https://lucene.apache.org/core/8_5_0/core/org/apache/lucene/search/TermInSetQuery.html).
3. Take the top vectors with the most matching hashes and compute their exact similarity to the query vector.
The `candidates` parameter controls the number of exact similarity computations.
Specifically, we compute exact similarity for the top _`candidates`_ candidate vectors in each segment.
As a reminder, each Elasticsearch index has >= 1 shards, and each shard has >= 1 segments.
That means if you set `"candidates": 200` for an index with 2 shards, each with 3 segments, then you'll compute the
exact similarity for `2 * 3 * 200 = 1200` vectors.
`candidates` must be set to a number greater than or equal to the number of Elasticsearch results you want to get.
Higher values generally mean higher recall and higher latency.

### Jaccard LSH Query

Expand All @@ -526,8 +518,7 @@ GET /my-index/_search
},
"model": "lsh", # 3
"similarity": "jaccard", # 4
"candidates": 50, # 5
"useMLTQuery": false # 6
"candidates": 50 # 5
}
}
}
Expand All @@ -539,8 +530,7 @@ GET /my-index/_search
|2|Query vector. Must be literal sparse bool or a pointer to an indexed sparse bool vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|

### Hamming LSH Query

Expand All @@ -558,8 +548,7 @@ GET /my-index/_search
},
"model": "lsh", # 3
"similarity": "hamming", # 4
"candidates": 50, # 5
"useMLTQuery": false # 6
"candidates": 50 # 5
}
}
}
Expand All @@ -571,7 +560,7 @@ GET /my-index/_search
|2|Query vector. Must be literal sparse bool or a pointer to an indexed sparse bool vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

### Angular LSH Query
Expand All @@ -589,8 +578,7 @@ GET /my-index/_search
},
"model": "lsh", # 3
"similarity": "angular", # 4
"candidates": 50, # 5
"useMLTQuery": false # 6
"candidates": 50 # 5
}
}
}
Expand All @@ -602,7 +590,7 @@ GET /my-index/_search
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

### L1 LSH Query
Expand All @@ -625,7 +613,6 @@ GET /my-index/_search
"model": "lsh", # 3
"similarity": "l2", # 4
"candidates": 50 # 5
"useMLTQuery": false # 6
}
}
}
Expand All @@ -637,7 +624,7 @@ GET /my-index/_search
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
|3|Model name.|
|4|Similarity function.|
|5|Number of candidates. See the section on LSH Search Strategy.|
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|

## Mapping and Query Compatibility
Expand Down
1 change: 1 addition & 0 deletions plugin/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies {
implementation "org.scala-lang:scala-library:${scalaVersion}"
implementation "com.google.guava:guava:28.1-jre"
runtime "com.google.guava:guava:28.1-jre"
implementation 'com.carrotsearch:hppc:0.8.2'
}

esplugin {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import java.util

import com.klibisz.elastiknn.api.ElasticsearchCodec._
import com.klibisz.elastiknn.api.{ElasticsearchCodec, JavaJsonMap, Mapping, Vec}
import com.klibisz.elastiknn.query.{ExactQuery, LshQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.query.{ExactQuery, LshFunctionCache, LshQuery, SparseIndexedQuery}
import com.klibisz.elastiknn.{ELASTIKNN_NAME, VectorDimensionException}
import io.circe.syntax._
import io.circe.{Json, JsonObject}
Expand Down Expand Up @@ -32,8 +32,8 @@ object VectorMapper {
mapping match {
case Mapping.SparseBool(_) => Try(ExactQuery.index(field, sorted))
case Mapping.SparseIndexed(_) => Try(SparseIndexedQuery.index(field, fieldType, sorted))
case m: Mapping.JaccardLsh => Try(LshQuery.index(field, fieldType, sorted, m))
case m: Mapping.HammingLsh => Try(LshQuery.index(field, fieldType, sorted, m))
case m: Mapping.JaccardLsh => Try(LshQuery.index(field, fieldType, sorted, LshFunctionCache.Jaccard(m)))
case m: Mapping.HammingLsh => Try(LshQuery.index(field, fieldType, sorted, LshFunctionCache.Hamming(m)))
case _ => Failure(incompatible(mapping, vec))
}
}
Expand All @@ -47,8 +47,8 @@ object VectorMapper {
else
mapping match {
case Mapping.DenseFloat(_) => Try(ExactQuery.index(field, vec))
case m: Mapping.AngularLsh => Try(LshQuery.index(field, fieldType, vec, m))
case m: Mapping.L2Lsh => Try(LshQuery.index(field, fieldType, vec, m))
case m: Mapping.AngularLsh => Try(LshQuery.index(field, fieldType, vec, LshFunctionCache.Angular(m)))
case m: Mapping.L2Lsh => Try(LshQuery.index(field, fieldType, vec, LshFunctionCache.L2(m)))
case _ => Failure(incompatible(mapping, vec))
}
}
Expand All @@ -65,6 +65,7 @@ object VectorMapper {
setBoost(1f)
setTokenized(false)
setIndexOptions(IndexOptions.DOCS)
setStoreTermVectors(false)

override def typeName(): String = typeName
override def clone(): FieldType = new FieldType(typeName)
Expand Down
Loading

0 comments on commit 74815f2

Please sign in to comment.