From 5ef4cc762044c3bb52d622d12f2af6c3cc79bcdb Mon Sep 17 00:00:00 2001 From: Alex Klibisz Date: Mon, 6 Apr 2020 18:24:55 -0400 Subject: [PATCH] Minimize readme and possibly fix release --- .github/workflows/release.yml | 26 ++++---- readme.md | 115 +--------------------------------- 2 files changed, 15 insertions(+), 126 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fb0b2873b..ef663e88e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,19 +34,19 @@ jobs: with: java-version: 12.0.2 - # Release docs - - name: Setup Docs - if: github.event_name == 'push' && env.GET_RELEASE == '404' - run: | - cp docs/Gemfile . - - # Specify the Jekyll source location as a parameter - - uses: helaili/jekyll-action@2.0.0 - if: github.event_name == 'push' && env.GET_RELEASE == '404' - env: - JEKYLL_PAT: ${{ secrets.JEKYLL_PAT }} - with: - jekyll_src: 'docs' +# # Release docs +# - name: Setup Docs +# if: github.event_name == 'push' && env.GET_RELEASE == '404' +# run: | +# cp docs/Gemfile . +# +# # Specify the Jekyll source location as a parameter +# - uses: helaili/jekyll-action@2.0.0 +# if: github.event_name == 'push' && env.GET_RELEASE == '404' +# env: +# JEKYLL_PAT: ${{ secrets.JEKYLL_PAT }} +# with: +# jekyll_src: 'docs' # Release software - name: Release software artifacts diff --git a/readme.md b/readme.md index 18c0f5383..62af50fd5 100644 --- a/readme.md +++ b/readme.md @@ -2,6 +2,8 @@ An Elasticsearch plugin for exact and approximate K-nearest-neighbors search in high-dimensional vector spaces. +**[Documentation](http://alexklibisz.github.io/elastiknn)** + ## Builds and Releases |Item|Status| @@ -14,119 +16,6 @@ An Elasticsearch plugin for exact and approximate K-nearest-neighbors search in |Scala 2.12 Client, Release| [![Scala Client Release Status][Badge-Scala-Release]][Link-Scala-Release]| |Scala 2.12 Client, Snapshot| [![Scala Client Snapshot Status][Badge-Scala-Snapshot]][Link-Scala-Snapshot]| -## Features - -### Completed - -**Exact KNN search for five distance functions: L1, L2, Angular, Hamming, and Jaccard.** - -This is fairly thoroughly profiled and optimized, but it's still an n^2 algorithm so it should only be used for testing -and on relatively small datasets. - -**Approximate KNN using Locality Sensitive Hashing; currently only works for Jaccard similarity** - -This should scale much better for large datasets. I'm working on implementations for the other similarity functions as -well as a multiprobe-LSH variant where possible. - -**Pipeline Processors for ingesting vectors** - -The plugin uses [pipeline processors](https://www.elastic.co/guide/en/elasticsearch/reference/current/pipeline-processor.html) -to validate, pre-process, and index vectors. This means you can just pass in the vectors as JSON documents. - -**Integrates KNN queries seamlessly with existing Elasticsearch queries** - -The exact and approximate KNN searches are implemented as Elasticsearch queries. This means you can store your vectors -inside of existing documents and mix your queries into existing queries. The semantics resemble -[GeoShape queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-geo-shape-query.html). -Under the hood, the queries are implemented using standard Elasticsearch constructs, specifically -[Function Score Queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html) -and [Boolean Queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html). - -### Forthcoming - -**Approximate KNN for L1, L2, Angular and Hamming similarities** - -The API will be the same as it is for Jaccard. It's just a matter of implementing and optimizing the internals. - -**Multiprobe LSH Queries** - -Multiprobe LSH has been shown to improve performance. I'm planning to implement this for the similarity functions where -it makes sense. The API should be a simple extension of the existing approximate queries. - -**Fixed radius KNN queries** - -Instead of returning the nearest vectors, this query will return the vectors that fall within some radius of a given -query vector. - -## Usage - -### Install ElastiKnn on an ElasticSearch cluster - -TODO - -### Run a Docker container with ElastiKnn already installed - -TODO - -### Exact search using the Elasticsearch REST API - -TODO - -### Python Client - -TODO - -### Scala Client - -TODO - -## Performance - -### Ann-Benchmarks - -Currently working on this in a fork of the [Ann-Benchmarks repo here](https://github.com/alexklibisz/ann-benchmarks). -Planning to submit a PR when all of the approximate similarities are implemented and the Docker image can be built with -a release elastiknn zip file. - -### Million-Scale - -TODO - -Planning to implement this using one of the various word vector datasets. - -### Billion-Scale - -TODO - -Not super sure of the feasability of this yet. There are some notes in benchmarks/billion. - -## Development - -### Builds and Releases - -There are three main artifacts produced by this project: - -1. The actual plugin, which is a zip file published to Github releases. -2. The python client library, which gets published to PyPi. -3. The scala client library, which gets published to Sonatype. - -All three artifacts are built and published as "snapshots" on every PR commit and every push/merge to master. All three -artifacts are released on every push/merge to master in which the `version` file has changed. We detect a change in the -version file by checking if a release tag exists with the same name as the version. - -All of this is handled by Github Workflows with all steps defined in the yaml files in `.github/workflows`. - -## References - -In no particular order: - -- [Alex Reelsen](https://github.com/spinscale) has several open-source plugins which were useful examples for the general structure of a plugin project: -- [Mining of Massive Datasets (MMDS) by Leskovec, et. al](http://www.mmds.org/), particularly chapter 3, is a great reference for approximate similarity search. -- [The Read Only Rest Plugin](https://github.com/sscarduzio/elasticsearch-readonlyrest-plugin) served as an example for much of the Gradle and testing setup. -- [The Scalable Data Science Lectures on Youtube](https://www.youtube.com/playlist?list=PLbRMhDVUMngekIHyLt8b_3jQR7C0KUCul) were helpful for better understanding LSH. I think much of that content is also based on the MMDS book. -- [_Build Your Own Custom Lucene Query and Scorer_, Doug Turnball](https://opensourceconnections.com/blog/2014/01/20/build-your-own-custom-lucene-query-and-scorer/) was extremely helpful -for implementing custom Lucene indexing and queries. - [Link-Github-CI]: https://github.com/alexklibisz/elastiknn/actions?query=workflow%3ACI