From f6e00c48d9b8891da582cc96b3a44b40de31518b Mon Sep 17 00:00:00 2001 From: Mike Pellegrini Date: Fri, 5 Jul 2024 13:55:24 -0400 Subject: [PATCH] Semantic text notebook (#271) --- README.md | 3 + bin/find-notebooks-to-test.sh | 5 + notebooks/search/09-semantic-text.ipynb | 570 ++++++++++++++++++++++++ notebooks/search/README.md | 8 + 4 files changed, 586 insertions(+) create mode 100644 notebooks/search/09-semantic-text.ipynb diff --git a/README.md b/README.md index 64e559d1..25439805 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ Try out Playground in Kibana with the following notebooks: - [`04-multilingual.ipynb`](./notebooks/search/04-multilingual.ipynb) - [`05-query-rules.ipynb`](./notebooks/search/05-query-rules.ipynb) - [`06-synonyms-api.ipynb`](./notebooks/search/06-synonyms-api.ipynb) +- [`07-inference.ipynb`](./notebooks/search/07-inference.ipynb) +- [`08-learning-to-rank.ipynb`](./notebooks/search/08-learning-to-rank.ipynb) +- [`09-semantic-text.ipynb`](./notebooks/search/09-semantic-text.ipynb) ### Integrations diff --git a/bin/find-notebooks-to-test.sh b/bin/find-notebooks-to-test.sh index 9875eeb7..9fb99b06 100755 --- a/bin/find-notebooks-to-test.sh +++ b/bin/find-notebooks-to-test.sh @@ -47,6 +47,11 @@ EXEMPT_NOTEBOOKS__8_12=( "notebooks/langchain/langchain-using-own-model.ipynb" ) +EXEMPT_NOTEBOOKS__8_14=( + # Add any notebooks that must be skipped on versions 8.14 or older here + "notebooks/search/09-semantic-text.ipynb" +) + # this function parses a version given as M[.N[.P]] or M[_N[_P]] into a numeric form function parse_version { echo "$@" | awk -F'[._]' '{ printf("%02d%02d\n", $1, $2); }'; } diff --git a/notebooks/search/09-semantic-text.ipynb b/notebooks/search/09-semantic-text.ipynb new file mode 100644 index 00000000..de539ea0 --- /dev/null +++ b/notebooks/search/09-semantic-text.ipynb @@ -0,0 +1,570 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Semantic Search with Semantic Text\n", + "\n", + "\"Open\n", + 
"\n", + "Learn how to use the [semantic_text](https://www.elastic.co/guide/en/elasticsearch/reference/master/semantic-text.html) field type to quickly get started with semantic search." + ], + "metadata": { + "collapsed": false + }, + "id": "c2907fddfeac343a" + }, + { + "cell_type": "markdown", + "source": [ + "## Requirements\n", + "\n", + "For this example, you will need:\n", + "\n", + "- An Elastic deployment:\n", + " - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook))\n", + "\n", + "- Elasticsearch 8.15 or above, or [Elasticsearch serverless](https://www.elastic.co/elasticsearch/serverless)" + ], + "metadata": { + "collapsed": false + }, + "id": "3db37d2cf8264468" + }, + { + "cell_type": "markdown", + "source": [ + "## Create Elastic Cloud deployment\n", + "\n", + "If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "7fe1ed0703a8d1d3" + }, + { + "cell_type": "markdown", + "source": [ + "## Install packages and connect with Elasticsearch Client\n", + "\n", + "To get started, we'll need to connect to our Elastic deployment using the Python client (version 8.15.0 or above).\n", + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n", + "\n", + "First we need to `pip` install the following packages:\n", + "\n", + "- `elasticsearch`" + ], + "metadata": { + "collapsed": false + }, + "id": "f9c8bd62c8241f90" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install elasticsearch" + ], + "metadata": { + "collapsed": false + }, + "id": "13fdf7656ced2da3" + }, + { + "cell_type": "markdown", + "source": [ + "Next, we need to import the modules we need. \n", + "\n", + "🔐 NOTE: getpass enables us to securely prompt the user for credentials without echoing them to the terminal or storing them in memory." + ], + "metadata": { + "collapsed": false + }, + "id": "9d54b112361d2f3d" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch, exceptions\n", + "from urllib.request import urlopen\n", + "from getpass import getpass\n", + "import json\n", + "import time" + ], + "metadata": { + "collapsed": false + }, + "id": "9a60627704e77ff6" + }, + { + "cell_type": "markdown", + "source": [ + "Now we can instantiate the Python Elasticsearch client.\n", + "\n", + "First we prompt the user for their Cloud ID and API key.\n", + "Then we create a `client` object, which is an instance of the `Elasticsearch` class."
+ ], + "metadata": { + "collapsed": false + }, + "id": "eb9498124146d8bb" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(\n", + " # For local development\n", + " # hosts=[\"http://localhost:9200\"]\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" + ], + "metadata": { + "collapsed": false + }, + "id": "6e14437dcce0f235" + }, + { + "cell_type": "markdown", + "source": [ + "### Enable Telemetry\n", + "\n", + "Knowing that you are using this notebook helps us decide where to invest our efforts to improve our products. We would like to ask you to run the following code so that we can gather anonymous usage statistics. See [telemetry.py](https://github.com/elastic/elasticsearch-labs/blob/main/telemetry/telemetry.py) for details. Thank you!" + ], + "metadata": { + "collapsed": false + }, + "id": "89b6b7721f6d8599" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!curl -O -s https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/telemetry/telemetry.py\n", + "from telemetry import enable_telemetry\n", + "\n", + "client = enable_telemetry(client, \"09-semantic-text\")" + ], + "metadata": { + "collapsed": false + }, + "id": "5a7af618fb61f358" + }, + { + "cell_type": "markdown", + "source": [ + "### Test the Client\n", + "Before you continue, confirm that the client has connected with this test."
+ ], + "metadata": { + "collapsed": false + }, + "id": "cbbdaf9118a97732" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(client.info())" + ], + "metadata": { + "collapsed": false + }, + "id": "4cb0685fae12e034" + }, + { + "cell_type": "markdown", + "source": [ + "Refer to [the documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#connect-self-managed-new) to learn how to connect to a self-managed deployment.\n", + "\n", + "Read [this page](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#auth-apikey) to learn how to connect using API keys." + ], + "metadata": { + "collapsed": false + }, + "id": "59e2223bf2c4331" + }, + { + "cell_type": "markdown", + "source": [ + "## Create the Inference Endpoint\n", + "\n", + "Let's create the inference endpoint by using the [Create inference API](https://www.elastic.co/guide/en/elasticsearch/reference/current/put-inference-api.html).\n", + "\n", + "For this example we'll use the [ELSER service](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html), but the inference API also supports [many other inference services](https://www.elastic.co/guide/en/elasticsearch/reference/current/put-inference-api.html#put-inference-api-desc)."
+ ], + "metadata": { + "collapsed": false + }, + "id": "22fa643780acd44a" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "try:\n", + " client.inference.delete_model(inference_id=\"my-elser-endpoint\")\n", + "except exceptions.NotFoundError:\n", + " # Inference endpoint does not exist yet, so there is nothing to delete\n", + " pass\n", + "\n", + "try:\n", + " client.options(\n", + " request_timeout=60, max_retries=3, retry_on_timeout=True\n", + " ).inference.put_model(\n", + " task_type=\"sparse_embedding\",\n", + " inference_id=\"my-elser-endpoint\",\n", + " body={\n", + " \"service\": \"elser\",\n", + " \"service_settings\": {\"num_allocations\": 1, \"num_threads\": 1},\n", + " },\n", + " )\n", + " print(\"Inference endpoint created successfully\")\n", + "except exceptions.BadRequestError as e:\n", + " if e.error == \"resource_already_exists_exception\":\n", + " print(\"Inference endpoint already exists; reusing it\")\n", + " else:\n", + " raise e" + ], + "metadata": { + "collapsed": false + }, + "id": "8ee2188ea71324f5" + }, + { + "cell_type": "markdown", + "source": [ + "Once the endpoint is created, we must wait until the backing ELSER service is deployed.\n", + "This can take a few minutes to complete."
+ ], + "metadata": { + "collapsed": false + }, + "id": "e94fd66761fd8087" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "inference_endpoint_info = client.inference.get_model(\n", + " inference_id=\"my-elser-endpoint\",\n", + ")\n", + "model_id = inference_endpoint_info[\"endpoints\"][0][\"service_settings\"][\"model_id\"]\n", + "\n", + "while True:\n", + " status = client.ml.get_trained_models_stats(\n", + " model_id=model_id,\n", + " )\n", + "\n", + " deployment_stats = status[\"trained_model_stats\"][0].get(\"deployment_stats\")\n", + " if deployment_stats is None:\n", + " print(\"ELSER Model is currently being deployed.\")\n", + " time.sleep(5)\n", + " continue\n", + "\n", + " nodes = deployment_stats.get(\"nodes\")\n", + " if nodes is not None and len(nodes) > 0:\n", + " print(\"ELSER Model has been successfully deployed.\")\n", + " break\n", + " else:\n", + " print(\"ELSER Model is currently being deployed.\")\n", + " time.sleep(5)" + ], + "metadata": { + "collapsed": false + }, + "id": "adb33329ce20b2f1" + }, + { + "cell_type": "markdown", + "source": [ + "## Create the Index\n", + "\n", + "Now we need to create an index with a `semantic_text` field. Let's create one that enables us to perform semantic search on movie plots." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "818f7a72a83b5776" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "client.indices.delete(index=\"semantic-text-movies\", ignore_unavailable=True)\n", + "client.indices.create(\n", + " index=\"semantic-text-movies\",\n", + " mappings={\n", + " \"properties\": {\n", + " \"title\": {\"type\": \"text\"},\n", + " \"genre\": {\"type\": \"text\"},\n", + " \"plot\": {\"type\": \"text\", \"copy_to\": \"plot_semantic\"},\n", + " \"plot_semantic\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \"my-elser-endpoint\",\n", + " },\n", + " }\n", + " },\n", + ")" + ], + "metadata": { + "collapsed": false + }, + "id": "ace87760606f67c6" + }, + { + "cell_type": "markdown", + "source": [ + "Notice how we configured the mappings. We defined `plot_semantic` as a `semantic_text` field.\n", + "The `inference_id` parameter defines the inference endpoint that is used to generate the embeddings for the field.\n", + "Then we configured the `plot` field to [copy its value](https://www.elastic.co/guide/en/elasticsearch/reference/current/copy-to.html) to the `plot_semantic` field." + ], + "metadata": { + "collapsed": false + }, + "id": "abc3ee7a1fddfa9b" + }, + { + "cell_type": "markdown", + "source": [ + "## Populate the Index\n", + "\n", + "Let's populate the index with our example dataset of 12 movies." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "2b5a46b60660a489" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/movies.json\"\n", + "response = urlopen(url)\n", + "movies = json.loads(response.read())\n", + "\n", + "operations = []\n", + "for movie in movies:\n", + " operations.append({\"index\": {\"_index\": \"semantic-text-movies\"}})\n", + " operations.append(movie)\n", + "client.bulk(index=\"semantic-text-movies\", operations=operations, refresh=True)" + ], + "metadata": { + "collapsed": false + }, + "id": "24f0133923553d28" + }, + { + "cell_type": "markdown", + "source": [ + "## Semantic Search\n", + "\n", + "Now that our index is populated, we can query it using semantic search.\n", + "\n", + "### Aside: Pretty printing Elasticsearch search results\n", + "\n", + "Your `search` API calls will return hard-to-read nested JSON.\n", + "We'll create a little function called `pretty_search_response` to return nice, human-readable outputs from our examples." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "6fff5932fcbac1b0" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "def pretty_search_response(response):\n", + " if len(response[\"hits\"][\"hits\"]) == 0:\n", + " print(\"Your search returned no results.\")\n", + " else:\n", + " for hit in response[\"hits\"][\"hits\"]:\n", + " id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " runtime = hit[\"_source\"][\"runtime\"]\n", + " plot = hit[\"_source\"][\"plot\"]\n", + " keyScene = hit[\"_source\"][\"keyScene\"]\n", + " genre = hit[\"_source\"][\"genre\"]\n", + " released = hit[\"_source\"][\"released\"]\n", + "\n", + " pretty_output = f\"\\nID: {id}\\nScore: {score}\\nTitle: {title}\\nRuntime: {runtime}\\nPlot: {plot}\\nKey Scene: {keyScene}\\nGenre: {genre}\\nReleased: {released}\"\n", + "\n", + " print(pretty_output)" + ], + "metadata": { + "collapsed": false + }, + "id": "ad417b4b3f50c889" + }, + { + "cell_type": "markdown", + "source": [ + "### Semantic Search with the `semantic` Query\n", + "\n", + "We can use the [`semantic` query](https://www.elastic.co/guide/en/elasticsearch/reference/master/query-dsl-semantic-query.html) to quickly & easily query the `semantic_text` field in our index.\n", + "Under the hood, an embedding is automatically generated for our query text using the `semantic_text` field's inference endpoint." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "22c4d4d395adb472" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "response = client.search(\n", + " index=\"semantic-text-movies\",\n", + " query={\"semantic\": {\"field\": \"plot_semantic\", \"query\": \"organized crime movies\"}},\n", + ")\n", + "\n", + "pretty_search_response(response)" + ], + "metadata": { + "collapsed": false + }, + "id": "1a8520ffc8a3efb3" + }, + { + "cell_type": "markdown", + "source": [ + "These results demonstrate the power of semantic search.\n", + "Our top results are all movies involving organized crime, even if the exact term \"organized crime\" doesn't appear in the plot description.\n", + "This works because the ELSER model understands the semantic similarity between terms like \"organized crime\" and \"mob\".\n", + "\n", + "However, these results also show the weaknesses of semantic search.\n", + "Because semantic search is based on vector similarity, there is a long tail of results that are weakly related to our query vector.\n", + "That's why movies like _The Matrix_ are returned towards the tail end of our search results." + ], + "metadata": { + "collapsed": false + }, + "id": "148fda24a3964aa9" + }, + { + "cell_type": "markdown", + "source": [ + "### Hybrid Search with the `semantic` Query\n", + "\n", + "We can address some of the issues with pure semantic search by combining it with lexical search techniques.\n", + "Here, we use a [boolean query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html) to require that all matches contain at least one term from the query text, in either the `plot` or `genre` fields."
+ ], + "metadata": { + "collapsed": false + }, + "id": "7c9bab225a745746" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "response = client.search(\n", + " index=\"semantic-text-movies\",\n", + " query={\n", + " \"bool\": {\n", + " \"must\": {\n", + " \"multi_match\": {\n", + " \"fields\": [\"plot\", \"genre\"],\n", + " \"query\": \"organized crime movies\",\n", + " \"boost\": 1.5,\n", + " }\n", + " },\n", + " \"should\": {\n", + " \"semantic\": {\n", + " \"field\": \"plot_semantic\",\n", + " \"query\": \"organized crime movies\",\n", + " \"boost\": 3.0,\n", + " }\n", + " },\n", + " }\n", + " },\n", + ")\n", + "\n", + "pretty_search_response(response)" + ], + "metadata": { + "collapsed": false + }, + "id": "4f72f7906b918dc1" + }, + { + "cell_type": "markdown", + "source": [ + "These results demonstrate that the application of lexical search techniques can help focus the results, while retaining many of the advantages of semantic search.\n", + "In this example, the top search results are all still movies involving organized crime, but the `multi_match` query keeps the long tail shorter and focused on movies in the crime genre.\n", + "\n", + "Note the `boost` parameters applied to the `multi_match` and `semantic` queries.\n", + "Combining lexical and semantic search techniques in a boolean query like this is called \"linear combination\" and when doing this, it is important to normalize the scores of the component queries.\n", + "This involves consideration of a few factors, including:\n", + "\n", + "- The range of scores generated by the query\n", + "- The relative importance and accuracy of the query in the context of the dataset\n", + "\n", + "In this example, the `multi_match` query is mostly used as a filter to constrain the search results' long tail, so we assign it a lower boost than the `semantic` query." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "d50d10ced4389107" + }, + { + "cell_type": "markdown", + "source": [ + "## Conclusion\n", + "\n", + "The [semantic_text](https://www.elastic.co/guide/en/elasticsearch/reference/master/semantic-text.html) field type is a powerful tool that can help you quickly and easily integrate semantic search.\n", + "It can greatly improve the relevancy of your search results, particularly when combined with lexical search techniques." + ], + "metadata": { + "collapsed": false + }, + "id": "78be304240d6c695" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/search/README.md b/notebooks/search/README.md index c6f4c4fe..8e9b991a 100644 --- a/notebooks/search/README.md +++ b/notebooks/search/README.md @@ -82,3 +82,11 @@ In the [`06-synonyms-api.ipynb`](./06-synonyms-api.ipynb) notebook, you'll learn - Configure an index to use search-time synonyms - Update synonyms in real time - Run queries that are enhanced by synonyms + +### 9. Semantic text + +In the [`09-semantic-text.ipynb`](./09-semantic-text.ipynb) notebook, you'll learn how to: + +- Quickly get started with semantic search with the `semantic_text` field type +- Use the `semantic` query +- Implement hybrid search