From 63684d29f247b3ece1ec26b1f0e346c9aa85abf7 Mon Sep 17 00:00:00 2001 From: Rado Ondas Date: Thu, 3 Aug 2023 15:33:09 +0200 Subject: [PATCH 1/3] Adds collab book for Image similarity --- notebooks/search/04-image-similarity.ipynb | 1264 ++++++++++++++++++++ 1 file changed, 1264 insertions(+) create mode 100644 notebooks/search/04-image-similarity.ipynb diff --git a/notebooks/search/04-image-similarity.ipynb b/notebooks/search/04-image-similarity.ipynb new file mode 100644 index 00000000..b5069481 --- /dev/null +++ b/notebooks/search/04-image-similarity.ipynb @@ -0,0 +1,1264 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# How to implement Image search using Elasticsearch" + ], + "metadata": { + "id": "CepGq3Kvtdxi" + } + }, + { + "cell_type": "markdown", + "source": [ + "The workbook shows how to implement an Image search using Elasticsearch. You will index documents with image embeddings (generated or pre-generated) and then, using an NLP model, be able to search using a natural language description of the image.\n", + "\n", + "### Prerequisites\n", + "Before you start, make sure you have an Elasticsearch cluster running. The cluster must have at least one machine learning (ML) node with enough (4GB) memory." + ], + "metadata": { + "id": "oMu1SW_TQQrU" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Install Python requirements\n", + "Before you start, you need to install all required Python dependencies." 
+ ], + "metadata": { + "id": "VFcdr8IDQE_H" + } + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6WosfR55npKU", + "outputId": "033767ff-0eef-48cc-c9e7-efbf73c9cb67" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: eland in /usr/local/lib/python3.10/dist-packages (8.7.0)\n", + "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.9.0)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.31.0)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.65.0)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (9.4.0)\n", + "Requirement already satisfied: streamlit in /usr/local/lib/python3.10/dist-packages (1.25.0)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.15.2+cu118)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.22.4)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.10.1)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", + "Requirement already satisfied: huggingface-hub>=0.4.0 in 
/usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.4)\n", + "Requirement already satisfied: pandas>=1.5 in /usr/local/lib/python3.10/dist-packages (from eland) (1.5.3)\n", + "Requirement already satisfied: matplotlib>=3.6 in /usr/local/lib/python3.10/dist-packages (from eland) (3.7.1)\n", + "Requirement already satisfied: elastic-transport<9,>=8 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.4.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.1)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.7.1)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from 
triton==2.0.0->torch) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.6)\n", + "Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.2.2)\n", + "Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit) (1.4)\n", + "Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.3.1)\n", + "Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.1.6)\n", + "Requirement already satisfied: importlib-metadata<7,>=1.4 in /usr/lib/python3/dist-packages (from streamlit) (4.6.4)\n", + "Requirement already satisfied: protobuf<5,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.20.3)\n", + "Requirement already satisfied: pyarrow>=6.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.0.0)\n", + "Requirement already satisfied: pympler<2,>=0.9 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.0.1)\n", + "Requirement already satisfied: python-dateutil<3,>=2.7.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.8.2)\n", + "Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (13.4.2)\n", + "Requirement already satisfied: tenacity<9,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.2.2)\n", + "Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.10.2)\n", + "Requirement already satisfied: tzlocal<5,>=1.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.3.1)\n", + "Requirement already satisfied: validators<1,>=0.2 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.20.0)\n", + "Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in /usr/local/lib/python3.10/dist-packages (from 
streamlit) (3.1.32)\n", + "Requirement already satisfied: pydeck<1,>=0.8 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.8.0)\n", + "Requirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.3.1)\n", + "Requirement already satisfied: watchdog>=2.1.5 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.0.0)\n", + "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.4)\n", + "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (4.3.3)\n", + "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.12.0)\n", + "Requirement already satisfied: urllib3<2,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (1.26.16)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2023.7.22)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit) (4.0.10)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (4.41.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (3.1.0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.5->eland) (2022.7.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3,>=2.7.3->streamlit) (1.16.0)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (2.14.0)\n", + "Requirement already satisfied: pytz-deprecation-shim in /usr/local/lib/python3.10/dist-packages (from tzlocal<5,>=1.1->streamlit) (0.1.0.post0)\n", + "Requirement already satisfied: decorator>=3.4.0 in /usr/local/lib/python3.10/dist-packages (from validators<1,>=0.2->streamlit) (4.4.2)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit) (5.0.0)\n", + "Requirement already satisfied: attrs>=17.4.0 in 
/usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (23.1.0)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.19.3)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit) (0.1.2)\n", + "Requirement already satisfied: tzdata in /usr/local/lib/python3.10/dist-packages (from pytz-deprecation-shim->tzlocal<5,>=1.1->streamlit) (2023.3)\n" + ] + } + ], + "source": [ + "!pip install sentence-transformers eland elasticsearch transformers torch tqdm Pillow streamlit" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Upload NLP model for querying\n", + "Using the `eland` library you will import the NLP CLIP model. The model will\n", + "transfer your search query into a vector which will be used for the search over the set of images stored in Elasticsearch.\n", + "\n", + "The model used is [clip-ViT-B-32-multilingual-v1](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) because the image embeddings are also generated by the CLIP model.\n", + "\n", + "How to get the Cloud ID? Go to [ESS cloud](https://cloud.elastic.co/logout?redirectTo=%2Fhome&reason=unauthorised) and `On the deployment overview page, copy down the Cloud ID.`\n", + "\n", + "The authentication uses an API key (`--es-api-key`). 
Learn how to generate [API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html#create-api-key).\n", + "```\n", + "$ eland_import_hub_model --cloud-id $CLOUD_ID \\\n", + " --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 \\\n", + " --task-type text_embedding --es-api-key $API_KEY --start\n", + "```" + ], + "metadata": { + "id": "eIV5lAnVt9L7" + } + }, + { + "cell_type": "code", + "source": [ + "API_KEY=''\n", + "CLOUD_ID=''\n", + "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 --task-type text_embedding --es-api-key $API_KEY --start" + ], + "metadata": { + "id": "tVhL9jBnuAAQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Import Python libraries" + ], + "metadata": { + "id": "hVxWnFflUCZv" + } + }, + { + "cell_type": "code", + "source": [ + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import parallel_bulk\n", + "import requests\n", + "import os\n", + "import sys\n", + "# import shutil\n", + "import zipfile\n", + "from tqdm.auto import tqdm\n", + "import pandas as pd\n", + "from PIL import Image\n", + "from sentence_transformers import SentenceTransformer\n", + "import urllib.request\n", + "# import urllib.error\n", + "import json\n", + "from getpass import getpass" + ], + "metadata": { + "id": "I0pRCbYMuMVn" + }, + "execution_count": 17, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Connect to Elasticsearch cluster\n", + "Use your own cluster details `ELASTIC_CLOUD_ID`, `API_KEY`." 
+ ], + "metadata": { + "id": "Klv3rywdUJBN" + } + }, + { + "cell_type": "code", + "source": [ + "# ESS Cloud connection definition using an API_KEY\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "API_KEY = getpass(\"Elastic API key: \")\n", + "\n", + "# ELASTIC_CLOUD_USER = \"elastic\"\n", + "# CLOUD_PASSWORD = getpass(\"Elastic Password\")\n", + "\n", + "es = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " #basic_auth=(ELASTIC_CLOUD_USER, ELASTIC_CLOUD_PASSWORD),\n", + " api_key=API_KEY,\n", + " request_timeout=600\n", + ")\n", + "\n", + "es.info() # should return cluster info" + ], + "metadata": { + "id": "YwN8RmFY3FQI", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d0d0e31e-2ad2-46fe-ef8c-8c8bce7e1c48" + }, + "execution_count": 19, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elastic Cloud ID: ··········\n", + "Elastic API key: ··········\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "ObjectApiResponse({'name': 'instance-0000000000', 'cluster_name': 'a597bbe1e0d047c494e7d4015f67ef37', 'cluster_uuid': 'EnT0vwwSSZeAahPw3Vhsuw', 'version': {'number': '8.8.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98e1271edf932a480e4262a471281f1ee295ce6b', 'build_date': '2023-06-26T05:16:16.196344851Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create Index and mappings for Images\n", + "Befor you can index documents into Elasticsearch, you need to create an Index with correct mappings." 
+ ], + "metadata": { + "id": "IW-GIlH2OxB4" + } + }, + { + "cell_type": "code", + "source": [ + "# Destination Index name\n", + "INDEX_NAME=\"images\"\n", + "# If you want to delete previous version of the Index\n", + "DELETE_INDEX=False\n", + "\n", + "INDEX_MAPPING = {\n", + " \"properties\": {\n", + " \"image_embedding\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 512,\n", + " \"index\": True,\n", + " \"similarity\": \"cosine\"\n", + " },\n", + " \"photo_id\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"photo_image_url\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"ai_description\": {\n", + " \"type\": \"text\"\n", + " },\n", + " \"photo_description\": {\n", + " \"type\": \"text\"\n", + " },\n", + " \"photo_url\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"photographer_first_name\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"photographer_last_name\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"photographer_username\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"exif_camera_make\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"exif_camera_model\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"exif_iso\": {\n", + " \"type\": \"integer\"\n", + " }\n", + " }\n", + " }\n", + "\n", + "# Index settings\n", + "INDEX_SETTINGS = {\n", + " \"index\": {\n", + " \"number_of_replicas\": \"1\",\n", + " \"number_of_shards\": \"1\",\n", + " \"refresh_interval\": \"5s\"\n", + " }\n", + "}\n", + "\n", + "if(DELETE_INDEX):\n", + " if es.indices.exists(index=INDEX_NAME):\n", + " print(\"Deleting existing %s\" % INDEX_NAME)\n", + " es.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", + "\n", + "if not es.indices.exists(index=INDEX_NAME):\n", + " print(\"Creating index %s\" % INDEX_NAME)\n", + " es.indices.create(index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS,\n", + " ignore=[400, 404])\n" + ], + "metadata": { + "id": "xAkc1OVcOxy3" + }, + "execution_count": null, + "outputs": [] 
+ }, + { + "cell_type": "markdown", + "source": [ + "### Get image dataset and embeddings\n", + "Download:\n", + "- The example image dataset is from [Unsplash](https://github.com/unsplash/datasets)\n", + "- The [Image embeddings](https://github.com/radoondas/flask-elastic-nlp/blob/main/embeddings/blogs/blogs-no-embeddings.json.zip) are pre-generated using CLIP model\n", + "\n", + "Then unzip both files." + ], + "metadata": { + "id": "NKE-j0kPUMn_" + } + }, + { + "cell_type": "code", + "source": [ + "!wget https://unsplash.com/data/lite/1.2.0 -O data/unsplash-research-dataset-lite-1.2.0.zip\n", + "!wget https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip -P data" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zFGaPDRR5mqT", + "outputId": "0114cdd6-a714-41ab-9b46-3013bd36698a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-07-25 14:28:32-- https://unsplash.com/data/lite/1.2.0\n", + "Resolving unsplash.com (unsplash.com)... 151.101.65.181, 151.101.1.181, 151.101.129.181, ...\n", + "Connecting to unsplash.com (unsplash.com)|151.101.65.181|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip [following]\n", + "--2023-07-25 14:28:32-- https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip\n", + "Resolving unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)... 52.217.102.84, 3.5.25.253, 52.217.96.188, ...\n", + "Connecting to unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)|52.217.102.84|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 632351052 (603M) [application/zip]\n", + "Saving to: ‘data/unsplash-research-dataset-lite-1.2.0.zip’\n", + "\n", + "data/unsplash-resea 100%[===================>] 603.06M 14.1MB/s in 42s \n", + "\n", + "2023-07-25 14:29:16 (14.2 MB/s) - ‘data/unsplash-research-dataset-lite-1.2.0.zip’ saved [632351052/632351052]\n", + "\n", + "--2023-07-25 14:29:16-- https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 94256217 (90M) [application/zip]\n", + "Saving to: ‘data/image-embeddings.json.zip.1’\n", + "\n", + "image-embeddings.js 100%[===================>] 89.89M 164MB/s in 0.5s \n", + "\n", + "2023-07-25 14:29:16 (164 MB/s) - ‘data/image-embeddings.json.zip.1’ saved [94256217/94256217]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Unzip downloaded files\n", + "UNSPLASH_ZIP_FILE=\"data/unsplash-research-dataset-lite-1.2.0.zip\"\n", + "EMBEDDINGS_ZIP_FILE=\"data/image-embeddings.json.zip\"\n", + "\n", + "with zipfile.ZipFile(UNSPLASH_ZIP_FILE, 'r') as zip_ref:\n", + " print('Extracting file ', UNSPLASH_ZIP_FILE, '.')\n", + " zip_ref.extractall('data/unsplash/')\n", + "\n", + "with zipfile.ZipFile(EMBEDDINGS_ZIP_FILE, 'r') as zip_ref:\n", + " print('Extracting file ', EMBEDDINGS_ZIP_FILE, '.')\n", + " zip_ref.extractall(\"data/embeddings/\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MBh4AQ8i7C0-", + "outputId": "17a50b7f-f052-4b72-daa8-0e8fc630326f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Extracting file data/unsplash-research-dataset-lite-1.2.0.zip 
.\n", + "Extracting file data/image-embeddings.json.zip .\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Image embeddings\n", + "We now have 2 options for how to proceed.\n", + "1. Import all pregenerated image embeddings (~19k). This is the faster option, with a lot of images available in a short time.\n", + "2. Import a small subset of randomly chosen images to see the process of generating image embeddings using the external CLIP model." + ], + "metadata": { + "id": "p6H7QYctQQA7" + } + }, + { + "cell_type": "code", + "source": [ + "# define helper function\n", + "def gen_rows(df):\n", + " for doc in df.to_dict(orient='records'):\n", + " yield doc" + ], + "metadata": { + "id": "03YvC-_JY9OE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Import all pregenerated image embeddings\n", + "This option lets you import ~19k documents with pregenerated image embeddings with metadata.\n", + "\n", + "The process downloads files with image information, merges them, and indexes them into Elasticsearch." 
+ ], + "metadata": { + "id": "qhZRdUyAQd-s" + } + }, + { + "cell_type": "code", + "source": [ + "df_unsplash = pd.read_csv('data/unsplash/' + 'photos.tsv000', sep='\\t', header=0)\n", + "\n", + "# follwing 8 lines are fix for inconsistent/incorrect data\n", + "df_unsplash['photo_description'].fillna('', inplace=True)\n", + "df_unsplash['ai_description'].fillna('', inplace=True)\n", + "df_unsplash['photographer_first_name'].fillna('', inplace=True)\n", + "df_unsplash['photographer_last_name'].fillna('', inplace=True)\n", + "df_unsplash['photographer_username'].fillna('', inplace=True)\n", + "df_unsplash['exif_camera_make'].fillna('', inplace=True)\n", + "df_unsplash['exif_camera_model'].fillna('', inplace=True)\n", + "df_unsplash['exif_iso'].fillna(0, inplace=True)\n", + "## end of fix\n", + "\n", + "# read subset of columns from the original/downloaded dataset\n", + "df_unsplash_subset = df_unsplash[\n", + " ['photo_id', 'photo_url', 'photo_image_url', 'photo_description', 'ai_description', 'photographer_first_name',\n", + " 'photographer_last_name', 'photographer_username', 'exif_camera_make', 'exif_camera_model', 'exif_iso']]\n", + "\n", + "# read all pregenerated embeddings\n", + "df_embeddings = pd.read_json('data/embeddings/' + 'image-embeddings.json', lines=True)\n", + "\n", + "df_merged = pd.merge(df_unsplash_subset, df_embeddings,\n", + " on='photo_id',\n", + " how='inner')\n", + "\n", + "count = 0\n", + "for success, info in parallel_bulk(\n", + " client=es,\n", + " actions=gen_rows(df_merged),\n", + " thread_count=5,\n", + " chunk_size=1000,\n", + " index=INDEX_NAME\n", + "):\n", + " if success:\n", + " count += 1\n", + " if count % 1000 == 0:\n", + " print('Indexed %s documents' % str(count), flush=True)\n", + " sys.stdout.flush()\n", + " else:\n", + " print('Doc failed', info)\n", + "\n", + "print('Indexed %s image embeddings documents' % str(count), flush=True)\n", + "sys.stdout.flush()" + ], + "metadata": { + "id": "32xrbSUXTODQ" + }, + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 2. Import small number of random\n", + "In this part you will randomly choose small number of images and then generate image embeddings. The script will then index documents into Elasticserach." + ], + "metadata": { + "id": "xypgh4lFQmkc" + } + }, + { + "cell_type": "code", + "source": [ + "NUMBER_OF_IMAGES=20\n", + "INDEX_NAME=\"images-test\"\n", + "\n", + "df_unsplash = pd.read_csv('data/unsplash/' + 'photos.tsv000', sep='\\t', header=0)\n", + "\n", + "## stat fix\n", + "# follwing 8 lines are fix for inconsistent/incorrect data\n", + "df_unsplash['photo_description'].fillna('', inplace=True)\n", + "df_unsplash['ai_description'].fillna('', inplace=True)\n", + "df_unsplash['photographer_first_name'].fillna('', inplace=True)\n", + "df_unsplash['photographer_last_name'].fillna('', inplace=True)\n", + "df_unsplash['photographer_username'].fillna('', inplace=True)\n", + "df_unsplash['exif_camera_make'].fillna('', inplace=True)\n", + "df_unsplash['exif_camera_model'].fillna('', inplace=True)\n", + "df_unsplash['exif_iso'].fillna(0, inplace=True)\n", + "## end of fix\n", + "\n", + "df_unsplash_subset = df_unsplash[\n", + " ['photo_id', 'photo_url', 'photo_image_url', 'photo_description', 'ai_description', 'photographer_first_name',\n", + " 'photographer_last_name', 'photographer_username', 'exif_camera_make', 'exif_camera_model', 'exif_iso']]\n", + "\n", + "df_random_subset = df_unsplash_subset.sample(n=NUMBER_OF_IMAGES, replace=False)\n", + "df_random_subset = df_random_subset.reset_index()\n", + "\n", + "# Load model CLIP\n", + "img_model = SentenceTransformer('clip-ViT-B-32')\n", + "\n", + "# new list of image documents for indexing into ES\n", + "lst = []\n", + "if not os.path.exists(\"data/images\"):\n", + " os.mkdir(\"data/images\")\n", + "\n", + "for index, row in df_random_subset.iterrows():\n", + " #open image from url\n", + " img_path = \"data/images/\" + 
row['photo_id']\n", + " try:\n", + " urllib.request.urlretrieve(row['photo_image_url'], img_path)\n", + " print(row['photo_id'] + \" \" + row['photo_url'])\n", + " except urllib.error.HTTPError as err:\n", + " if err.code == 404:\n", + " print('404 error: Image not found at {}'.format(row['photo_image_url']))\n", + " else:\n", + " raise\n", + "\n", + " img = Image.open(img_path)\n", + " # create doc\n", + " doc = {}\n", + " embedding = img_model.encode(img)\n", + " doc['photo_id'] = row['photo_id']\n", + " doc['image_embedding'] = embedding.tolist()\n", + " lst.append(doc)\n", + " # print(doc)\n", + "\n", + " # Image cleanup.\n", + " # If file exists, delete it.\n", + " if os.path.exists(img_path):\n", + " os.remove(img_path)\n", + "\n", + "# read all pregenerated embeddings\n", + "df_embeddings = pd.read_json('data/embeddings/' + 'image-embeddings.json', lines=True)\n", + "\n", + "df_merged = pd.merge(df_random_subset, pd.DataFrame(lst),\n", + " on='photo_id',\n", + " how='inner')\n", + "# print(df_merged)\n", + "\n", + "count = 0\n", + "for success, info in parallel_bulk(\n", + " client=es,\n", + " actions=gen_rows(df_merged),\n", + " thread_count=5,\n", + " chunk_size=10,\n", + " index=INDEX_NAME\n", + "):\n", + " if success:\n", + " count += 1\n", + " if count % 10 == 0:\n", + " print('Indexed %s documents' % str(count), flush=True)\n", + " sys.stdout.flush()\n", + " else:\n", + " print('Doc failed', info)\n", + "\n", + "print('Indexed %s image embeddings documents' % str(count), flush=True)\n", + "sys.stdout.flush()" + ], + "metadata": { + "id": "r_txQjP2RKnr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Query the image dataset\n", + "The next step is to run a query to search for images. 
The example query searches for `\"model_text\": \"Valentine day flowers\"` using the model we uploaded to Elasticsearch `sentence-transformers__clip-vit-b-32-multilingual-v1`.\n", + "\n", + "The process is one query even it internaly consists of two tasks. One is to tramsform your search text into a vector using the NLP model and the second task is to run the vector search over the image dataset.\n", + "```\n", + "POST images/_search\n", + "{\n", + " \"knn\": {\n", + " \"field\": \"image_embedding\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 10,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__clip-vit-b-32-multilingual-v1\",\n", + " \"model_text\": \"Valentine day flowers\"\n", + " }\n", + " }\n", + " },\n", + " \"fields\": [\n", + " \"photo_description\",\n", + " \"ai_description\",\n", + " \"photo_url\"\n", + " ],\n", + " \"_source\": false\n", + "}\n", + "```\n", + "\n" + ], + "metadata": { + "id": "-_i2CIpSz9vw" + } + }, + { + "cell_type": "code", + "source": [ + "# Search queary\n", + "WHAT_ARE_YOU_LOOKING_FOR=\"Valentine day flowers\"\n", + "INDEX_IM_EMBED=\"images\"\n", + "\n", + "source_fields = [\"photo_description\", \"ai_description\", \"photo_url\", \"photo_image_url\", \"photographer_first_name\",\n", + " \"photographer_username\", \"photographer_last_name\", \"photo_id\"]\n", + "query = {\n", + " \"field\": \"image_embedding\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__clip-vit-b-32-multilingual-v1\",\n", + " \"model_text\": WHAT_ARE_YOU_LOOKING_FOR\n", + " }\n", + " }\n", + "}\n", + "\n", + "response = es.search(\n", + " index=INDEX_IM_EMBED,\n", + " fields=source_fields,\n", + " knn=query, source=False)\n", + "\n", + "print(response.body)\n", + "\n", + "# the code writes the response into a file for the streamlit UI used in the optional step.\n", + "with 
open('json_data.json', 'w') as outfile:\n", + " json.dump(response.body['hits']['hits'], outfile)\n", + "\n", + "# Use the `loads()` method to load the JSON data\n", + "dfr = json.loads(json.dumps(response.body['hits']['hits']))\n", + "# Pass the generated JSON data into a pandas dataframe\n", + "dfr = pd.DataFrame(dfr)\n", + "# Print the data frame\n", + "dfr\n", + "\n", + "results = pd.json_normalize(json.loads(json.dumps(response.body['hits']['hits'])))\n", + "# results\n", + "results[['_id', '_score', 'fields.photo_id', 'fields.photo_image_url',\n", + " 'fields.photo_description', 'fields.photographer_first_name',\n", + " 'fields.photographer_last_name', 'fields.ai_description',\n", + " 'fields.photo_url']]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 375 + }, + "id": "wdicpvRlzmXG", + "outputId": "00550041-0aed-4f51-ccd3-18eb705ff7ed" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'took': 114, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 0.646751, 'hits': [{'_index': 'images', '_id': 'nK5Fh4kBLg4Kd5ySLbKC', '_score': 0.646751, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1587404787163-d03a28fcc1f0'], 'photo_id': ['gQFZxLe3m4g'], 'photographer_first_name': ['Vadim'], 'photo_description': ['instagram.com/vadimsadovski'], 'photographer_last_name': ['Sadovski'], 'photo_url': ['https://unsplash.com/photos/gQFZxLe3m4g'], 'photographer_username': ['vadimsadovski'], 'ai_description': ['']}}, {'_index': 'images', '_id': 'Xa5Eh4kBLg4Kd5yS84Qf', '_score': 0.64675057, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1574259148543-dd376a61339f'], 'photo_id': ['g5Mhx29yp-A'], 'photographer_first_name': ['Erin'], 'photo_description': ['Cute but grumpy cat in the Austrian mountains'], 'photographer_last_name': ['East'], 
'photo_url': ['https://unsplash.com/photos/g5Mhx29yp-A'], 'photographer_username': ['mserineast'], 'ai_description': ['brown Persian cat on white bench']}}, {'_index': 'images', '_id': '265Eh4kBLg4Kd5yS84Uf', '_score': 0.64244866, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1565772101068-dec21f7b36eb'], 'photo_id': ['9KZ0PGNCxNE'], 'photographer_first_name': ['Perchek'], 'photo_description': [''], 'photographer_last_name': ['Industrie'], 'photo_url': ['https://unsplash.com/photos/9KZ0PGNCxNE'], 'photographer_username': ['perchek_industrie'], 'ai_description': ['siamese cat']}}, {'_index': 'images', '_id': 'xq5Fh4kBLg4Kd5ySEpuC', '_score': 0.64216036, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1589994205353-325f40210d02'], 'photo_id': ['VOZQkkKXvY4'], 'photographer_first_name': ['Andrey'], 'photo_description': [''], 'photographer_last_name': ['Svistunov'], 'photo_url': ['https://unsplash.com/photos/VOZQkkKXvY4'], 'photographer_username': ['svistal13'], 'ai_description': ['orange tabby cat on ground covered with snow during daytime']}}, {'_index': 'images', '_id': 'WK5Eh4kBLg4Kd5yS5XcD', '_score': 0.64185303, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1556820161-b605d166fce1'], 'photo_id': ['wmz8y6L6c_k'], 'photographer_first_name': ['Phillip'], 'photo_description': [''], 'photographer_last_name': ['Suitcases'], 'photo_url': ['https://unsplash.com/photos/wmz8y6L6c_k'], 'photographer_username': ['nillait'], 'ai_description': ['brown and black kitten close-up photography']}}]}}\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " _id _score fields.photo_id \\\n", + "0 nK5Fh4kBLg4Kd5ySLbKC 0.646751 [gQFZxLe3m4g] \n", + "1 Xa5Eh4kBLg4Kd5yS84Qf 0.646751 [g5Mhx29yp-A] \n", + "2 265Eh4kBLg4Kd5yS84Uf 0.642449 [9KZ0PGNCxNE] \n", + "3 xq5Fh4kBLg4Kd5ySEpuC 0.642160 [VOZQkkKXvY4] \n", + "4 WK5Eh4kBLg4Kd5yS5XcD 0.641853 [wmz8y6L6c_k] \n", + "\n", + " fields.photo_image_url \\\n", + 
"0 [https://images.unsplash.com/photo-15874047871... \n", + "1 [https://images.unsplash.com/photo-15742591485... \n", + "2 [https://images.unsplash.com/photo-15657721010... \n", + "3 [https://images.unsplash.com/photo-15899942053... \n", + "4 [https://images.unsplash.com/photo-1556820161-... \n", + "\n", + " fields.photo_description \\\n", + "0 [instagram.com/vadimsadovski] \n", + "1 [Cute but grumpy cat in the Austrian mountains] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "\n", + " fields.photographer_first_name fields.photographer_last_name \\\n", + "0 [Vadim] [Sadovski] \n", + "1 [Erin] [East] \n", + "2 [Perchek] [Industrie] \n", + "3 [Andrey] [Svistunov] \n", + "4 [Phillip] [Suitcases] \n", + "\n", + " fields.ai_description \\\n", + "0 [] \n", + "1 [brown Persian cat on white bench] \n", + "2 [siamese cat] \n", + "3 [orange tabby cat on ground covered with snow ... \n", + "4 [brown and black kitten close-up photography] \n", + "\n", + " fields.photo_url \n", + "0 [https://unsplash.com/photos/gQFZxLe3m4g] \n", + "1 [https://unsplash.com/photos/g5Mhx29yp-A] \n", + "2 [https://unsplash.com/photos/9KZ0PGNCxNE] \n", + "3 [https://unsplash.com/photos/VOZQkkKXvY4] \n", + "4 [https://unsplash.com/photos/wmz8y6L6c_k] " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_id_scorefields.photo_idfields.photo_image_urlfields.photo_descriptionfields.photographer_first_namefields.photographer_last_namefields.ai_descriptionfields.photo_url
0nK5Fh4kBLg4Kd5ySLbKC0.646751[gQFZxLe3m4g][https://images.unsplash.com/photo-15874047871...[instagram.com/vadimsadovski][Vadim][Sadovski][][https://unsplash.com/photos/gQFZxLe3m4g]
1Xa5Eh4kBLg4Kd5yS84Qf0.646751[g5Mhx29yp-A][https://images.unsplash.com/photo-15742591485...[Cute but grumpy cat in the Austrian mountains][Erin][East][brown Persian cat on white bench][https://unsplash.com/photos/g5Mhx29yp-A]
2265Eh4kBLg4Kd5yS84Uf0.642449[9KZ0PGNCxNE][https://images.unsplash.com/photo-15657721010...[][Perchek][Industrie][siamese cat][https://unsplash.com/photos/9KZ0PGNCxNE]
3xq5Fh4kBLg4Kd5ySEpuC0.642160[VOZQkkKXvY4][https://images.unsplash.com/photo-15899942053...[][Andrey][Svistunov][orange tabby cat on ground covered with snow ...[https://unsplash.com/photos/VOZQkkKXvY4]
4WK5Eh4kBLg4Kd5yS5XcD0.641853[wmz8y6L6c_k][https://images.unsplash.com/photo-1556820161-...[][Phillip][Suitcases][brown and black kitten close-up photography][https://unsplash.com/photos/wmz8y6L6c_k]
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# [Optional] Simple streamlit UI\n", + "In the following section, you will view the response in a simple UI for better visualisation.\n", + "\n", + "The query in the previous step did write down a file response `json_data.json` for the UI to load and visualise.\n", + "\n", + "Follow the steps below to see the results in a table." + ], + "metadata": { + "id": "Ry62sfHFHFi9" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Install tunnel library" + ], + "metadata": { + "id": "iUAbRqr8II-x" + } + }, + { + "cell_type": "code", + "source": [ + "!npm install localtunnel" + ], + "metadata": { + "id": "RGEmAt2DjtN7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f6c37d54-7e09-4e59-fc21-8a3db4fa840d" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[K\u001b[?25h\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35msaveError\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[34;40mnotice\u001b[0m\u001b[35m\u001b[0m created a lockfile as package-lock.json. 
You should commit this file.\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35menoent\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No description\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No repository field.\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No README data\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No license field.\n", + "\u001b[0m\n", + "\u001b[K\u001b[?25h+ localtunnel@2.0.2\n", + "added 22 packages from 22 contributors and audited 22 packages in 5.903s\n", + "\n", + "3 packages are looking for funding\n", + " run `npm fund` for details\n", + "\n", + "found \u001b[92m0\u001b[0m vulnerabilities\n", + "\n", + "\u001b[K\u001b[?25h" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create application" + ], + "metadata": { + "id": "KUAfucnYITka" + } + }, + { + "cell_type": "code", + "source": [ + "%%writefile app.py\n", + "\n", + "import streamlit as st\n", + "import json\n", + "import pandas as pd\n", + "\n", + "\n", + "def get_image_preview(image_url):\n", + " \"\"\"Returns an HTML tag with preview of the image.\"\"\"\n", + " return f\"\"\"\"\"\"\n", + "\n", + "\n", + "def get_url_link(photo_url):\n", + " \"\"\"Returns an HTML tag to the image page.\"\"\"\n", + " return f\"\"\" {photo_url} \"\"\"\n", + "\n", + "\n", + "def main():\n", + " \"\"\"Creates a Streamlit app with a table of images.\"\"\"\n", + " data = json.load(open(\"json_data.json\"))\n", + " table = []\n", + " for image in data:\n", + " image_url = image[\"fields\"][\"photo_image_url\"][0]\n", + " image_preview = get_image_preview(image_url)\n", + " photo_url = image[\"fields\"][\"photo_url\"][0]\n", + " 
photo_url_link = get_url_link(photo_url)\n", + " table.append([image_preview, image[\"fields\"][\"photo_id\"][0],\n", + " image[\"fields\"][\"photographer_first_name\"][0],\n", + " image[\"fields\"][\"photographer_last_name\"][0],\n", + " image[\"fields\"][\"photographer_username\"][0],\n", + " photo_url_link])\n", + "\n", + " st.write(pd.DataFrame(table, columns=[\"Image\", \"ID\", \"First Name\", \"Last Name\",\n", + " \"Photographer username\", \"Photo url\"]).to_html(escape = False),\n", + " unsafe_allow_html=True)\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", + "\n" + ], + "metadata": { + "id": "9Wb7GOWMXFnF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6db23ef3-b25e-4f80-a3cb-6d08c1c78c16" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Overwriting app.py\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Run app\n", + "Run the application and check your IP for the tunneling" + ], + "metadata": { + "id": "CjDhvbGhHuiz" + } + }, + { + "cell_type": "code", + "source": [ + "!streamlit run app.py &>/content/logs.txt & curl ipv4.icanhazip.com" + ], + "metadata": { + "id": "851CeYi8jvuF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "46a64023-e990-4900-f482-5558237f08cc" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "34.138.156.22\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create the tunnel\n", + "Run the tunnel and use the link below to connect to the tunnel.\n", + "\n", + "Use the IP from the previous step to connect to the application" + ], + "metadata": { + "id": "4OuSLFHyHy5M" + } + }, + { + "cell_type": "code", + "source": [ + "!npx localtunnel --port 8501" + ], + "metadata": { + "id": "inF7ceBmjyE3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": 
"559ce180-3f0f-4475-c9a9-46dc91389276" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[K\u001b[?25hnpx: installed 22 in 2.186s\n", + "your url is: https://nine-facts-act.loca.lt\n", + "^C\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Resources\n", + "\n", + "Blog: https://www.elastic.co/blog/implement-image-similarity-search-elastic\n", + "\n", + "GH : https://github.com/radoondas/flask-elastic-image-search\n" + ], + "metadata": { + "id": "SbxbVzvQ7caR" + } + } + ] +} \ No newline at end of file From 54ccdf95808ef3e49e21aeeefaa7d7b67adac166 Mon Sep 17 00:00:00 2001 From: Miguel Grinberg Date: Thu, 14 Mar 2024 11:05:29 +0000 Subject: [PATCH 2/3] rename and relocate the notebook --- notebooks/images/image-similarity.ipynb | 1284 ++++++++++++++++++++ notebooks/search/04-image-similarity.ipynb | 1264 ------------------- 2 files changed, 1284 insertions(+), 1264 deletions(-) create mode 100644 notebooks/images/image-similarity.ipynb delete mode 100644 notebooks/search/04-image-similarity.ipynb diff --git a/notebooks/images/image-similarity.ipynb b/notebooks/images/image-similarity.ipynb new file mode 100644 index 00000000..46e01eb1 --- /dev/null +++ b/notebooks/images/image-similarity.ipynb @@ -0,0 +1,1284 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# How to implement Image search using Elasticsearch" + ], + "metadata": { + "id": "CepGq3Kvtdxi" + } + }, + { + "cell_type": "markdown", + "source": [ + "The workbook shows how to implement an Image search using Elasticsearch. 
You will index documents with image embeddings (generated or pre-generated) and then, using an NLP model, be able to search using a natural language description of the image.\n", + "\n", + "### Prerequisites\n", + "Before you start make sure you have an Elasticsearch cluster running. The cluster must have at least one machine learning (ML) node with enough (4GB) memory." + ], + "metadata": { + "id": "oMu1SW_TQQrU" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Install Python requirements\n", + "Before you start you need to install all required Python dependencies." + ], + "metadata": { + "id": "VFcdr8IDQE_H" + } + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6WosfR55npKU", + "outputId": "033767ff-0eef-48cc-c9e7-efbf73c9cb67" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: eland in /usr/local/lib/python3.10/dist-packages (8.7.0)\n", + "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.9.0)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.31.0)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.65.0)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (9.4.0)\n", + "Requirement already satisfied: streamlit in /usr/local/lib/python3.10/dist-packages (1.25.0)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.15.2+cu118)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.22.4)\n", + "Requirement 
already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.10.1)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", + "Requirement already satisfied: huggingface-hub>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.4)\n", + "Requirement already satisfied: pandas>=1.5 in /usr/local/lib/python3.10/dist-packages (from eland) (1.5.3)\n", + "Requirement already satisfied: matplotlib>=3.6 in /usr/local/lib/python3.10/dist-packages (from eland) (3.7.1)\n", + "Requirement already satisfied: elastic-transport<9,>=8 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.4.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.1)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) 
(4.7.1)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.6)\n", + "Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.2.2)\n", + "Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit) (1.4)\n", + "Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.3.1)\n", + "Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.1.6)\n", + "Requirement already satisfied: importlib-metadata<7,>=1.4 in /usr/lib/python3/dist-packages (from streamlit) (4.6.4)\n", + "Requirement already satisfied: protobuf<5,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.20.3)\n", + "Requirement already satisfied: pyarrow>=6.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.0.0)\n", + "Requirement already satisfied: pympler<2,>=0.9 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.0.1)\n", + "Requirement already satisfied: python-dateutil<3,>=2.7.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.8.2)\n", + "Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (13.4.2)\n", + "Requirement already satisfied: tenacity<9,>=8.1.0 in 
/usr/local/lib/python3.10/dist-packages (from streamlit) (8.2.2)\n", + "Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.10.2)\n", + "Requirement already satisfied: tzlocal<5,>=1.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.3.1)\n", + "Requirement already satisfied: validators<1,>=0.2 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.20.0)\n", + "Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.1.32)\n", + "Requirement already satisfied: pydeck<1,>=0.8 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.8.0)\n", + "Requirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.3.1)\n", + "Requirement already satisfied: watchdog>=2.1.5 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.0.0)\n", + "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.4)\n", + "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (4.3.3)\n", + "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.12.0)\n", + "Requirement already satisfied: urllib3<2,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (1.26.16)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2023.7.22)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit) (4.0.10)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", + "Requirement already satisfied: 
contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (4.41.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.4.4)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (3.1.0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.5->eland) (2022.7.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3,>=2.7.3->streamlit) (1.16.0)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (2.14.0)\n", + "Requirement already satisfied: pytz-deprecation-shim in /usr/local/lib/python3.10/dist-packages (from tzlocal<5,>=1.1->streamlit) (0.1.0.post0)\n", + "Requirement already satisfied: decorator>=3.4.0 in /usr/local/lib/python3.10/dist-packages (from validators<1,>=0.2->streamlit) (4.4.2)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages 
(from nltk->sentence-transformers) (1.3.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit) (5.0.0)\n", + "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (23.1.0)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.19.3)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit) (0.1.2)\n", + "Requirement already satisfied: tzdata in /usr/local/lib/python3.10/dist-packages (from pytz-deprecation-shim->tzlocal<5,>=1.1->streamlit) (2023.3)\n" + ] + } + ], + "source": [ + "!pip install sentence-transformers eland elasticsearch transformers torch tqdm Pillow streamlit" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Upload NLP model for querying\n", + "Using an `eland` library you will import NLP CLIP model. The model will tran\n", + "sfer your search query into vector which will be used for the search over the set of images stored in Elasticsearch.\n", + "\n", + "The model used is [clip-ViT-B-32-multilingual-v1](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) because the image embeddings are also generated by the CLIP model.\n", + "\n", + "How to get cloud id? 
Go to [ESS cloud](https://cloud.elastic.co/logout?redirectTo=%2Fhome&reason=unauthorised) and `On the deployment overview page, copy down the Cloud ID.`\n", + "\n", + "The authentication uses an API key (`--es-api-key`). Learn how to generate an [API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html#create-api-key).\n", + "```\n", + "$ eland_import_hub_model --cloud-id $CLOUD_ID \\\n", + " --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 \\\n", + " --task-type text_embedding --es-api-key $API_KEY --start\n", + "```" + ], + "metadata": { + "id": "eIV5lAnVt9L7" + } + }, + { + "cell_type": "code", + "source": [ + "API_KEY = \"\"\n", + "CLOUD_ID = \"\"\n", + "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 --task-type text_embedding --es-api-key $API_KEY --start" + ], + "metadata": { + "id": "tVhL9jBnuAAQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Import Python libraries" + ], + "metadata": { + "id": "hVxWnFflUCZv" + } + }, + { + "cell_type": "code", + "source": [ + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import parallel_bulk\n", + "import requests\n", + "import os\n", + "import sys\n", + "\n", + "# import shutil\n", + "import zipfile\n", + "from tqdm.auto import tqdm\n", + "import pandas as pd\n", + "from PIL import Image\n", + "from sentence_transformers import SentenceTransformer\n", + "import urllib.request\n", + "\n", + "# import urllib.error\n", + "import json\n", + "from getpass import getpass" + ], + "metadata": { + "id": "I0pRCbYMuMVn" + }, + "execution_count": 17, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Connect to Elasticsearch cluster\n", + "Use your own cluster details `ELASTIC_CLOUD_ID`, `API_KEY`."
+ ], + "metadata": { + "id": "Klv3rywdUJBN" + } + }, + { + "cell_type": "code", + "source": [ + "# ESS Cloud connection definition using an API_KEY\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "API_KEY = getpass(\"Elastic API key: \")\n", + "\n", + "# ELASTIC_CLOUD_USER = \"elastic\"\n", + "# CLOUD_PASSWORD = getpass(\"Elastic Password\")\n", + "\n", + "es = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " # basic_auth=(ELASTIC_CLOUD_USER, ELASTIC_CLOUD_PASSWORD),\n", + " api_key=API_KEY,\n", + " request_timeout=600,\n", + ")\n", + "\n", + "es.info() # should return cluster info" + ], + "metadata": { + "id": "YwN8RmFY3FQI", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d0d0e31e-2ad2-46fe-ef8c-8c8bce7e1c48" + }, + "execution_count": 19, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elastic Cloud ID: ··········\n", + "Elastic API key: ··········\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "ObjectApiResponse({'name': 'instance-0000000000', 'cluster_name': 'a597bbe1e0d047c494e7d4015f67ef37', 'cluster_uuid': 'EnT0vwwSSZeAahPw3Vhsuw', 'version': {'number': '8.8.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98e1271edf932a480e4262a471281f1ee295ce6b', 'build_date': '2023-06-26T05:16:16.196344851Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create Index and mappings for Images\n", + "Befor you can index documents into Elasticsearch, you need to create an Index with correct mappings." 
+ ], + "metadata": { + "id": "IW-GIlH2OxB4" + } + }, + { + "cell_type": "code", + "source": [ + "# Destination Index name\n", + "INDEX_NAME = \"images\"\n", + "# If you want to delete previous version of the Index\n", + "DELETE_INDEX = False\n", + "\n", + "INDEX_MAPPING = {\n", + " \"properties\": {\n", + " \"image_embedding\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 512,\n", + " \"index\": True,\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"photo_id\": {\"type\": \"keyword\"},\n", + " \"photo_image_url\": {\"type\": \"keyword\"},\n", + " \"ai_description\": {\"type\": \"text\"},\n", + " \"photo_description\": {\"type\": \"text\"},\n", + " \"photo_url\": {\"type\": \"keyword\"},\n", + " \"photographer_first_name\": {\"type\": \"keyword\"},\n", + " \"photographer_last_name\": {\"type\": \"keyword\"},\n", + " \"photographer_username\": {\"type\": \"keyword\"},\n", + " \"exif_camera_make\": {\"type\": \"keyword\"},\n", + " \"exif_camera_model\": {\"type\": \"keyword\"},\n", + " \"exif_iso\": {\"type\": \"integer\"},\n", + " }\n", + "}\n", + "\n", + "# Index settings\n", + "INDEX_SETTINGS = {\n", + " \"index\": {\n", + " \"number_of_replicas\": \"1\",\n", + " \"number_of_shards\": \"1\",\n", + " \"refresh_interval\": \"5s\",\n", + " }\n", + "}\n", + "\n", + "if DELETE_INDEX:\n", + " if es.indices.exists(index=INDEX_NAME):\n", + " print(\"Deleting existing %s\" % INDEX_NAME)\n", + " es.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", + "\n", + "if not es.indices.exists(index=INDEX_NAME):\n", + " print(\"Creating index %s\" % INDEX_NAME)\n", + " es.indices.create(\n", + " index=INDEX_NAME,\n", + " mappings=INDEX_MAPPING,\n", + " settings=INDEX_SETTINGS,\n", + " ignore=[400, 404],\n", + " )" + ], + "metadata": { + "id": "xAkc1OVcOxy3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Get image dataset and embeddings\n", + "Download:\n", + "- The example image dataset is from 
[Unsplash](https://github.com/unsplash/datasets)\n", + "- The [Image embeddings](https://github.com/radoondas/flask-elastic-nlp/blob/main/embeddings/blogs/blogs-no-embeddings.json.zip) are pre-generated using CLIP model\n", + "\n", + "Then unzip both files." + ], + "metadata": { + "id": "NKE-j0kPUMn_" + } + }, + { + "cell_type": "code", + "source": [ + "!wget https://unsplash.com/data/lite/1.2.0 -O data/unsplash-research-dataset-lite-1.2.0.zip\n", + "!wget https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip -P data" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zFGaPDRR5mqT", + "outputId": "0114cdd6-a714-41ab-9b46-3013bd36698a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-07-25 14:28:32-- https://unsplash.com/data/lite/1.2.0\n", + "Resolving unsplash.com (unsplash.com)... 151.101.65.181, 151.101.1.181, 151.101.129.181, ...\n", + "Connecting to unsplash.com (unsplash.com)|151.101.65.181|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip [following]\n", + "--2023-07-25 14:28:32-- https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip\n", + "Resolving unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)... 52.217.102.84, 3.5.25.253, 52.217.96.188, ...\n", + "Connecting to unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)|52.217.102.84|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 632351052 (603M) [application/zip]\n", + "Saving to: ‘data/unsplash-research-dataset-lite-1.2.0.zip’\n", + "\n", + "data/unsplash-resea 100%[===================>] 603.06M 14.1MB/s in 42s \n", + "\n", + "2023-07-25 14:29:16 (14.2 MB/s) - ‘data/unsplash-research-dataset-lite-1.2.0.zip’ saved [632351052/632351052]\n", + "\n", + "--2023-07-25 14:29:16-- https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 94256217 (90M) [application/zip]\n", + "Saving to: ‘data/image-embeddings.json.zip.1’\n", + "\n", + "image-embeddings.js 100%[===================>] 89.89M 164MB/s in 0.5s \n", + "\n", + "2023-07-25 14:29:16 (164 MB/s) - ‘data/image-embeddings.json.zip.1’ saved [94256217/94256217]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Unzip downloaded files\n", + "UNSPLASH_ZIP_FILE = \"data/unsplash-research-dataset-lite-1.2.0.zip\"\n", + "EMBEDDINGS_ZIP_FILE = \"data/image-embeddings.json.zip\"\n", + "\n", + "with zipfile.ZipFile(UNSPLASH_ZIP_FILE, \"r\") as zip_ref:\n", + " print(\"Extracting file \", UNSPLASH_ZIP_FILE, \".\")\n", + " zip_ref.extractall(\"data/unsplash/\")\n", + "\n", + "with zipfile.ZipFile(EMBEDDINGS_ZIP_FILE, \"r\") as zip_ref:\n", + " print(\"Extracting file \", EMBEDDINGS_ZIP_FILE, \".\")\n", + " zip_ref.extractall(\"data/embeddings/\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MBh4AQ8i7C0-", + "outputId": "17a50b7f-f052-4b72-daa8-0e8fc630326f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Extracting file 
data/unsplash-research-dataset-lite-1.2.0.zip .\n", + "Extracting file data/image-embeddings.json.zip .\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Image embeddings\n", + "We have now 2 options how to proceed.\n", + "1. Import all pregenerated image embeddings (~19k). This is faster option with a lot of images available in a short time.\n", + "2. Import a small subset of randomly choosen images to see the process of generating of image embeddings using external Clip model." + ], + "metadata": { + "id": "p6H7QYctQQA7" + } + }, + { + "cell_type": "code", + "source": [ + "# define helper function\n", + "def gen_rows(df):\n", + " for doc in df.to_dict(orient=\"records\"):\n", + " yield doc" + ], + "metadata": { + "id": "03YvC-_JY9OE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Import all pregenerated image embeddings\n", + "This option lets you inport ~19k documents woth pregenenerated image embeddings with metadata.\n", + "\n", + "The process downloads files with images information, merge them and index into Elasticsearch." 
+ ], + "metadata": { + "id": "qhZRdUyAQd-s" + } + }, + { + "cell_type": "code", + "source": [ + "df_unsplash = pd.read_csv(\"data/unsplash/\" + \"photos.tsv000\", sep=\"\\t\", header=0)\n", + "\n", + "# follwing 8 lines are fix for inconsistent/incorrect data\n", + "df_unsplash[\"photo_description\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"ai_description\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"photographer_first_name\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"photographer_last_name\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"photographer_username\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"exif_camera_make\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"exif_camera_model\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"exif_iso\"].fillna(0, inplace=True)\n", + "## end of fix\n", + "\n", + "# read subset of columns from the original/downloaded dataset\n", + "df_unsplash_subset = df_unsplash[\n", + " [\n", + " \"photo_id\",\n", + " \"photo_url\",\n", + " \"photo_image_url\",\n", + " \"photo_description\",\n", + " \"ai_description\",\n", + " \"photographer_first_name\",\n", + " \"photographer_last_name\",\n", + " \"photographer_username\",\n", + " \"exif_camera_make\",\n", + " \"exif_camera_model\",\n", + " \"exif_iso\",\n", + " ]\n", + "]\n", + "\n", + "# read all pregenerated embeddings\n", + "df_embeddings = pd.read_json(\"data/embeddings/\" + \"image-embeddings.json\", lines=True)\n", + "\n", + "df_merged = pd.merge(df_unsplash_subset, df_embeddings, on=\"photo_id\", how=\"inner\")\n", + "\n", + "count = 0\n", + "for success, info in parallel_bulk(\n", + " client=es,\n", + " actions=gen_rows(df_merged),\n", + " thread_count=5,\n", + " chunk_size=1000,\n", + " index=INDEX_NAME,\n", + "):\n", + " if success:\n", + " count += 1\n", + " if count % 1000 == 0:\n", + " print(\"Indexed %s documents\" % str(count), flush=True)\n", + " sys.stdout.flush()\n", + " else:\n", + " print(\"Doc failed\", info)\n", + "\n", + 
"print(\"Indexed %s image embeddings documents\" % str(count), flush=True)\n", + "sys.stdout.flush()" + ], + "metadata": { + "id": "32xrbSUXTODQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 2. Import small number of random\n", + "In this part you will randomly choose small number of images and then generate image embeddings. The script will then index documents into Elasticserach." + ], + "metadata": { + "id": "xypgh4lFQmkc" + } + }, + { + "cell_type": "code", + "source": [ + "NUMBER_OF_IMAGES = 20\n", + "INDEX_NAME = \"images-test\"\n", + "\n", + "df_unsplash = pd.read_csv(\"data/unsplash/\" + \"photos.tsv000\", sep=\"\\t\", header=0)\n", + "\n", + "## stat fix\n", + "# follwing 8 lines are fix for inconsistent/incorrect data\n", + "df_unsplash[\"photo_description\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"ai_description\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"photographer_first_name\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"photographer_last_name\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"photographer_username\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"exif_camera_make\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"exif_camera_model\"].fillna(\"\", inplace=True)\n", + "df_unsplash[\"exif_iso\"].fillna(0, inplace=True)\n", + "## end of fix\n", + "\n", + "df_unsplash_subset = df_unsplash[\n", + " [\n", + " \"photo_id\",\n", + " \"photo_url\",\n", + " \"photo_image_url\",\n", + " \"photo_description\",\n", + " \"ai_description\",\n", + " \"photographer_first_name\",\n", + " \"photographer_last_name\",\n", + " \"photographer_username\",\n", + " \"exif_camera_make\",\n", + " \"exif_camera_model\",\n", + " \"exif_iso\",\n", + " ]\n", + "]\n", + "\n", + "df_random_subset = df_unsplash_subset.sample(n=NUMBER_OF_IMAGES, replace=False)\n", + "df_random_subset = df_random_subset.reset_index()\n", + "\n", + "# Load model CLIP\n", + "img_model = 
SentenceTransformer(\"clip-ViT-B-32\")\n", + "\n", + "# new list of image documents for indexing into ES\n", + "lst = []\n", + "if not os.path.exists(\"data/images\"):\n", + " os.mkdir(\"data/images\")\n", + "\n", + "for index, row in df_random_subset.iterrows():\n", + " # open image from url\n", + " img_path = \"data/images/\" + row[\"photo_id\"]\n", + " try:\n", + " urllib.request.urlretrieve(row[\"photo_image_url\"], img_path)\n", + " print(row[\"photo_id\"] + \" \" + row[\"photo_url\"])\n", + " except urllib.error.HTTPError as err:\n", + " if err.code == 404:\n", + " print(\"404 error: Image not found at {}\".format(row[\"photo_image_url\"]))\n", + " else:\n", + " raise\n", + "\n", + " img = Image.open(img_path)\n", + " # create doc\n", + " doc = {}\n", + " embedding = img_model.encode(img)\n", + " doc[\"photo_id\"] = row[\"photo_id\"]\n", + " doc[\"image_embedding\"] = embedding.tolist()\n", + " lst.append(doc)\n", + " # print(doc)\n", + "\n", + " # Image cleanup.\n", + " # If file exists, delete it.\n", + " if os.path.exists(img_path):\n", + " os.remove(img_path)\n", + "\n", + "# read all pregenerated embeddings\n", + "df_embeddings = pd.read_json(\"data/embeddings/\" + \"image-embeddings.json\", lines=True)\n", + "\n", + "df_merged = pd.merge(df_random_subset, pd.DataFrame(lst), on=\"photo_id\", how=\"inner\")\n", + "# print(df_merged)\n", + "\n", + "count = 0\n", + "for success, info in parallel_bulk(\n", + " client=es,\n", + " actions=gen_rows(df_merged),\n", + " thread_count=5,\n", + " chunk_size=10,\n", + " index=INDEX_NAME,\n", + "):\n", + " if success:\n", + " count += 1\n", + " if count % 10 == 0:\n", + " print(\"Indexed %s documents\" % str(count), flush=True)\n", + " sys.stdout.flush()\n", + " else:\n", + " print(\"Doc failed\", info)\n", + "\n", + "print(\"Indexed %s image embeddings documents\" % str(count), flush=True)\n", + "sys.stdout.flush()" + ], + "metadata": { + "id": "r_txQjP2RKnr" + }, + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "markdown", + "source": [ + "### Query the image dataset\n", + "The next step is to run a query to search for images. The example query searches for `\"model_text\": \"Valentine day flowers\"` using the model we uploaded to Elasticsearch `sentence-transformers__clip-vit-b-32-multilingual-v1`.\n", + "\n", + "The process is one query even it internaly consists of two tasks. One is to tramsform your search text into a vector using the NLP model and the second task is to run the vector search over the image dataset.\n", + "```\n", + "POST images/_search\n", + "{\n", + " \"knn\": {\n", + " \"field\": \"image_embedding\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 10,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__clip-vit-b-32-multilingual-v1\",\n", + " \"model_text\": \"Valentine day flowers\"\n", + " }\n", + " }\n", + " },\n", + " \"fields\": [\n", + " \"photo_description\",\n", + " \"ai_description\",\n", + " \"photo_url\"\n", + " ],\n", + " \"_source\": false\n", + "}\n", + "```\n", + "\n" + ], + "metadata": { + "id": "-_i2CIpSz9vw" + } + }, + { + "cell_type": "code", + "source": [ + "# Search queary\n", + "WHAT_ARE_YOU_LOOKING_FOR = \"Valentine day flowers\"\n", + "INDEX_IM_EMBED = \"images\"\n", + "\n", + "source_fields = [\n", + " \"photo_description\",\n", + " \"ai_description\",\n", + " \"photo_url\",\n", + " \"photo_image_url\",\n", + " \"photographer_first_name\",\n", + " \"photographer_username\",\n", + " \"photographer_last_name\",\n", + " \"photo_id\",\n", + "]\n", + "query = {\n", + " \"field\": \"image_embedding\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__clip-vit-b-32-multilingual-v1\",\n", + " \"model_text\": WHAT_ARE_YOU_LOOKING_FOR,\n", + " }\n", + " },\n", + "}\n", + "\n", + "response = es.search(\n", + " index=INDEX_IM_EMBED, fields=source_fields, 
knn=query, source=False\n", + ")\n", + "\n", + "print(response.body)\n", + "\n", + "# the code writes the response into a file for the streamlit UI used in the optional step.\n", + "with open(\"json_data.json\", \"w\") as outfile:\n", + " json.dump(response.body[\"hits\"][\"hits\"], outfile)\n", + "\n", + "# Use the `loads()` method to load the JSON data\n", + "dfr = json.loads(json.dumps(response.body[\"hits\"][\"hits\"]))\n", + "# Pass the generated JSON data into a pandas dataframe\n", + "dfr = pd.DataFrame(dfr)\n", + "# Print the data frame\n", + "dfr\n", + "\n", + "results = pd.json_normalize(json.loads(json.dumps(response.body[\"hits\"][\"hits\"])))\n", + "# results\n", + "results[\n", + " [\n", + " \"_id\",\n", + " \"_score\",\n", + " \"fields.photo_id\",\n", + " \"fields.photo_image_url\",\n", + " \"fields.photo_description\",\n", + " \"fields.photographer_first_name\",\n", + " \"fields.photographer_last_name\",\n", + " \"fields.ai_description\",\n", + " \"fields.photo_url\",\n", + " ]\n", + "]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 375 + }, + "id": "wdicpvRlzmXG", + "outputId": "00550041-0aed-4f51-ccd3-18eb705ff7ed" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'took': 114, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 0.646751, 'hits': [{'_index': 'images', '_id': 'nK5Fh4kBLg4Kd5ySLbKC', '_score': 0.646751, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1587404787163-d03a28fcc1f0'], 'photo_id': ['gQFZxLe3m4g'], 'photographer_first_name': ['Vadim'], 'photo_description': ['instagram.com/vadimsadovski'], 'photographer_last_name': ['Sadovski'], 'photo_url': ['https://unsplash.com/photos/gQFZxLe3m4g'], 'photographer_username': ['vadimsadovski'], 'ai_description': ['']}}, {'_index': 'images', '_id': 'Xa5Eh4kBLg4Kd5yS84Qf', 
'_score': 0.64675057, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1574259148543-dd376a61339f'], 'photo_id': ['g5Mhx29yp-A'], 'photographer_first_name': ['Erin'], 'photo_description': ['Cute but grumpy cat in the Austrian mountains'], 'photographer_last_name': ['East'], 'photo_url': ['https://unsplash.com/photos/g5Mhx29yp-A'], 'photographer_username': ['mserineast'], 'ai_description': ['brown Persian cat on white bench']}}, {'_index': 'images', '_id': '265Eh4kBLg4Kd5yS84Uf', '_score': 0.64244866, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1565772101068-dec21f7b36eb'], 'photo_id': ['9KZ0PGNCxNE'], 'photographer_first_name': ['Perchek'], 'photo_description': [''], 'photographer_last_name': ['Industrie'], 'photo_url': ['https://unsplash.com/photos/9KZ0PGNCxNE'], 'photographer_username': ['perchek_industrie'], 'ai_description': ['siamese cat']}}, {'_index': 'images', '_id': 'xq5Fh4kBLg4Kd5ySEpuC', '_score': 0.64216036, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1589994205353-325f40210d02'], 'photo_id': ['VOZQkkKXvY4'], 'photographer_first_name': ['Andrey'], 'photo_description': [''], 'photographer_last_name': ['Svistunov'], 'photo_url': ['https://unsplash.com/photos/VOZQkkKXvY4'], 'photographer_username': ['svistal13'], 'ai_description': ['orange tabby cat on ground covered with snow during daytime']}}, {'_index': 'images', '_id': 'WK5Eh4kBLg4Kd5yS5XcD', '_score': 0.64185303, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1556820161-b605d166fce1'], 'photo_id': ['wmz8y6L6c_k'], 'photographer_first_name': ['Phillip'], 'photo_description': [''], 'photographer_last_name': ['Suitcases'], 'photo_url': ['https://unsplash.com/photos/wmz8y6L6c_k'], 'photographer_username': ['nillait'], 'ai_description': ['brown and black kitten close-up photography']}}]}}\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " _id _score fields.photo_id \\\n", + "0 
nK5Fh4kBLg4Kd5ySLbKC 0.646751 [gQFZxLe3m4g] \n", + "1 Xa5Eh4kBLg4Kd5yS84Qf 0.646751 [g5Mhx29yp-A] \n", + "2 265Eh4kBLg4Kd5yS84Uf 0.642449 [9KZ0PGNCxNE] \n", + "3 xq5Fh4kBLg4Kd5ySEpuC 0.642160 [VOZQkkKXvY4] \n", + "4 WK5Eh4kBLg4Kd5yS5XcD 0.641853 [wmz8y6L6c_k] \n", + "\n", + " fields.photo_image_url \\\n", + "0 [https://images.unsplash.com/photo-15874047871... \n", + "1 [https://images.unsplash.com/photo-15742591485... \n", + "2 [https://images.unsplash.com/photo-15657721010... \n", + "3 [https://images.unsplash.com/photo-15899942053... \n", + "4 [https://images.unsplash.com/photo-1556820161-... \n", + "\n", + " fields.photo_description \\\n", + "0 [instagram.com/vadimsadovski] \n", + "1 [Cute but grumpy cat in the Austrian mountains] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + "\n", + " fields.photographer_first_name fields.photographer_last_name \\\n", + "0 [Vadim] [Sadovski] \n", + "1 [Erin] [East] \n", + "2 [Perchek] [Industrie] \n", + "3 [Andrey] [Svistunov] \n", + "4 [Phillip] [Suitcases] \n", + "\n", + " fields.ai_description \\\n", + "0 [] \n", + "1 [brown Persian cat on white bench] \n", + "2 [siamese cat] \n", + "3 [orange tabby cat on ground covered with snow ... \n", + "4 [brown and black kitten close-up photography] \n", + "\n", + " fields.photo_url \n", + "0 [https://unsplash.com/photos/gQFZxLe3m4g] \n", + "1 [https://unsplash.com/photos/g5Mhx29yp-A] \n", + "2 [https://unsplash.com/photos/9KZ0PGNCxNE] \n", + "3 [https://unsplash.com/photos/VOZQkkKXvY4] \n", + "4 [https://unsplash.com/photos/wmz8y6L6c_k] " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_id_scorefields.photo_idfields.photo_image_urlfields.photo_descriptionfields.photographer_first_namefields.photographer_last_namefields.ai_descriptionfields.photo_url
0nK5Fh4kBLg4Kd5ySLbKC0.646751[gQFZxLe3m4g][https://images.unsplash.com/photo-15874047871...[instagram.com/vadimsadovski][Vadim][Sadovski][][https://unsplash.com/photos/gQFZxLe3m4g]
1Xa5Eh4kBLg4Kd5yS84Qf0.646751[g5Mhx29yp-A][https://images.unsplash.com/photo-15742591485...[Cute but grumpy cat in the Austrian mountains][Erin][East][brown Persian cat on white bench][https://unsplash.com/photos/g5Mhx29yp-A]
2265Eh4kBLg4Kd5yS84Uf0.642449[9KZ0PGNCxNE][https://images.unsplash.com/photo-15657721010...[][Perchek][Industrie][siamese cat][https://unsplash.com/photos/9KZ0PGNCxNE]
3xq5Fh4kBLg4Kd5ySEpuC0.642160[VOZQkkKXvY4][https://images.unsplash.com/photo-15899942053...[][Andrey][Svistunov][orange tabby cat on ground covered with snow ...[https://unsplash.com/photos/VOZQkkKXvY4]
4WK5Eh4kBLg4Kd5yS5XcD0.641853[wmz8y6L6c_k][https://images.unsplash.com/photo-1556820161-...[][Phillip][Suitcases][brown and black kitten close-up photography][https://unsplash.com/photos/wmz8y6L6c_k]
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# [Optional] Simple streamlit UI\n", + "In the following section, you will view the response in a simple UI for better visualisation.\n", + "\n", + "The query in the previous step did write down a file response `json_data.json` for the UI to load and visualise.\n", + "\n", + "Follow the steps below to see the results in a table." + ], + "metadata": { + "id": "Ry62sfHFHFi9" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Install tunnel library" + ], + "metadata": { + "id": "iUAbRqr8II-x" + } + }, + { + "cell_type": "code", + "source": [ + "!npm install localtunnel" + ], + "metadata": { + "id": "RGEmAt2DjtN7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f6c37d54-7e09-4e59-fc21-8a3db4fa840d" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[K\u001b[?25h\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35msaveError\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[34;40mnotice\u001b[0m\u001b[35m\u001b[0m created a lockfile as package-lock.json. 
You should commit this file.\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35menoent\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No description\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No repository field.\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No README data\n", + "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No license field.\n", + "\u001b[0m\n", + "\u001b[K\u001b[?25h+ localtunnel@2.0.2\n", + "added 22 packages from 22 contributors and audited 22 packages in 5.903s\n", + "\n", + "3 packages are looking for funding\n", + " run `npm fund` for details\n", + "\n", + "found \u001b[92m0\u001b[0m vulnerabilities\n", + "\n", + "\u001b[K\u001b[?25h" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create application" + ], + "metadata": { + "id": "KUAfucnYITka" + } + }, + { + "cell_type": "code", + "source": [ + "%%writefile app.py\n", + "\n", + "import streamlit as st\n", + "import json\n", + "import pandas as pd\n", + "\n", + "\n", + "def get_image_preview(image_url):\n", + " \"\"\"Returns an HTML tag with preview of the image.\"\"\"\n", + " return f\"\"\"\"\"\"\n", + "\n", + "\n", + "def get_url_link(photo_url):\n", + " \"\"\"Returns an HTML tag to the image page.\"\"\"\n", + " return f\"\"\" {photo_url} \"\"\"\n", + "\n", + "\n", + "def main():\n", + " \"\"\"Creates a Streamlit app with a table of images.\"\"\"\n", + " data = json.load(open(\"json_data.json\"))\n", + " table = []\n", + " for image in data:\n", + " image_url = image[\"fields\"][\"photo_image_url\"][0]\n", + " image_preview = get_image_preview(image_url)\n", + " photo_url = image[\"fields\"][\"photo_url\"][0]\n", + " 
photo_url_link = get_url_link(photo_url)\n", + " table.append([image_preview, image[\"fields\"][\"photo_id\"][0],\n", + " image[\"fields\"][\"photographer_first_name\"][0],\n", + " image[\"fields\"][\"photographer_last_name\"][0],\n", + " image[\"fields\"][\"photographer_username\"][0],\n", + " photo_url_link])\n", + "\n", + " st.write(pd.DataFrame(table, columns=[\"Image\", \"ID\", \"First Name\", \"Last Name\",\n", + " \"Photographer username\", \"Photo url\"]).to_html(escape = False),\n", + " unsafe_allow_html=True)\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", + "\n" + ], + "metadata": { + "id": "9Wb7GOWMXFnF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6db23ef3-b25e-4f80-a3cb-6d08c1c78c16" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Overwriting app.py\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Run app\n", + "Run the application and check your IP for the tunneling" + ], + "metadata": { + "id": "CjDhvbGhHuiz" + } + }, + { + "cell_type": "code", + "source": [ + "!streamlit run app.py &>/content/logs.txt & curl ipv4.icanhazip.com" + ], + "metadata": { + "id": "851CeYi8jvuF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "46a64023-e990-4900-f482-5558237f08cc" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "34.138.156.22\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create the tunnel\n", + "Run the tunnel and use the link below to connect to the tunnel.\n", + "\n", + "Use the IP from the previous step to connect to the application" + ], + "metadata": { + "id": "4OuSLFHyHy5M" + } + }, + { + "cell_type": "code", + "source": [ + "!npx localtunnel --port 8501" + ], + "metadata": { + "id": "inF7ceBmjyE3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": 
"559ce180-3f0f-4475-c9a9-46dc91389276" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[K\u001b[?25hnpx: installed 22 in 2.186s\n", + "your url is: https://nine-facts-act.loca.lt\n", + "^C\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Resources\n", + "\n", + "Blog: https://www.elastic.co/blog/implement-image-similarity-search-elastic\n", + "\n", + "GH : https://github.com/radoondas/flask-elastic-image-search\n" + ], + "metadata": { + "id": "SbxbVzvQ7caR" + } + } + ] +} \ No newline at end of file diff --git a/notebooks/search/04-image-similarity.ipynb b/notebooks/search/04-image-similarity.ipynb deleted file mode 100644 index b5069481..00000000 --- a/notebooks/search/04-image-similarity.ipynb +++ /dev/null @@ -1,1264 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# How to implement Image search using Elasticsearch" - ], - "metadata": { - "id": "CepGq3Kvtdxi" - } - }, - { - "cell_type": "markdown", - "source": [ - "The workbook shows how to implement an Image search using Elasticsearch. You will index documents with image embeddings (generated or pre-generated) and then using NLP model be able to search using natural language description of the image.\n", - "\n", - "### Prerequisities\n", - "Before you start make sure you have Elasticsearch cluster running. The cluster must have at least one machine learning (ML) node with enough (4GB) memory." - ], - "metadata": { - "id": "oMu1SW_TQQrU" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Install Python requirements\n", - "Before you start you need to install all required Python dependencies." 
- ], - "metadata": { - "id": "VFcdr8IDQE_H" - } - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6WosfR55npKU", - "outputId": "033767ff-0eef-48cc-c9e7-efbf73c9cb67" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", - "Requirement already satisfied: eland in /usr/local/lib/python3.10/dist-packages (8.7.0)\n", - "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.9.0)\n", - "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.31.0)\n", - "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.65.0)\n", - "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (9.4.0)\n", - "Requirement already satisfied: streamlit in /usr/local/lib/python3.10/dist-packages (1.25.0)\n", - "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.15.2+cu118)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.22.4)\n", - "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.10.1)\n", - "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", - "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", - "Requirement already satisfied: huggingface-hub>=0.4.0 in 
/usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.4)\n", - "Requirement already satisfied: pandas>=1.5 in /usr/local/lib/python3.10/dist-packages (from eland) (1.5.3)\n", - "Requirement already satisfied: matplotlib>=3.6 in /usr/local/lib/python3.10/dist-packages (from eland) (3.7.1)\n", - "Requirement already satisfied: elastic-transport<9,>=8 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.4.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", - "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.1)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.7.1)\n", - "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", - "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", - "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from 
triton==2.0.0->torch) (3.25.2)\n", - "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.6)\n", - "Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.2.2)\n", - "Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit) (1.4)\n", - "Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.3.1)\n", - "Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.1.6)\n", - "Requirement already satisfied: importlib-metadata<7,>=1.4 in /usr/lib/python3/dist-packages (from streamlit) (4.6.4)\n", - "Requirement already satisfied: protobuf<5,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.20.3)\n", - "Requirement already satisfied: pyarrow>=6.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.0.0)\n", - "Requirement already satisfied: pympler<2,>=0.9 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.0.1)\n", - "Requirement already satisfied: python-dateutil<3,>=2.7.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.8.2)\n", - "Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (13.4.2)\n", - "Requirement already satisfied: tenacity<9,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.2.2)\n", - "Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.10.2)\n", - "Requirement already satisfied: tzlocal<5,>=1.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.3.1)\n", - "Requirement already satisfied: validators<1,>=0.2 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.20.0)\n", - "Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in /usr/local/lib/python3.10/dist-packages (from 
streamlit) (3.1.32)\n", - "Requirement already satisfied: pydeck<1,>=0.8 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.8.0)\n", - "Requirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.3.1)\n", - "Requirement already satisfied: watchdog>=2.1.5 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.0.0)\n", - "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.4)\n", - "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (4.3.3)\n", - "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.12.0)\n", - "Requirement already satisfied: urllib3<2,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (1.26.16)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2023.7.22)\n", - "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit) (4.0.10)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.1.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (0.11.0)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (4.41.1)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.4.4)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (3.1.0)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.5->eland) (2022.7.1)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3,>=2.7.3->streamlit) (1.16.0)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (2.14.0)\n", - "Requirement already satisfied: pytz-deprecation-shim in /usr/local/lib/python3.10/dist-packages (from tzlocal<5,>=1.1->streamlit) (0.1.0.post0)\n", - "Requirement already satisfied: decorator>=3.4.0 in /usr/local/lib/python3.10/dist-packages (from validators<1,>=0.2->streamlit) (4.4.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.1)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", - "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit) (5.0.0)\n", - "Requirement already satisfied: attrs>=17.4.0 in 
/usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (23.1.0)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.19.3)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit) (0.1.2)\n", - "Requirement already satisfied: tzdata in /usr/local/lib/python3.10/dist-packages (from pytz-deprecation-shim->tzlocal<5,>=1.1->streamlit) (2023.3)\n" - ] - } - ], - "source": [ - "!pip install sentence-transformers eland elasticsearch transformers torch tqdm Pillow streamlit" - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Upload NLP model for querying\n", - "Using an `eland` library you will import NLP CLIP model. The model will tran\n", - "sfer your search query into vector which will be used for the search over the set of images stored in Elasticsearch.\n", - "\n", - "The model used is [clip-ViT-B-32-multilingual-v1](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) because the image embeddings are also generated by the CLIP model.\n", - "\n", - "How to get cloud id? Go to [ESS cloud](https://cloud.elastic.co/logout?redirectTo=%2Fhome&reason=unauthorised) and `On the deployment overview page, copy down the Cloud ID.`\n", - "\n", - "The authentication is using api key (`--es-api-key`). 
Learn how to generate [API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html#create-api-key).\n", - "```\n", - "$ eland_import_hub_model --cloud-id $CLOUD_ID \\\n", - " --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 \\\n", - " --task-type text_embedding --es-api-key $API_KEY --start\n", - "```" - ], - "metadata": { - "id": "eIV5lAnVt9L7" - } - }, - { - "cell_type": "code", - "source": [ - "API_KEY=''\n", - "CLOUD_ID=''\n", - "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 --task-type text_embedding --es-api-key API_KEY --start" - ], - "metadata": { - "id": "tVhL9jBnuAAQ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Import Python libraries" - ], - "metadata": { - "id": "hVxWnFflUCZv" - } - }, - { - "cell_type": "code", - "source": [ - "from elasticsearch import Elasticsearch\n", - "from elasticsearch.helpers import parallel_bulk\n", - "import requests\n", - "import os\n", - "import sys\n", - "# import shutil\n", - "import zipfile\n", - "from tqdm.auto import tqdm\n", - "import pandas as pd\n", - "from PIL import Image\n", - "from sentence_transformers import SentenceTransformer\n", - "import urllib.request\n", - "# import urllib.error\n", - "import json\n", - "from getpass import getpass" - ], - "metadata": { - "id": "I0pRCbYMuMVn" - }, - "execution_count": 17, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Connect to Elasticsearch cluster\n", - "Use your own cluster details `ELASTIC_CLOUD_ID`, `API_KEY`." 
- ], - "metadata": { - "id": "Klv3rywdUJBN" - } - }, - { - "cell_type": "code", - "source": [ - "# ESS Cloud connection definition using an API_KEY\n", - "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", - "API_KEY = getpass(\"Elastic API key: \")\n", - "\n", - "# ELASTIC_CLOUD_USER = \"elastic\"\n", - "# CLOUD_PASSWORD = getpass(\"Elastic Password\")\n", - "\n", - "es = Elasticsearch(\n", - " cloud_id=ELASTIC_CLOUD_ID,\n", - " #basic_auth=(ELASTIC_CLOUD_USER, ELASTIC_CLOUD_PASSWORD),\n", - " api_key=API_KEY,\n", - " request_timeout=600\n", - ")\n", - "\n", - "es.info() # should return cluster info" - ], - "metadata": { - "id": "YwN8RmFY3FQI", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "d0d0e31e-2ad2-46fe-ef8c-8c8bce7e1c48" - }, - "execution_count": 19, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elastic Cloud ID: ··········\n", - "Elastic API key: ··········\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "ObjectApiResponse({'name': 'instance-0000000000', 'cluster_name': 'a597bbe1e0d047c494e7d4015f67ef37', 'cluster_uuid': 'EnT0vwwSSZeAahPw3Vhsuw', 'version': {'number': '8.8.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98e1271edf932a480e4262a471281f1ee295ce6b', 'build_date': '2023-06-26T05:16:16.196344851Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" - ] - }, - "metadata": {}, - "execution_count": 19 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Create Index and mappings for Images\n", - "Befor you can index documents into Elasticsearch, you need to create an Index with correct mappings." 
- ], - "metadata": { - "id": "IW-GIlH2OxB4" - } - }, - { - "cell_type": "code", - "source": [ - "# Destination Index name\n", - "INDEX_NAME=\"images\"\n", - "# If you want to delete previous version of the Index\n", - "DELETE_INDEX=False\n", - "\n", - "INDEX_MAPPING = {\n", - " \"properties\": {\n", - " \"image_embedding\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 512,\n", - " \"index\": True,\n", - " \"similarity\": \"cosine\"\n", - " },\n", - " \"photo_id\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"photo_image_url\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"ai_description\": {\n", - " \"type\": \"text\"\n", - " },\n", - " \"photo_description\": {\n", - " \"type\": \"text\"\n", - " },\n", - " \"photo_url\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"photographer_first_name\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"photographer_last_name\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"photographer_username\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"exif_camera_make\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"exif_camera_model\": {\n", - " \"type\": \"keyword\"\n", - " },\n", - " \"exif_iso\": {\n", - " \"type\": \"integer\"\n", - " }\n", - " }\n", - " }\n", - "\n", - "# Index settings\n", - "INDEX_SETTINGS = {\n", - " \"index\": {\n", - " \"number_of_replicas\": \"1\",\n", - " \"number_of_shards\": \"1\",\n", - " \"refresh_interval\": \"5s\"\n", - " }\n", - "}\n", - "\n", - "if(DELETE_INDEX):\n", - " if es.indices.exists(index=INDEX_NAME):\n", - " print(\"Deleting existing %s\" % INDEX_NAME)\n", - " es.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", - "\n", - "if not es.indices.exists(index=INDEX_NAME):\n", - " print(\"Creating index %s\" % INDEX_NAME)\n", - " es.indices.create(index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS,\n", - " ignore=[400, 404])\n" - ], - "metadata": { - "id": "xAkc1OVcOxy3" - }, - "execution_count": null, - "outputs": [] 
- }, - { - "cell_type": "markdown", - "source": [ - "### Get image dataset and embeddings\n", - "Download:\n", - "- The example image dataset is from [Unsplash](https://github.com/unsplash/datasets)\n", - "- The [Image embeddings](https://github.com/radoondas/flask-elastic-nlp/blob/main/embeddings/blogs/blogs-no-embeddings.json.zip) are pre-generated using CLIP model\n", - "\n", - "Then unzip both files." - ], - "metadata": { - "id": "NKE-j0kPUMn_" - } - }, - { - "cell_type": "code", - "source": [ - "!wget https://unsplash.com/data/lite/1.2.0 -O data/unsplash-research-dataset-lite-1.2.0.zip\n", - "!wget https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip -P data" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zFGaPDRR5mqT", - "outputId": "0114cdd6-a714-41ab-9b46-3013bd36698a" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2023-07-25 14:28:32-- https://unsplash.com/data/lite/1.2.0\n", - "Resolving unsplash.com (unsplash.com)... 151.101.65.181, 151.101.1.181, 151.101.129.181, ...\n", - "Connecting to unsplash.com (unsplash.com)|151.101.65.181|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip [following]\n", - "--2023-07-25 14:28:32-- https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip\n", - "Resolving unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)... 52.217.102.84, 3.5.25.253, 52.217.96.188, ...\n", - "Connecting to unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)|52.217.102.84|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 632351052 (603M) [application/zip]\n", - "Saving to: ‘data/unsplash-research-dataset-lite-1.2.0.zip’\n", - "\n", - "data/unsplash-resea 100%[===================>] 603.06M 14.1MB/s in 42s \n", - "\n", - "2023-07-25 14:29:16 (14.2 MB/s) - ‘data/unsplash-research-dataset-lite-1.2.0.zip’ saved [632351052/632351052]\n", - "\n", - "--2023-07-25 14:29:16-- https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 94256217 (90M) [application/zip]\n", - "Saving to: ‘data/image-embeddings.json.zip.1’\n", - "\n", - "image-embeddings.js 100%[===================>] 89.89M 164MB/s in 0.5s \n", - "\n", - "2023-07-25 14:29:16 (164 MB/s) - ‘data/image-embeddings.json.zip.1’ saved [94256217/94256217]\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# Unzip downloaded files\n", - "UNSPLASH_ZIP_FILE=\"data/unsplash-research-dataset-lite-1.2.0.zip\"\n", - "EMBEDDINGS_ZIP_FILE=\"data/image-embeddings.json.zip\"\n", - "\n", - "with zipfile.ZipFile(UNSPLASH_ZIP_FILE, 'r') as zip_ref:\n", - " print('Extracting file ', UNSPLASH_ZIP_FILE, '.')\n", - " zip_ref.extractall('data/unsplash/')\n", - "\n", - "with zipfile.ZipFile(EMBEDDINGS_ZIP_FILE, 'r') as zip_ref:\n", - " print('Extracting file ', EMBEDDINGS_ZIP_FILE, '.')\n", - " zip_ref.extractall(\"data/embeddings/\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MBh4AQ8i7C0-", - "outputId": "17a50b7f-f052-4b72-daa8-0e8fc630326f" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Extracting file data/unsplash-research-dataset-lite-1.2.0.zip 
.\n", - "Extracting file data/image-embeddings.json.zip .\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Image embeddings\n", - "We have now 2 options how to proceed.\n", - "1. Import all pregenerated image embeddings (~19k). This is faster option with a lot of images available in a short time.\n", - "2. Import a small subset of randomly choosen images to see the process of generating of image embeddings using external Clip model." - ], - "metadata": { - "id": "p6H7QYctQQA7" - } - }, - { - "cell_type": "code", - "source": [ - "# define helper function\n", - "def gen_rows(df):\n", - " for doc in df.to_dict(orient='records'):\n", - " yield doc" - ], - "metadata": { - "id": "03YvC-_JY9OE" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## 1. Import all pregenerated image embeddings\n", - "This option lets you inport ~19k documents woth pregenenerated image embeddings with metadata.\n", - "\n", - "The process downloads files with images information, merge them and index into Elasticsearch." 
- ], - "metadata": { - "id": "qhZRdUyAQd-s" - } - }, - { - "cell_type": "code", - "source": [ - "df_unsplash = pd.read_csv('data/unsplash/' + 'photos.tsv000', sep='\\t', header=0)\n", - "\n", - "# follwing 8 lines are fix for inconsistent/incorrect data\n", - "df_unsplash['photo_description'].fillna('', inplace=True)\n", - "df_unsplash['ai_description'].fillna('', inplace=True)\n", - "df_unsplash['photographer_first_name'].fillna('', inplace=True)\n", - "df_unsplash['photographer_last_name'].fillna('', inplace=True)\n", - "df_unsplash['photographer_username'].fillna('', inplace=True)\n", - "df_unsplash['exif_camera_make'].fillna('', inplace=True)\n", - "df_unsplash['exif_camera_model'].fillna('', inplace=True)\n", - "df_unsplash['exif_iso'].fillna(0, inplace=True)\n", - "## end of fix\n", - "\n", - "# read subset of columns from the original/downloaded dataset\n", - "df_unsplash_subset = df_unsplash[\n", - " ['photo_id', 'photo_url', 'photo_image_url', 'photo_description', 'ai_description', 'photographer_first_name',\n", - " 'photographer_last_name', 'photographer_username', 'exif_camera_make', 'exif_camera_model', 'exif_iso']]\n", - "\n", - "# read all pregenerated embeddings\n", - "df_embeddings = pd.read_json('data/embeddings/' + 'image-embeddings.json', lines=True)\n", - "\n", - "df_merged = pd.merge(df_unsplash_subset, df_embeddings,\n", - " on='photo_id',\n", - " how='inner')\n", - "\n", - "count = 0\n", - "for success, info in parallel_bulk(\n", - " client=es,\n", - " actions=gen_rows(df_merged),\n", - " thread_count=5,\n", - " chunk_size=1000,\n", - " index=INDEX_NAME\n", - "):\n", - " if success:\n", - " count += 1\n", - " if count % 1000 == 0:\n", - " print('Indexed %s documents' % str(count), flush=True)\n", - " sys.stdout.flush()\n", - " else:\n", - " print('Doc failed', info)\n", - "\n", - "print('Indexed %s image embeddings documents' % str(count), flush=True)\n", - "sys.stdout.flush()" - ], - "metadata": { - "id": "32xrbSUXTODQ" - }, - 
"execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## 2. Import small number of random\n", - "In this part you will randomly choose small number of images and then generate image embeddings. The script will then index documents into Elasticserach." - ], - "metadata": { - "id": "xypgh4lFQmkc" - } - }, - { - "cell_type": "code", - "source": [ - "NUMBER_OF_IMAGES=20\n", - "INDEX_NAME=\"images-test\"\n", - "\n", - "df_unsplash = pd.read_csv('data/unsplash/' + 'photos.tsv000', sep='\\t', header=0)\n", - "\n", - "## stat fix\n", - "# follwing 8 lines are fix for inconsistent/incorrect data\n", - "df_unsplash['photo_description'].fillna('', inplace=True)\n", - "df_unsplash['ai_description'].fillna('', inplace=True)\n", - "df_unsplash['photographer_first_name'].fillna('', inplace=True)\n", - "df_unsplash['photographer_last_name'].fillna('', inplace=True)\n", - "df_unsplash['photographer_username'].fillna('', inplace=True)\n", - "df_unsplash['exif_camera_make'].fillna('', inplace=True)\n", - "df_unsplash['exif_camera_model'].fillna('', inplace=True)\n", - "df_unsplash['exif_iso'].fillna(0, inplace=True)\n", - "## end of fix\n", - "\n", - "df_unsplash_subset = df_unsplash[\n", - " ['photo_id', 'photo_url', 'photo_image_url', 'photo_description', 'ai_description', 'photographer_first_name',\n", - " 'photographer_last_name', 'photographer_username', 'exif_camera_make', 'exif_camera_model', 'exif_iso']]\n", - "\n", - "df_random_subset = df_unsplash_subset.sample(n=NUMBER_OF_IMAGES, replace=False)\n", - "df_random_subset = df_random_subset.reset_index()\n", - "\n", - "# Load model CLIP\n", - "img_model = SentenceTransformer('clip-ViT-B-32')\n", - "\n", - "# new list of image documents for indexing into ES\n", - "lst = []\n", - "if not os.path.exists(\"data/images\"):\n", - " os.mkdir(\"data/images\")\n", - "\n", - "for index, row in df_random_subset.iterrows():\n", - " #open image from url\n", - " img_path = \"data/images/\" + 
row['photo_id']\n", - " try:\n", - " urllib.request.urlretrieve(row['photo_image_url'], img_path)\n", - " print(row['photo_id'] + \" \" + row['photo_url'])\n", - " except urllib.error.HTTPError as err:\n", - " if err.code == 404:\n", - " print('404 error: Image not found at {}'.format(row['photo_image_url']))\n", - " else:\n", - " raise\n", - "\n", - " img = Image.open(img_path)\n", - " # create doc\n", - " doc = {}\n", - " embedding = img_model.encode(img)\n", - " doc['photo_id'] = row['photo_id']\n", - " doc['image_embedding'] = embedding.tolist()\n", - " lst.append(doc)\n", - " # print(doc)\n", - "\n", - " # Image cleanup.\n", - " # If file exists, delete it.\n", - " if os.path.exists(img_path):\n", - " os.remove(img_path)\n", - "\n", - "# read all pregenerated embeddings\n", - "df_embeddings = pd.read_json('data/embeddings/' + 'image-embeddings.json', lines=True)\n", - "\n", - "df_merged = pd.merge(df_random_subset, pd.DataFrame(lst),\n", - " on='photo_id',\n", - " how='inner')\n", - "# print(df_merged)\n", - "\n", - "count = 0\n", - "for success, info in parallel_bulk(\n", - " client=es,\n", - " actions=gen_rows(df_merged),\n", - " thread_count=5,\n", - " chunk_size=10,\n", - " index=INDEX_NAME\n", - "):\n", - " if success:\n", - " count += 1\n", - " if count % 10 == 0:\n", - " print('Indexed %s documents' % str(count), flush=True)\n", - " sys.stdout.flush()\n", - " else:\n", - " print('Doc failed', info)\n", - "\n", - "print('Indexed %s image embeddings documents' % str(count), flush=True)\n", - "sys.stdout.flush()" - ], - "metadata": { - "id": "r_txQjP2RKnr" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Query the image dataset\n", - "The next step is to run a query to search for images. 
The example query searches for `\"model_text\": \"Valentine day flowers\"` using the model we uploaded to Elasticsearch `sentence-transformers__clip-vit-b-32-multilingual-v1`.\n", - "\n", - "The process is one query even it internaly consists of two tasks. One is to tramsform your search text into a vector using the NLP model and the second task is to run the vector search over the image dataset.\n", - "```\n", - "POST images/_search\n", - "{\n", - " \"knn\": {\n", - " \"field\": \"image_embedding\",\n", - " \"k\": 5,\n", - " \"num_candidates\": 10,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__clip-vit-b-32-multilingual-v1\",\n", - " \"model_text\": \"Valentine day flowers\"\n", - " }\n", - " }\n", - " },\n", - " \"fields\": [\n", - " \"photo_description\",\n", - " \"ai_description\",\n", - " \"photo_url\"\n", - " ],\n", - " \"_source\": false\n", - "}\n", - "```\n", - "\n" - ], - "metadata": { - "id": "-_i2CIpSz9vw" - } - }, - { - "cell_type": "code", - "source": [ - "# Search queary\n", - "WHAT_ARE_YOU_LOOKING_FOR=\"Valentine day flowers\"\n", - "INDEX_IM_EMBED=\"images\"\n", - "\n", - "source_fields = [\"photo_description\", \"ai_description\", \"photo_url\", \"photo_image_url\", \"photographer_first_name\",\n", - " \"photographer_username\", \"photographer_last_name\", \"photo_id\"]\n", - "query = {\n", - " \"field\": \"image_embedding\",\n", - " \"k\": 5,\n", - " \"num_candidates\": 100,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__clip-vit-b-32-multilingual-v1\",\n", - " \"model_text\": WHAT_ARE_YOU_LOOKING_FOR\n", - " }\n", - " }\n", - "}\n", - "\n", - "response = es.search(\n", - " index=INDEX_IM_EMBED,\n", - " fields=source_fields,\n", - " knn=query, source=False)\n", - "\n", - "print(response.body)\n", - "\n", - "# the code writes the response into a file for the streamlit UI used in the optional step.\n", - "with 
open('json_data.json', 'w') as outfile:\n", - " json.dump(response.body['hits']['hits'], outfile)\n", - "\n", - "# Use the `loads()` method to load the JSON data\n", - "dfr = json.loads(json.dumps(response.body['hits']['hits']))\n", - "# Pass the generated JSON data into a pandas dataframe\n", - "dfr = pd.DataFrame(dfr)\n", - "# Print the data frame\n", - "dfr\n", - "\n", - "results = pd.json_normalize(json.loads(json.dumps(response.body['hits']['hits'])))\n", - "# results\n", - "results[['_id', '_score', 'fields.photo_id', 'fields.photo_image_url',\n", - " 'fields.photo_description', 'fields.photographer_first_name',\n", - " 'fields.photographer_last_name', 'fields.ai_description',\n", - " 'fields.photo_url']]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 375 - }, - "id": "wdicpvRlzmXG", - "outputId": "00550041-0aed-4f51-ccd3-18eb705ff7ed" - }, - "execution_count": 35, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'took': 114, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 0.646751, 'hits': [{'_index': 'images', '_id': 'nK5Fh4kBLg4Kd5ySLbKC', '_score': 0.646751, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1587404787163-d03a28fcc1f0'], 'photo_id': ['gQFZxLe3m4g'], 'photographer_first_name': ['Vadim'], 'photo_description': ['instagram.com/vadimsadovski'], 'photographer_last_name': ['Sadovski'], 'photo_url': ['https://unsplash.com/photos/gQFZxLe3m4g'], 'photographer_username': ['vadimsadovski'], 'ai_description': ['']}}, {'_index': 'images', '_id': 'Xa5Eh4kBLg4Kd5yS84Qf', '_score': 0.64675057, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1574259148543-dd376a61339f'], 'photo_id': ['g5Mhx29yp-A'], 'photographer_first_name': ['Erin'], 'photo_description': ['Cute but grumpy cat in the Austrian mountains'], 'photographer_last_name': ['East'], 
'photo_url': ['https://unsplash.com/photos/g5Mhx29yp-A'], 'photographer_username': ['mserineast'], 'ai_description': ['brown Persian cat on white bench']}}, {'_index': 'images', '_id': '265Eh4kBLg4Kd5yS84Uf', '_score': 0.64244866, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1565772101068-dec21f7b36eb'], 'photo_id': ['9KZ0PGNCxNE'], 'photographer_first_name': ['Perchek'], 'photo_description': [''], 'photographer_last_name': ['Industrie'], 'photo_url': ['https://unsplash.com/photos/9KZ0PGNCxNE'], 'photographer_username': ['perchek_industrie'], 'ai_description': ['siamese cat']}}, {'_index': 'images', '_id': 'xq5Fh4kBLg4Kd5ySEpuC', '_score': 0.64216036, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1589994205353-325f40210d02'], 'photo_id': ['VOZQkkKXvY4'], 'photographer_first_name': ['Andrey'], 'photo_description': [''], 'photographer_last_name': ['Svistunov'], 'photo_url': ['https://unsplash.com/photos/VOZQkkKXvY4'], 'photographer_username': ['svistal13'], 'ai_description': ['orange tabby cat on ground covered with snow during daytime']}}, {'_index': 'images', '_id': 'WK5Eh4kBLg4Kd5yS5XcD', '_score': 0.64185303, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1556820161-b605d166fce1'], 'photo_id': ['wmz8y6L6c_k'], 'photographer_first_name': ['Phillip'], 'photo_description': [''], 'photographer_last_name': ['Suitcases'], 'photo_url': ['https://unsplash.com/photos/wmz8y6L6c_k'], 'photographer_username': ['nillait'], 'ai_description': ['brown and black kitten close-up photography']}}]}}\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " _id _score fields.photo_id \\\n", - "0 nK5Fh4kBLg4Kd5ySLbKC 0.646751 [gQFZxLe3m4g] \n", - "1 Xa5Eh4kBLg4Kd5yS84Qf 0.646751 [g5Mhx29yp-A] \n", - "2 265Eh4kBLg4Kd5yS84Uf 0.642449 [9KZ0PGNCxNE] \n", - "3 xq5Fh4kBLg4Kd5ySEpuC 0.642160 [VOZQkkKXvY4] \n", - "4 WK5Eh4kBLg4Kd5yS5XcD 0.641853 [wmz8y6L6c_k] \n", - "\n", - " fields.photo_image_url \\\n", - 
"0 [https://images.unsplash.com/photo-15874047871... \n", - "1 [https://images.unsplash.com/photo-15742591485... \n", - "2 [https://images.unsplash.com/photo-15657721010... \n", - "3 [https://images.unsplash.com/photo-15899942053... \n", - "4 [https://images.unsplash.com/photo-1556820161-... \n", - "\n", - " fields.photo_description \\\n", - "0 [instagram.com/vadimsadovski] \n", - "1 [Cute but grumpy cat in the Austrian mountains] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "\n", - " fields.photographer_first_name fields.photographer_last_name \\\n", - "0 [Vadim] [Sadovski] \n", - "1 [Erin] [East] \n", - "2 [Perchek] [Industrie] \n", - "3 [Andrey] [Svistunov] \n", - "4 [Phillip] [Suitcases] \n", - "\n", - " fields.ai_description \\\n", - "0 [] \n", - "1 [brown Persian cat on white bench] \n", - "2 [siamese cat] \n", - "3 [orange tabby cat on ground covered with snow ... \n", - "4 [brown and black kitten close-up photography] \n", - "\n", - " fields.photo_url \n", - "0 [https://unsplash.com/photos/gQFZxLe3m4g] \n", - "1 [https://unsplash.com/photos/g5Mhx29yp-A] \n", - "2 [https://unsplash.com/photos/9KZ0PGNCxNE] \n", - "3 [https://unsplash.com/photos/VOZQkkKXvY4] \n", - "4 [https://unsplash.com/photos/wmz8y6L6c_k] " - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_id_scorefields.photo_idfields.photo_image_urlfields.photo_descriptionfields.photographer_first_namefields.photographer_last_namefields.ai_descriptionfields.photo_url
0nK5Fh4kBLg4Kd5ySLbKC0.646751[gQFZxLe3m4g][https://images.unsplash.com/photo-15874047871...[instagram.com/vadimsadovski][Vadim][Sadovski][][https://unsplash.com/photos/gQFZxLe3m4g]
1Xa5Eh4kBLg4Kd5yS84Qf0.646751[g5Mhx29yp-A][https://images.unsplash.com/photo-15742591485...[Cute but grumpy cat in the Austrian mountains][Erin][East][brown Persian cat on white bench][https://unsplash.com/photos/g5Mhx29yp-A]
2265Eh4kBLg4Kd5yS84Uf0.642449[9KZ0PGNCxNE][https://images.unsplash.com/photo-15657721010...[][Perchek][Industrie][siamese cat][https://unsplash.com/photos/9KZ0PGNCxNE]
3xq5Fh4kBLg4Kd5ySEpuC0.642160[VOZQkkKXvY4][https://images.unsplash.com/photo-15899942053...[][Andrey][Svistunov][orange tabby cat on ground covered with snow ...[https://unsplash.com/photos/VOZQkkKXvY4]
4WK5Eh4kBLg4Kd5yS5XcD0.641853[wmz8y6L6c_k][https://images.unsplash.com/photo-1556820161-...[][Phillip][Suitcases][brown and black kitten close-up photography][https://unsplash.com/photos/wmz8y6L6c_k]
\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 35 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# [Optional] Simple streamlit UI\n", - "In the following section, you will view the response in a simple UI for better visualisation.\n", - "\n", - "The query in the previous step did write down a file response `json_data.json` for the UI to load and visualise.\n", - "\n", - "Follow the steps below to see the results in a table." - ], - "metadata": { - "id": "Ry62sfHFHFi9" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Install tunnel library" - ], - "metadata": { - "id": "iUAbRqr8II-x" - } - }, - { - "cell_type": "code", - "source": [ - "!npm install localtunnel" - ], - "metadata": { - "id": "RGEmAt2DjtN7", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "f6c37d54-7e09-4e59-fc21-8a3db4fa840d" - }, - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[K\u001b[?25h\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35msaveError\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[34;40mnotice\u001b[0m\u001b[35m\u001b[0m created a lockfile as package-lock.json. 
You should commit this file.\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35menoent\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No description\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No repository field.\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No README data\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No license field.\n", - "\u001b[0m\n", - "\u001b[K\u001b[?25h+ localtunnel@2.0.2\n", - "added 22 packages from 22 contributors and audited 22 packages in 5.903s\n", - "\n", - "3 packages are looking for funding\n", - " run `npm fund` for details\n", - "\n", - "found \u001b[92m0\u001b[0m vulnerabilities\n", - "\n", - "\u001b[K\u001b[?25h" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Create application" - ], - "metadata": { - "id": "KUAfucnYITka" - } - }, - { - "cell_type": "code", - "source": [ - "%%writefile app.py\n", - "\n", - "import streamlit as st\n", - "import json\n", - "import pandas as pd\n", - "\n", - "\n", - "def get_image_preview(image_url):\n", - " \"\"\"Returns an HTML tag with preview of the image.\"\"\"\n", - " return f\"\"\"\"\"\"\n", - "\n", - "\n", - "def get_url_link(photo_url):\n", - " \"\"\"Returns an HTML tag to the image page.\"\"\"\n", - " return f\"\"\" {photo_url} \"\"\"\n", - "\n", - "\n", - "def main():\n", - " \"\"\"Creates a Streamlit app with a table of images.\"\"\"\n", - " data = json.load(open(\"json_data.json\"))\n", - " table = []\n", - " for image in data:\n", - " image_url = image[\"fields\"][\"photo_image_url\"][0]\n", - " image_preview = get_image_preview(image_url)\n", - " photo_url = image[\"fields\"][\"photo_url\"][0]\n", - " 
photo_url_link = get_url_link(photo_url)\n", - " table.append([image_preview, image[\"fields\"][\"photo_id\"][0],\n", - " image[\"fields\"][\"photographer_first_name\"][0],\n", - " image[\"fields\"][\"photographer_last_name\"][0],\n", - " image[\"fields\"][\"photographer_username\"][0],\n", - " photo_url_link])\n", - "\n", - " st.write(pd.DataFrame(table, columns=[\"Image\", \"ID\", \"First Name\", \"Last Name\",\n", - " \"Photographer username\", \"Photo url\"]).to_html(escape = False),\n", - " unsafe_allow_html=True)\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()\n", - "\n" - ], - "metadata": { - "id": "9Wb7GOWMXFnF", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "6db23ef3-b25e-4f80-a3cb-6d08c1c78c16" - }, - "execution_count": 36, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Overwriting app.py\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Run app\n", - "Run the application and check your IP for the tunneling" - ], - "metadata": { - "id": "CjDhvbGhHuiz" - } - }, - { - "cell_type": "code", - "source": [ - "!streamlit run app.py &>/content/logs.txt & curl ipv4.icanhazip.com" - ], - "metadata": { - "id": "851CeYi8jvuF", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "46a64023-e990-4900-f482-5558237f08cc" - }, - "execution_count": 37, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "34.138.156.22\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Create the tunnel\n", - "Run the tunnel and use the link below to connect to the tunnel.\n", - "\n", - "Use the IP from the previous step to connect to the application" - ], - "metadata": { - "id": "4OuSLFHyHy5M" - } - }, - { - "cell_type": "code", - "source": [ - "!npx localtunnel --port 8501" - ], - "metadata": { - "id": "inF7ceBmjyE3", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": 
"559ce180-3f0f-4475-c9a9-46dc91389276" - }, - "execution_count": 38, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[K\u001b[?25hnpx: installed 22 in 2.186s\n", - "your url is: https://nine-facts-act.loca.lt\n", - "^C\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Resources\n", - "\n", - "Blog: https://www.elastic.co/blog/implement-image-similarity-search-elastic\n", - "\n", - "GH : https://github.com/radoondas/flask-elastic-image-search\n" - ], - "metadata": { - "id": "SbxbVzvQ7caR" - } - } - ] -} \ No newline at end of file From 9817f9fe0faa230ea9ddb50588ac84e711b09f98 Mon Sep 17 00:00:00 2001 From: Miguel Grinberg Date: Tue, 19 Mar 2024 14:56:32 +0000 Subject: [PATCH 3/3] fixed some typos and simplified the instructions --- bin/find-notebooks-to-test.sh | 1 + notebooks/images/image-similarity.ipynb | 1113 ++++++----------------- 2 files changed, 273 insertions(+), 841 deletions(-) diff --git a/bin/find-notebooks-to-test.sh b/bin/find-notebooks-to-test.sh index 76840866..7e9513ce 100755 --- a/bin/find-notebooks-to-test.sh +++ b/bin/find-notebooks-to-test.sh @@ -4,6 +4,7 @@ EXEMPT_NOTEBOOKS=( "notebooks/esql/esql-getting-started.ipynb" "notebooks/search/07-inference.ipynb" "notebooks/search/08-learning-to-rank.ipynb" + "notebooks/images/image-similarity.ipynb" "notebooks/langchain/langchain-vector-store.ipynb" "notebooks/langchain/self-query-retriever-examples/chatbot-example.ipynb" "notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb" diff --git a/notebooks/images/image-similarity.ipynb b/notebooks/images/image-similarity.ipynb index 46e01eb1..78be723f 100644 --- a/notebooks/images/image-similarity.ipynb +++ b/notebooks/images/image-similarity.ipynb @@ -1,53 +1,41 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": 
"python" - } - }, "cells": [ { "cell_type": "markdown", - "source": [ - "# How to implement Image search using Elasticsearch" - ], "metadata": { "id": "CepGq3Kvtdxi" - } + }, + "source": [ + "# How to implement Image search using Elasticsearch" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "oMu1SW_TQQrU" + }, "source": [ "The workbook shows how to implement an Image search using Elasticsearch. You will index documents with image embeddings (generated or pre-generated) and then using NLP model be able to search using natural language description of the image.\n", "\n", - "### Prerequisities\n", - "Before you start make sure you have Elasticsearch cluster running. The cluster must have at least one machine learning (ML) node with enough (4GB) memory." - ], - "metadata": { - "id": "oMu1SW_TQQrU" - } + "## Prerequisities\n", + "Before we begin, create an elastic cloud deployment and [autoscale](https://www.elastic.co/guide/en/cloud/current/ec-autoscaling.html) to have least one machine learning (ML) node with enough (4GB) memory. Also ensure that the Elasticsearch cluster is running. \n", + "\n", + "If you don't already have an Elastic deployment, you can sign up for a free [Elastic Cloud trial](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook)." + ] }, { "cell_type": "markdown", + "metadata": { + "id": "VFcdr8IDQE_H" + }, "source": [ "### Install Python requirements\n", "Before you start you need to install all required Python dependencies." 
- ], - "metadata": { - "id": "VFcdr8IDQE_H" - } + ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -55,143 +43,18 @@ "id": "6WosfR55npKU", "outputId": "033767ff-0eef-48cc-c9e7-efbf73c9cb67" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", - "Requirement already satisfied: eland in /usr/local/lib/python3.10/dist-packages (8.7.0)\n", - "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.9.0)\n", - "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.31.0)\n", - "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.65.0)\n", - "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (9.4.0)\n", - "Requirement already satisfied: streamlit in /usr/local/lib/python3.10/dist-packages (1.25.0)\n", - "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.15.2+cu118)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.22.4)\n", - "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.10.1)\n", - "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", - "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", - "Requirement already satisfied: 
huggingface-hub>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.4)\n", - "Requirement already satisfied: pandas>=1.5 in /usr/local/lib/python3.10/dist-packages (from eland) (1.5.3)\n", - "Requirement already satisfied: matplotlib>=3.6 in /usr/local/lib/python3.10/dist-packages (from eland) (3.7.1)\n", - "Requirement already satisfied: elastic-transport<9,>=8 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.4.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n", - "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.1)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.7.1)\n", - "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", - "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", - "Requirement already satisfied: cmake in 
/usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", - "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.6)\n", - "Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.2.2)\n", - "Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit) (1.4)\n", - "Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.3.1)\n", - "Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.1.6)\n", - "Requirement already satisfied: importlib-metadata<7,>=1.4 in /usr/lib/python3/dist-packages (from streamlit) (4.6.4)\n", - "Requirement already satisfied: protobuf<5,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.20.3)\n", - "Requirement already satisfied: pyarrow>=6.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.0.0)\n", - "Requirement already satisfied: pympler<2,>=0.9 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.0.1)\n", - "Requirement already satisfied: python-dateutil<3,>=2.7.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.8.2)\n", - "Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (13.4.2)\n", - "Requirement already satisfied: tenacity<9,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.2.2)\n", - "Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.10.2)\n", - "Requirement already satisfied: tzlocal<5,>=1.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.3.1)\n", - "Requirement already satisfied: validators<1,>=0.2 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.20.0)\n", - "Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in 
/usr/local/lib/python3.10/dist-packages (from streamlit) (3.1.32)\n", - "Requirement already satisfied: pydeck<1,>=0.8 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.8.0)\n", - "Requirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.3.1)\n", - "Requirement already satisfied: watchdog>=2.1.5 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.0.0)\n", - "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.4)\n", - "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (4.3.3)\n", - "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.12.0)\n", - "Requirement already satisfied: urllib3<2,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (1.26.16)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2023.7.22)\n", - "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit) (4.0.10)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.6.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.1.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (0.11.0)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (4.41.1)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.4.4)\n", - 
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (3.1.0)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.5->eland) (2022.7.1)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3,>=2.7.3->streamlit) (1.16.0)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (2.14.0)\n", - "Requirement already satisfied: pytz-deprecation-shim in /usr/local/lib/python3.10/dist-packages (from tzlocal<5,>=1.1->streamlit) (0.1.0.post0)\n", - "Requirement already satisfied: decorator>=3.4.0 in /usr/local/lib/python3.10/dist-packages (from validators<1,>=0.2->streamlit) (4.4.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.1)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", - "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit) (5.0.0)\n", - "Requirement already 
satisfied: attrs>=17.4.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (23.1.0)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.19.3)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit) (0.1.2)\n", - "Requirement already satisfied: tzdata in /usr/local/lib/python3.10/dist-packages (from pytz-deprecation-shim->tzlocal<5,>=1.1->streamlit) (2023.3)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install sentence-transformers eland elasticsearch transformers torch tqdm Pillow streamlit" ] }, - { - "cell_type": "markdown", - "source": [ - "### Upload NLP model for querying\n", - "Using an `eland` library you will import NLP CLIP model. The model will tran\n", - "sfer your search query into vector which will be used for the search over the set of images stored in Elasticsearch.\n", - "\n", - "The model used is [clip-ViT-B-32-multilingual-v1](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) because the image embeddings are also generated by the CLIP model.\n", - "\n", - "How to get cloud id? Go to [ESS cloud](https://cloud.elastic.co/logout?redirectTo=%2Fhome&reason=unauthorised) and `On the deployment overview page, copy down the Cloud ID.`\n", - "\n", - "The authentication is using api key (`--es-api-key`). 
Learn how to generate [API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html#create-api-key).\n", - "```\n", - "$ eland_import_hub_model --cloud-id $CLOUD_ID \\\n", - " --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 \\\n", - " --task-type text_embedding --es-api-key $API_KEY --start\n", - "```" - ], - "metadata": { - "id": "eIV5lAnVt9L7" - } - }, { "cell_type": "code", - "source": [ - "API_KEY = \"\"\n", - "CLOUD_ID = \"\"\n", - "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 --task-type text_embedding --es-api-key API_KEY --start" - ], + "execution_count": 3, "metadata": { - "id": "tVhL9jBnuAAQ" + "id": "I0pRCbYMuMVn" }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Import Python libraries" - ], - "metadata": { - "id": "hVxWnFflUCZv" - } - }, - { - "cell_type": "code", + "outputs": [], "source": [ "from elasticsearch import Elasticsearch\n", "from elasticsearch.helpers import parallel_bulk\n", @@ -199,7 +62,6 @@ "import os\n", "import sys\n", "\n", - "# import shutil\n", "import zipfile\n", "from tqdm.auto import tqdm\n", "import pandas as pd\n", @@ -210,88 +72,139 @@ "# import urllib.error\n", "import json\n", "from getpass import getpass" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { - "id": "I0pRCbYMuMVn" + "id": "eIV5lAnVt9L7" }, - "execution_count": 17, - "outputs": [] + "source": [ + "### Upload NLP model for querying\n", + "\n", + "Using the [`eland_import_hub_model`](https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html#ml-nlp-pytorch) script, download and install the [clip-ViT-B-32-multilingual-v1](https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1) model, will transfer your search query into vector which will be used for the search over the set of images stored in Elasticsearch.\n", + "\n", + "To get your cloud id, go to [Elastic 
cloud](https://cloud.elastic.co) and `On the deployment overview page, copy down the Cloud ID.`\n", + "\n", + "To authenticate your request, You could use [API key](https://www.elastic.co/guide/en/kibana/current/api-keys.html#create-api-key). Alternatively, you can use your cloud deployment username and password." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tVhL9jBnuAAQ" + }, + "outputs": [], + "source": [ + "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --hub-model-id sentence-transformers/clip-ViT-B-32-multilingual-v1 --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "Klv3rywdUJBN" + }, "source": [ "### Connect to Elasticsearch cluster\n", "Use your own cluster details `ELASTIC_CLOUD_ID`, `API_KEY`." 
- ], - "metadata": { - "id": "Klv3rywdUJBN" - } + ] }, { "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YwN8RmFY3FQI", + "outputId": "d0d0e31e-2ad2-46fe-ef8c-8c8bce7e1c48" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'name': 'instance-0000000001', 'cluster_name': 'a72482be54904952ba46d53c3def7740', 'cluster_uuid': 'g8BE52TtT32pGBbRzP_oKA', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# ESS Cloud connection definition using an API_KEY\n", - "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", - "API_KEY = getpass(\"Elastic API key: \")\n", - "\n", - "# ELASTIC_CLOUD_USER = \"elastic\"\n", - "# CLOUD_PASSWORD = getpass(\"Elastic Password\")\n", - "\n", "es = Elasticsearch(\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " # basic_auth=(ELASTIC_CLOUD_USER, ELASTIC_CLOUD_PASSWORD),\n", - " api_key=API_KEY,\n", + " api_key=ELASTIC_API_KEY,\n", " request_timeout=600,\n", ")\n", "\n", "es.info() # should return cluster info" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { - "id": "YwN8RmFY3FQI", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "d0d0e31e-2ad2-46fe-ef8c-8c8bce7e1c48" + "id": "IW-GIlH2OxB4" + }, + "source": [ + "### Create Index and mappings for Images\n", + "Befor you can index documents into Elasticsearch, you need to create an Index with correct mappings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "xAkc1OVcOxy3" }, - "execution_count": 19, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elastic Cloud ID: ··········\n", - "Elastic API key: ··········\n" + "Creating index images\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/b0/0h5fbhnd0tz563nl779m3jv80000gn/T/ipykernel_57417/1485784368.py:45: DeprecationWarning: Passing transport options in the API method is deprecated. Use 'Elasticsearch.options()' instead.\n", + " es.indices.create(\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ - "ObjectApiResponse({'name': 'instance-0000000000', 'cluster_name': 'a597bbe1e0d047c494e7d4015f67ef37', 'cluster_uuid': 'EnT0vwwSSZeAahPw3Vhsuw', 'version': {'number': '8.8.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98e1271edf932a480e4262a471281f1ee295ce6b', 'build_date': '2023-06-26T05:16:16.196344851Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" + "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'images'})" ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 19 + "output_type": "execute_result" } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Create Index and mappings for Images\n", - "Befor you can index documents into Elasticsearch, you need to create an Index with correct mappings." 
], - "metadata": { - "id": "IW-GIlH2OxB4" - } - }, - { - "cell_type": "code", "source": [ "# Destination Index name\n", "INDEX_NAME = \"images\"\n", - "# If you want to delete previous version of the Index\n", - "DELETE_INDEX = False\n", + "\n", + "# flag to check if index has to be deleted before creating\n", + "SHOULD_DELETE_INDEX = True\n", "\n", "INDEX_MAPPING = {\n", " \"properties\": {\n", @@ -324,28 +237,23 @@ " }\n", "}\n", "\n", - "if DELETE_INDEX:\n", + "# check if we want to delete index before creating the index\n", + "if SHOULD_DELETE_INDEX:\n", " if es.indices.exists(index=INDEX_NAME):\n", " print(\"Deleting existing %s\" % INDEX_NAME)\n", " es.indices.delete(index=INDEX_NAME, ignore=[400, 404])\n", "\n", - "if not es.indices.exists(index=INDEX_NAME):\n", - " print(\"Creating index %s\" % INDEX_NAME)\n", - " es.indices.create(\n", - " index=INDEX_NAME,\n", - " mappings=INDEX_MAPPING,\n", - " settings=INDEX_SETTINGS,\n", - " ignore=[400, 404],\n", - " )" - ], - "metadata": { - "id": "xAkc1OVcOxy3" - }, - "execution_count": null, - "outputs": [] + "print(\"Creating index %s\" % INDEX_NAME)\n", + "es.indices.create(\n", + " index=INDEX_NAME, mappings=INDEX_MAPPING, settings=INDEX_SETTINGS, ignore=[400, 404]\n", + ")" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "NKE-j0kPUMn_" + }, "source": [ "### Get image dataset and embeddings\n", "Download:\n", @@ -353,17 +261,11 @@ "- The [Image embeddings](https://github.com/radoondas/flask-elastic-nlp/blob/main/embeddings/blogs/blogs-no-embeddings.json.zip) are pre-generated using CLIP model\n", "\n", "Then unzip both files." 
- ], - "metadata": { - "id": "NKE-j0kPUMn_" - } + ] }, { "cell_type": "code", - "source": [ - "!wget https://unsplash.com/data/lite/1.2.0 -O data/unsplash-research-dataset-lite-1.2.0.zip\n", - "!wget https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip -P data" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -371,49 +273,27 @@ "id": "zFGaPDRR5mqT", "outputId": "0114cdd6-a714-41ab-9b46-3013bd36698a" }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2023-07-25 14:28:32-- https://unsplash.com/data/lite/1.2.0\n", - "Resolving unsplash.com (unsplash.com)... 151.101.65.181, 151.101.1.181, 151.101.129.181, ...\n", - "Connecting to unsplash.com (unsplash.com)|151.101.65.181|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip [following]\n", - "--2023-07-25 14:28:32-- https://unsplash-datasets.s3.amazonaws.com/lite/1.2.0/unsplash-research-dataset-lite-1.2.0.zip\n", - "Resolving unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)... 52.217.102.84, 3.5.25.253, 52.217.96.188, ...\n", - "Connecting to unsplash-datasets.s3.amazonaws.com (unsplash-datasets.s3.amazonaws.com)|52.217.102.84|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 632351052 (603M) [application/zip]\n", - "Saving to: ‘data/unsplash-research-dataset-lite-1.2.0.zip’\n", - "\n", - "data/unsplash-resea 100%[===================>] 603.06M 14.1MB/s in 42s \n", - "\n", - "2023-07-25 14:29:16 (14.2 MB/s) - ‘data/unsplash-research-dataset-lite-1.2.0.zip’ saved [632351052/632351052]\n", - "\n", - "--2023-07-25 14:29:16-- https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 94256217 (90M) [application/zip]\n", - "Saving to: ‘data/image-embeddings.json.zip.1’\n", - "\n", - "image-embeddings.js 100%[===================>] 89.89M 164MB/s in 0.5s \n", - "\n", - "2023-07-25 14:29:16 (164 MB/s) - ‘data/image-embeddings.json.zip.1’ saved [94256217/94256217]\n", - "\n" - ] - } + "outputs": [], + "source": [ + "!curl -L https://unsplash.com/data/lite/1.2.0 -o unsplash-research-dataset-lite-1.2.0.zip\n", + "!curl -L https://raw.githubusercontent.com/radoondas/flask-elastic-nlp/main/embeddings/images/image-embeddings.json.zip -o image-embeddings.json.zip" ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MBh4AQ8i7C0-", + "outputId": "17a50b7f-f052-4b72-daa8-0e8fc630326f" + }, + "outputs": [], "source": [ "# Unzip downloaded files\n", - "UNSPLASH_ZIP_FILE = \"data/unsplash-research-dataset-lite-1.2.0.zip\"\n", - "EMBEDDINGS_ZIP_FILE = \"data/image-embeddings.json.zip\"\n", + "UNSPLASH_ZIP_FILE = \"unsplash-research-dataset-lite-1.2.0.zip\"\n", + "EMBEDDINGS_ZIP_FILE = \"image-embeddings.json.zip\"\n", "\n", "with zipfile.ZipFile(UNSPLASH_ZIP_FILE, \"r\") as zip_ref:\n", " 
print(\"Extracting file \", UNSPLASH_ZIP_FILE, \".\")\n", @@ -422,66 +302,54 @@ "with zipfile.ZipFile(EMBEDDINGS_ZIP_FILE, \"r\") as zip_ref:\n", " print(\"Extracting file \", EMBEDDINGS_ZIP_FILE, \".\")\n", " zip_ref.extractall(\"data/embeddings/\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MBh4AQ8i7C0-", - "outputId": "17a50b7f-f052-4b72-daa8-0e8fc630326f" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Extracting file data/unsplash-research-dataset-lite-1.2.0.zip .\n", - "Extracting file data/image-embeddings.json.zip .\n" - ] - } ] }, { "cell_type": "markdown", - "source": [ - "# Image embeddings\n", - "We have now 2 options how to proceed.\n", - "1. Import all pregenerated image embeddings (~19k). This is faster option with a lot of images available in a short time.\n", - "2. Import a small subset of randomly choosen images to see the process of generating of image embeddings using external Clip model." - ], "metadata": { - "id": "p6H7QYctQQA7" - } - }, - { - "cell_type": "code", - "source": [ - "# define helper function\n", - "def gen_rows(df):\n", - " for doc in df.to_dict(orient=\"records\"):\n", - " yield doc" - ], - "metadata": { - "id": "03YvC-_JY9OE" + "id": "qhZRdUyAQd-s" }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", "source": [ - "## 1. Import all pregenerated image embeddings\n", - "This option lets you inport ~19k documents woth pregenenerated image embeddings with metadata.\n", + "# Import all pregenerated image embeddings\n", + "In this section you will import ~19k documents worth of pregenenerated image embeddings with metadata.\n", "\n", "The process downloads files with images information, merge them and index into Elasticsearch." 
- ], - "metadata": { - "id": "qhZRdUyAQd-s" - } + ] }, { "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "32xrbSUXTODQ" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexed 1000 documents\n", + "Indexed 2000 documents\n", + "Indexed 3000 documents\n", + "Indexed 4000 documents\n", + "Indexed 5000 documents\n", + "Indexed 6000 documents\n", + "Indexed 7000 documents\n", + "Indexed 8000 documents\n", + "Indexed 9000 documents\n", + "Indexed 10000 documents\n", + "Indexed 11000 documents\n", + "Indexed 12000 documents\n", + "Indexed 13000 documents\n", + "Indexed 14000 documents\n", + "Indexed 15000 documents\n", + "Indexed 16000 documents\n", + "Indexed 17000 documents\n", + "Indexed 18000 documents\n", + "Indexed 19000 documents\n", + "Indexed 19833 image embeddings documents\n" + ] + } + ], "source": [ "df_unsplash = pd.read_csv(\"data/unsplash/\" + \"photos.tsv000\", sep=\"\\t\", header=0)\n", "\n", @@ -536,134 +404,19 @@ "\n", "print(\"Indexed %s image embeddings documents\" % str(count), flush=True)\n", "sys.stdout.flush()" - ], - "metadata": { - "id": "32xrbSUXTODQ" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## 2. Import small number of random\n", - "In this part you will randomly choose small number of images and then generate image embeddings. The script will then index documents into Elasticserach." 
- ], - "metadata": { - "id": "xypgh4lFQmkc" - } - }, - { - "cell_type": "code", - "source": [ - "NUMBER_OF_IMAGES = 20\n", - "INDEX_NAME = \"images-test\"\n", - "\n", - "df_unsplash = pd.read_csv(\"data/unsplash/\" + \"photos.tsv000\", sep=\"\\t\", header=0)\n", - "\n", - "## stat fix\n", - "# follwing 8 lines are fix for inconsistent/incorrect data\n", - "df_unsplash[\"photo_description\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"ai_description\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"photographer_first_name\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"photographer_last_name\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"photographer_username\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"exif_camera_make\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"exif_camera_model\"].fillna(\"\", inplace=True)\n", - "df_unsplash[\"exif_iso\"].fillna(0, inplace=True)\n", - "## end of fix\n", - "\n", - "df_unsplash_subset = df_unsplash[\n", - " [\n", - " \"photo_id\",\n", - " \"photo_url\",\n", - " \"photo_image_url\",\n", - " \"photo_description\",\n", - " \"ai_description\",\n", - " \"photographer_first_name\",\n", - " \"photographer_last_name\",\n", - " \"photographer_username\",\n", - " \"exif_camera_make\",\n", - " \"exif_camera_model\",\n", - " \"exif_iso\",\n", - " ]\n", - "]\n", - "\n", - "df_random_subset = df_unsplash_subset.sample(n=NUMBER_OF_IMAGES, replace=False)\n", - "df_random_subset = df_random_subset.reset_index()\n", - "\n", - "# Load model CLIP\n", - "img_model = SentenceTransformer(\"clip-ViT-B-32\")\n", - "\n", - "# new list of image documents for indexing into ES\n", - "lst = []\n", - "if not os.path.exists(\"data/images\"):\n", - " os.mkdir(\"data/images\")\n", - "\n", - "for index, row in df_random_subset.iterrows():\n", - " # open image from url\n", - " img_path = \"data/images/\" + row[\"photo_id\"]\n", - " try:\n", - " urllib.request.urlretrieve(row[\"photo_image_url\"], img_path)\n", - " print(row[\"photo_id\"] + 
\" \" + row[\"photo_url\"])\n", - " except urllib.error.HTTPError as err:\n", - " if err.code == 404:\n", - " print(\"404 error: Image not found at {}\".format(row[\"photo_image_url\"]))\n", - " else:\n", - " raise\n", - "\n", - " img = Image.open(img_path)\n", - " # create doc\n", - " doc = {}\n", - " embedding = img_model.encode(img)\n", - " doc[\"photo_id\"] = row[\"photo_id\"]\n", - " doc[\"image_embedding\"] = embedding.tolist()\n", - " lst.append(doc)\n", - " # print(doc)\n", - "\n", - " # Image cleanup.\n", - " # If file exists, delete it.\n", - " if os.path.exists(img_path):\n", - " os.remove(img_path)\n", - "\n", - "# read all pregenerated embeddings\n", - "df_embeddings = pd.read_json(\"data/embeddings/\" + \"image-embeddings.json\", lines=True)\n", - "\n", - "df_merged = pd.merge(df_random_subset, pd.DataFrame(lst), on=\"photo_id\", how=\"inner\")\n", - "# print(df_merged)\n", - "\n", - "count = 0\n", - "for success, info in parallel_bulk(\n", - " client=es,\n", - " actions=gen_rows(df_merged),\n", - " thread_count=5,\n", - " chunk_size=10,\n", - " index=INDEX_NAME,\n", - "):\n", - " if success:\n", - " count += 1\n", - " if count % 10 == 0:\n", - " print(\"Indexed %s documents\" % str(count), flush=True)\n", - " sys.stdout.flush()\n", - " else:\n", - " print(\"Doc failed\", info)\n", - "\n", - "print(\"Indexed %s image embeddings documents\" % str(count), flush=True)\n", - "sys.stdout.flush()" - ], "metadata": { - "id": "r_txQjP2RKnr" + "id": "-_i2CIpSz9vw" }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", "source": [ - "### Query the image dataset\n", - "The next step is to run a query to search for images. The example query searches for `\"model_text\": \"Valentine day flowers\"` using the model we uploaded to Elasticsearch `sentence-transformers__clip-vit-b-32-multilingual-v1`.\n", + "# Query the image dataset\n", + "The next step is to run a query to search for images. 
The example query searches for `\"model_text\": \"Valentine day flowers\"` using the model `sentence-transformers__clip-vit-b-32-multilingual-v1` that we uploaded to Elasticsearch earlier.\n", + "\n", + "The process is carried out with a single query, even though internally it consists of two tasks. One is to transform your search text into a vector using the NLP model and the second task is to run the vector search over the image dataset.\n", + "\n", - "The process is one query even it internaly consists of two tasks. One is to tramsform your search text into a vector using the NLP model and the second task is to run the vector search over the image dataset.\n", "```\n", "POST images/_search\n", "{\n", @@ -687,17 +440,23 @@ "}\n", "```\n", "\n" - ], - "metadata": { - "id": "-_i2CIpSz9vw" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 375 + }, + "id": "wdicpvRlzmXG", + "outputId": "00550041-0aed-4f51-ccd3-18eb705ff7ed" + }, + "outputs": [], "source": [ "# Search queary\n", "WHAT_ARE_YOU_LOOKING_FOR = \"Valentine day flowers\"\n", - "INDEX_IM_EMBED = \"images\"\n", "\n", "source_fields = [\n", "    \"photo_description\",\n", @@ -721,9 +480,7 @@ "    },\n", "}\n", "\n", - "response = es.search(\n", - "    index=INDEX_IM_EMBED, fields=source_fields, knn=query, source=False\n", - ")\n", + "response = es.search(index=INDEX_NAME, fields=source_fields, knn=query, source=False)\n", "\n", "print(response.body)\n", "\n", @@ -753,319 +510,13 @@ "    \"fields.photo_url\",\n", "    ]\n", "]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 375 - }, - "id": "wdicpvRlzmXG", - "outputId": "00550041-0aed-4f51-ccd3-18eb705ff7ed" - }, - "execution_count": 35, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'took': 114, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 
'relation': 'eq'}, 'max_score': 0.646751, 'hits': [{'_index': 'images', '_id': 'nK5Fh4kBLg4Kd5ySLbKC', '_score': 0.646751, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1587404787163-d03a28fcc1f0'], 'photo_id': ['gQFZxLe3m4g'], 'photographer_first_name': ['Vadim'], 'photo_description': ['instagram.com/vadimsadovski'], 'photographer_last_name': ['Sadovski'], 'photo_url': ['https://unsplash.com/photos/gQFZxLe3m4g'], 'photographer_username': ['vadimsadovski'], 'ai_description': ['']}}, {'_index': 'images', '_id': 'Xa5Eh4kBLg4Kd5yS84Qf', '_score': 0.64675057, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1574259148543-dd376a61339f'], 'photo_id': ['g5Mhx29yp-A'], 'photographer_first_name': ['Erin'], 'photo_description': ['Cute but grumpy cat in the Austrian mountains'], 'photographer_last_name': ['East'], 'photo_url': ['https://unsplash.com/photos/g5Mhx29yp-A'], 'photographer_username': ['mserineast'], 'ai_description': ['brown Persian cat on white bench']}}, {'_index': 'images', '_id': '265Eh4kBLg4Kd5yS84Uf', '_score': 0.64244866, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1565772101068-dec21f7b36eb'], 'photo_id': ['9KZ0PGNCxNE'], 'photographer_first_name': ['Perchek'], 'photo_description': [''], 'photographer_last_name': ['Industrie'], 'photo_url': ['https://unsplash.com/photos/9KZ0PGNCxNE'], 'photographer_username': ['perchek_industrie'], 'ai_description': ['siamese cat']}}, {'_index': 'images', '_id': 'xq5Fh4kBLg4Kd5ySEpuC', '_score': 0.64216036, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1589994205353-325f40210d02'], 'photo_id': ['VOZQkkKXvY4'], 'photographer_first_name': ['Andrey'], 'photo_description': [''], 'photographer_last_name': ['Svistunov'], 'photo_url': ['https://unsplash.com/photos/VOZQkkKXvY4'], 'photographer_username': ['svistal13'], 'ai_description': ['orange tabby cat on ground covered with snow during daytime']}}, {'_index': 'images', '_id': 'WK5Eh4kBLg4Kd5yS5XcD', 
'_score': 0.64185303, 'fields': {'photo_image_url': ['https://images.unsplash.com/photo-1556820161-b605d166fce1'], 'photo_id': ['wmz8y6L6c_k'], 'photographer_first_name': ['Phillip'], 'photo_description': [''], 'photographer_last_name': ['Suitcases'], 'photo_url': ['https://unsplash.com/photos/wmz8y6L6c_k'], 'photographer_username': ['nillait'], 'ai_description': ['brown and black kitten close-up photography']}}]}}\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " _id _score fields.photo_id \\\n", - "0 nK5Fh4kBLg4Kd5ySLbKC 0.646751 [gQFZxLe3m4g] \n", - "1 Xa5Eh4kBLg4Kd5yS84Qf 0.646751 [g5Mhx29yp-A] \n", - "2 265Eh4kBLg4Kd5yS84Uf 0.642449 [9KZ0PGNCxNE] \n", - "3 xq5Fh4kBLg4Kd5ySEpuC 0.642160 [VOZQkkKXvY4] \n", - "4 WK5Eh4kBLg4Kd5yS5XcD 0.641853 [wmz8y6L6c_k] \n", - "\n", - " fields.photo_image_url \\\n", - "0 [https://images.unsplash.com/photo-15874047871... \n", - "1 [https://images.unsplash.com/photo-15742591485... \n", - "2 [https://images.unsplash.com/photo-15657721010... \n", - "3 [https://images.unsplash.com/photo-15899942053... \n", - "4 [https://images.unsplash.com/photo-1556820161-... \n", - "\n", - " fields.photo_description \\\n", - "0 [instagram.com/vadimsadovski] \n", - "1 [Cute but grumpy cat in the Austrian mountains] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "\n", - " fields.photographer_first_name fields.photographer_last_name \\\n", - "0 [Vadim] [Sadovski] \n", - "1 [Erin] [East] \n", - "2 [Perchek] [Industrie] \n", - "3 [Andrey] [Svistunov] \n", - "4 [Phillip] [Suitcases] \n", - "\n", - " fields.ai_description \\\n", - "0 [] \n", - "1 [brown Persian cat on white bench] \n", - "2 [siamese cat] \n", - "3 [orange tabby cat on ground covered with snow ... 
\n", - "4 [brown and black kitten close-up photography] \n", - "\n", - " fields.photo_url \n", - "0 [https://unsplash.com/photos/gQFZxLe3m4g] \n", - "1 [https://unsplash.com/photos/g5Mhx29yp-A] \n", - "2 [https://unsplash.com/photos/9KZ0PGNCxNE] \n", - "3 [https://unsplash.com/photos/VOZQkkKXvY4] \n", - "4 [https://unsplash.com/photos/wmz8y6L6c_k] " - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_id_scorefields.photo_idfields.photo_image_urlfields.photo_descriptionfields.photographer_first_namefields.photographer_last_namefields.ai_descriptionfields.photo_url
0nK5Fh4kBLg4Kd5ySLbKC0.646751[gQFZxLe3m4g][https://images.unsplash.com/photo-15874047871...[instagram.com/vadimsadovski][Vadim][Sadovski][][https://unsplash.com/photos/gQFZxLe3m4g]
1Xa5Eh4kBLg4Kd5yS84Qf0.646751[g5Mhx29yp-A][https://images.unsplash.com/photo-15742591485...[Cute but grumpy cat in the Austrian mountains][Erin][East][brown Persian cat on white bench][https://unsplash.com/photos/g5Mhx29yp-A]
2265Eh4kBLg4Kd5yS84Uf0.642449[9KZ0PGNCxNE][https://images.unsplash.com/photo-15657721010...[][Perchek][Industrie][siamese cat][https://unsplash.com/photos/9KZ0PGNCxNE]
3xq5Fh4kBLg4Kd5ySEpuC0.642160[VOZQkkKXvY4][https://images.unsplash.com/photo-15899942053...[][Andrey][Svistunov][orange tabby cat on ground covered with snow ...[https://unsplash.com/photos/VOZQkkKXvY4]
4WK5Eh4kBLg4Kd5yS5XcD0.641853[wmz8y6L6c_k][https://images.unsplash.com/photo-1556820161-...[][Phillip][Suitcases][brown and black kitten close-up photography][https://unsplash.com/photos/wmz8y6L6c_k]
\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 35 - } ] }, { "cell_type": "markdown", + "metadata": { + "id": "Ry62sfHFHFi9" + }, "source": [ "# [Optional] Simple streamlit UI\n", "In the following section, you will view the response in a simple UI for better visualisation.\n", @@ -1073,70 +524,52 @@ "The query in the previous step did write down a file response `json_data.json` for the UI to load and visualise.\n", "\n", "Follow the steps below to see the results in a table." - ], - "metadata": { - "id": "Ry62sfHFHFi9" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Install tunnel library" - ], "metadata": { "id": "iUAbRqr8II-x" - } + }, + "source": [ + "### Install tunnel library" + ] }, { "cell_type": "code", - "source": [ - "!npm install localtunnel" - ], + "execution_count": null, "metadata": { - "id": "RGEmAt2DjtN7", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "RGEmAt2DjtN7", "outputId": "f6c37d54-7e09-4e59-fc21-8a3db4fa840d" }, - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[K\u001b[?25h\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35msaveError\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[34;40mnotice\u001b[0m\u001b[35m\u001b[0m created a lockfile as package-lock.json. 
You should commit this file.\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35menoent\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No description\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No repository field.\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No README data\n", - "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No license field.\n", - "\u001b[0m\n", - "\u001b[K\u001b[?25h+ localtunnel@2.0.2\n", - "added 22 packages from 22 contributors and audited 22 packages in 5.903s\n", - "\n", - "3 packages are looking for funding\n", - " run `npm fund` for details\n", - "\n", - "found \u001b[92m0\u001b[0m vulnerabilities\n", - "\n", - "\u001b[K\u001b[?25h" - ] - } + "outputs": [], + "source": [ + "!npm install localtunnel" ] }, { "cell_type": "markdown", - "source": [ - "### Create application" - ], "metadata": { "id": "KUAfucnYITka" - } + }, + "source": [ + "### Create application" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9Wb7GOWMXFnF", + "outputId": "6db23ef3-b25e-4f80-a3cb-6d08c1c78c16" + }, + "outputs": [], "source": [ "%%writefile app.py\n", "\n", @@ -1176,109 +609,107 @@ "\n", "\n", "if __name__ == \"__main__\":\n", - " main()\n", - "\n" - ], - "metadata": { - "id": "9Wb7GOWMXFnF", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "6db23ef3-b25e-4f80-a3cb-6d08c1c78c16" - }, - "execution_count": 36, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Overwriting app.py\n" - ] - } + " main()" ] }, { "cell_type": "markdown", + "metadata": { + "id": "CjDhvbGhHuiz" + }, 
"source": [ "### Run app\n", "Run the application and check your IP for the tunneling" - ], - "metadata": { - "id": "CjDhvbGhHuiz" - } + ] }, { "cell_type": "code", - "source": [ - "!streamlit run app.py &>/content/logs.txt & curl ipv4.icanhazip.com" - ], + "execution_count": null, "metadata": { - "id": "851CeYi8jvuF", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "851CeYi8jvuF", "outputId": "46a64023-e990-4900-f482-5558237f08cc" }, - "execution_count": 37, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "34.138.156.22\n" - ] - } + "outputs": [], + "source": [ + "!streamlit run app.py &>/content/logs.txt & curl ipv4.icanhazip.com" ] }, { "cell_type": "markdown", + "metadata": { + "id": "4OuSLFHyHy5M" + }, "source": [ "### Create the tunnel\n", "Run the tunnel and use the link below to connect to the tunnel.\n", "\n", "Use the IP from the previous step to connect to the application" - ], - "metadata": { - "id": "4OuSLFHyHy5M" - } + ] }, { "cell_type": "code", - "source": [ - "!npx localtunnel --port 8501" - ], + "execution_count": 38, "metadata": { - "id": "inF7ceBmjyE3", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "inF7ceBmjyE3", "outputId": "559ce180-3f0f-4475-c9a9-46dc91389276" }, - "execution_count": 38, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\u001b[K\u001b[?25hnpx: installed 22 in 2.186s\n", "your url is: https://nine-facts-act.loca.lt\n", "^C\n" ] } + ], + "source": [ + "!npx localtunnel --port 8501" ] }, { "cell_type": "markdown", + "metadata": { + "id": "SbxbVzvQ7caR" + }, "source": [ "# Resources\n", "\n", "Blog: https://www.elastic.co/blog/implement-image-similarity-search-elastic\n", "\n", "GH : https://github.com/radoondas/flask-elastic-image-search\n" - ], - "metadata": { - "id": "SbxbVzvQ7caR" - } + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 4 +}