feat: add github action to build docker for release (Cinnamon#168)
Co-Authored-By: ChengZi <chen.zhang@zilliz.com>
Co-Authored-By: kan_cin <kan@cinnamon.is>
Co-Authored-By: Tuan Anh Nguyen Dang (Tadashi_Cin) <tadashi@cinnamon.is>
3 people authored and jaredpek committed Sep 8, 2024
1 parent 772186b commit 8a40c76
Showing 15 changed files with 400 additions and 26 deletions.
68 changes: 68 additions & 0 deletions .github/workflows/build-push-docker.yaml
@@ -0,0 +1,68 @@
name: Build and Push Docker Image

on:
release:
types:
- created

push:
tags:
- "v[0-9]+.[0-9]+.[0-9]+"

workflow_dispatch:

env:
REGISTRY: ghcr.io

jobs:
build:
name: Build and push container
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
attestations: write
id-token: write
strategy:
matrix:
platform:
- linux/amd64
steps:
- name: Set repository and image name
run: |
echo "FULL_IMAGE_NAME=${{ env.REGISTRY }}/${IMAGE_NAME,,}" >>${GITHUB_ENV}
env:
IMAGE_NAME: "${{ github.repository }}"

- name: Checkout
uses: actions/checkout@v4

- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v2

- name: Set up Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FULL_IMAGE_NAME }}

- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build docker image
uses: docker/build-push-action@v4
with:
file: Dockerfile
context: .
push: true
platforms: ${{ matrix.platform }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
load: true
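The `Set repository and image name` step leans on a bash-only detail: GHCR rejects image names containing uppercase characters, and `${IMAGE_NAME,,}` is the bash parameter expansion that lowercases the `owner/repo` slug before it is written to `$GITHUB_ENV`. A sketch of the same computation in Python (repository value illustrative):

```python
# GHCR image names must be lowercase; the workflow lowercases the
# "owner/repo" slug with ${IMAGE_NAME,,} before exporting FULL_IMAGE_NAME.
REGISTRY = "ghcr.io"

def full_image_name(repository: str) -> str:
    """Mimic ${{ env.REGISTRY }}/${IMAGE_NAME,,} from the workflow."""
    return f"{REGISTRY}/{repository.lower()}"

print(full_image_name("Cinnamon/kotaemon"))  # ghcr.io/cinnamon/kotaemon
```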
11 changes: 9 additions & 2 deletions Dockerfile
@@ -17,6 +17,8 @@ RUN apt-get update -qqy && \
g++ \
poppler-utils \
libpoppler-dev \
unzip \
curl \
&& apt-get clean \
&& apt-get autoremove \
&& rm -rf /var/lib/apt/lists/*
@@ -27,13 +29,18 @@ ENV PYTHONIOENCODING=UTF-8

WORKDIR /app


FROM base_image as dev

COPY scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh
RUN chmod +x /app/scripts/download_pdfjs.sh

ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist"
RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR

COPY . /app
RUN --mount=type=ssh pip install --no-cache-dir -e "libs/kotaemon[all]" \
&& pip install --no-cache-dir -e "libs/ktem" \
&& pip install --no-cache-dir graphrag future \
&& pip install --no-cache-dir "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"

ENTRYPOINT ["gradio", "app.py"]
CMD ["python", "app.py"]
10 changes: 6 additions & 4 deletions README.md
@@ -14,8 +14,8 @@ developers in mind.

[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-31013/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
<a href="https://hub.docker.com/r/taprosoft/kotaemon" target="_blank">
<img src="https://img.shields.io/badge/docker_pull-kotaemon:v1.0-brightgreen" alt="docker pull taprosoft/kotaemon:v1.0"></a>
<a href="https://github.com/Cinnamon/kotaemon" target="_blank">
<img src="https://img.shields.io/badge/docker_pull-kotaemon:latest-brightgreen" alt="docker pull ghcr.io/cinnamon/kotaemon:latest"></a>
[![built with Codeium](https://codeium.com/badges/main)](https://codeium.com)
<a href='https://huggingface.co/spaces/cin-model/kotaemon-demo'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>

@@ -32,6 +32,7 @@ documents and developers who want to build their own RAG pipeline.
- For developers:
- A framework for building your own RAG-based document QA pipeline.
- Customize and see your RAG pipeline in action with the provided UI (built with <a href='https://github.com/gradio-app/gradio'>Gradio <img src='https://img.shields.io/github/stars/gradio-app/gradio'></a>).
- If you use Gradio for development, check out our theme here: [kotaemon-gradio-theme](https://github.com/lone17/kotaemon-gradio-theme).

```yml
+----------------------------------------------------------------------------+
@@ -76,7 +77,8 @@ appreciated.
### For end users

This document is intended for developers. If you just want to install and use the app as
it is, please follow the non-technical [User Guide](https://cinnamon.github.io/kotaemon/) (WIP).
it is, please follow the non-technical [User Guide](https://cinnamon.github.io/kotaemon/).
Use the most recent release .zip to get the latest features and bug fixes.

### For developers

@@ -89,7 +91,7 @@ docker run \
-e GRADIO_SERVER_NAME=0.0.0.0 \
-e GRADIO_SERVER_PORT=7860 \
-p 7860:7860 -it --rm \
taprosoft/kotaemon:v1.0
ghcr.io/cinnamon/kotaemon:latest
```

Navigate to `http://localhost:7860/` to access the web UI.
46 changes: 45 additions & 1 deletion flowsettings.py
@@ -167,14 +167,58 @@
"default": False,
}

KH_EMBEDDINGS["local-bge-en"] = {
KH_EMBEDDINGS["fast_embed"] = {
"spec": {
"__type__": "kotaemon.embeddings.FastEmbedEmbeddings",
"model_name": "BAAI/bge-base-en-v1.5",
},
"default": False,
}

# additional LLM configurations
KH_LLMS["claude"] = {
"spec": {
"__type__": "kotaemon.llms.chats.LCAnthropicChat",
"model_name": "claude-3-5-sonnet-20240620",
"api_key": "your-key",
},
"default": False,
}
# KH_LLMS["gemini"] = {
# "spec": {
# "__type__": "kotaemon.llms.chats.LCGeminiChat",
# "model_name": "gemini-1.5-pro",
# "api_key": "your-key",
# },
# "default": False,
# }
KH_LLMS["groq"] = {
"spec": {
"__type__": "kotaemon.llms.ChatOpenAI",
"base_url": "https://api.groq.com/openai/v1",
"model": "llama-3.1-8b-instant",
"api_key": "your-key",
},
"default": False,
}

# additional embeddings configurations
KH_EMBEDDINGS["cohere"] = {
"spec": {
"__type__": "kotaemon.embeddings.LCCohereEmbeddings",
"model": "embed-multilingual-v2.0",
"cohere_api_key": "your-key",
},
"default": False,
}
# KH_EMBEDDINGS["huggingface"] = {
# "spec": {
# "__type__": "kotaemon.embeddings.LCHuggingFaceEmbeddings",
# "model_name": "sentence-transformers/all-mpnet-base-v2",
# },
# "default": False,
# }
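These `KH_LLMS` / `KH_EMBEDDINGS` entries are plain dicts whose `__type__` key names the class to instantiate; the actual construction is handled by theflow's `deserialize`. A minimal sketch of the idea (not the real deserializer, using a stdlib class as a hypothetical stand-in for a kotaemon model):

```python
# Resolve the dotted path in "__type__" to a class, then pass the
# remaining keys of the spec as constructor kwargs.
import importlib

def build_from_spec(spec: dict):
    spec = dict(spec)  # don't mutate the caller's dict
    module_path, _, class_name = spec.pop("__type__").rpartition(".")
    cls = getattr(importlib.import_module(module_path), class_name)
    return cls(**spec)

# Hypothetical example with a stdlib class instead of a kotaemon model:
frac = build_from_spec(
    {"__type__": "fractions.Fraction", "numerator": 1, "denominator": 3}
)
```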

KH_REASONINGS = [
"ktem.reasoning.simple.FullQAPipeline",
"ktem.reasoning.simple.FullDecomposeQAPipeline",
2 changes: 2 additions & 0 deletions libs/kotaemon/kotaemon/llms/__init__.py
@@ -10,6 +10,7 @@
LCAnthropicChat,
LCAzureChatOpenAI,
LCChatOpenAI,
LCGeminiChat,
LlamaCppChat,
)
from .completions import LLM, AzureOpenAI, LlamaCpp, OpenAI
@@ -29,6 +30,7 @@
"AzureChatOpenAI",
"ChatOpenAI",
"LCAnthropicChat",
"LCGeminiChat",
"LCAzureChatOpenAI",
"LCChatOpenAI",
"LlamaCppChat",
2 changes: 2 additions & 0 deletions libs/kotaemon/kotaemon/llms/chats/__init__.py
@@ -5,6 +5,7 @@
LCAzureChatOpenAI,
LCChatMixin,
LCChatOpenAI,
LCGeminiChat,
)
from .llamacpp import LlamaCppChat
from .openai import AzureChatOpenAI, ChatOpenAI
@@ -16,6 +17,7 @@
"EndpointChatLLM",
"ChatOpenAI",
"LCAnthropicChat",
"LCGeminiChat",
"LCChatOpenAI",
"LCAzureChatOpenAI",
"LCChatMixin",
24 changes: 24 additions & 0 deletions libs/kotaemon/kotaemon/llms/chats/langchain_based.py
@@ -245,3 +245,27 @@ def _get_lc_class(self):
raise ImportError("Please install langchain-anthropic")

return ChatAnthropic


class LCGeminiChat(LCChatMixin, ChatLLM): # type: ignore
def __init__(
self,
api_key: str | None = None,
model_name: str | None = None,
temperature: float = 0.7,
**params,
):
super().__init__(
google_api_key=api_key,
model=model_name,
temperature=temperature,
**params,
)

def _get_lc_class(self):
try:
from langchain_google_genai import ChatGoogleGenerativeAI
except ImportError:
raise ImportError("Please install langchain-google-genai")

return ChatGoogleGenerativeAI
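`LCGeminiChat._get_lc_class` follows the same pattern as the other LC wrappers: the langchain backend is imported lazily, so kotaemon installs cleanly without every provider package. A generic sketch of that guard (the `optional_import` helper is illustrative, not part of kotaemon):

```python
# Defer importing an optional dependency until first use, and turn a
# missing package into an actionable install hint.
import importlib

def optional_import(module: str, attr: str, hint: str):
    """Import `attr` from `module`, raising a helpful error if absent."""
    try:
        mod = importlib.import_module(module)
    except ImportError:
        raise ImportError(f"Please install {hint}")
    return getattr(mod, attr)

# Works for anything importable; here a stdlib stand-in:
OrderedDict = optional_import("collections", "OrderedDict", "python")
```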
2 changes: 2 additions & 0 deletions libs/kotaemon/pyproject.toml
@@ -30,8 +30,10 @@ dependencies = [
"langchain>=0.1.16,<0.2.0",
"langchain-community>=0.0.34,<0.1.0",
"langchain-openai>=0.1.4,<0.2.0",
"langchain-anthropic",
"llama-hub>=0.0.79,<0.1.0",
"llama-index>=0.10.40,<0.11.0",
"fastapi<=0.112.1",
"llama-index-vector-stores-chroma>=0.1.9",
"llama-index-vector-stores-lancedb",
"llama-index-vector-stores-milvus",
5 changes: 4 additions & 1 deletion libs/ktem/ktem/app.py
@@ -49,8 +49,11 @@ def __init__(self):
self._js = self._js.replace("KH_APP_VERSION", self.app_version)
with (dir_assets / "js" / "pdf_viewer.js").open() as fi:
self._pdf_view_js = fi.read()
# workaround for Windows path
pdf_js_dist_dir = str(PDFJS_PREBUILT_DIR).replace("\\", "\\\\")
self._pdf_view_js = self._pdf_view_js.replace(
"PDFJS_PREBUILT_DIR", str(PDFJS_PREBUILT_DIR)
"PDFJS_PREBUILT_DIR",
pdf_js_dist_dir,
)

self._favicon = str(dir_assets / "img" / "favicon.svg")
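The Windows workaround exists because the path is spliced into `pdf_viewer.js` as a string literal, where a single backslash starts an escape sequence. Doubling each backslash keeps the path intact inside the generated JavaScript (path value illustrative):

```python
# A Windows path embedded verbatim in a JS string literal would be
# corrupted ("\a", "\k", ... read as escapes); double every backslash
# so the JS string decodes back to the original path.
win_path = r"C:\app\libs\ktem\assets\prebuilt\pdfjs-dist"
js_safe = win_path.replace("\\", "\\\\")

js_snippet = f'const PDFJS_DIR = "{js_safe}";'
```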
10 changes: 9 additions & 1 deletion libs/ktem/ktem/embeddings/manager.py
@@ -55,10 +55,18 @@ def load_vendors(self):
from kotaemon.embeddings import (
AzureOpenAIEmbeddings,
FastEmbedEmbeddings,
LCCohereEmbeddings,
LCHuggingFaceEmbeddings,
OpenAIEmbeddings,
)

self._vendors = [AzureOpenAIEmbeddings, OpenAIEmbeddings, FastEmbedEmbeddings]
self._vendors = [
AzureOpenAIEmbeddings,
OpenAIEmbeddings,
FastEmbedEmbeddings,
LCCohereEmbeddings,
LCHuggingFaceEmbeddings,
]

def __getitem__(self, key: str) -> BaseEmbeddings:
"""Get model by name"""
15 changes: 11 additions & 4 deletions libs/ktem/ktem/embeddings/ui.py
@@ -5,6 +5,7 @@
import yaml
from ktem.app import BasePage
from ktem.utils.file import YAMLNoDateSafeLoader
from theflow.utils.modules import deserialize

from .manager import embedding_models_manager

@@ -237,7 +238,7 @@ def on_register_events(self):

self.btn_test_connection.click(
self.check_connection,
inputs=[self.selected_emb_name],
inputs=[self.selected_emb_name, self.edit_spec],
outputs=[self.connection_logs],
)

@@ -330,14 +331,20 @@ def on_btn_delete_click(self):

return btn_delete, btn_delete_yes, btn_delete_no

def check_connection(self, selected_emb_name):
def check_connection(self, selected_emb_name, selected_spec):
log_content: str = ""

try:
log_content += f"- Testing model: {selected_emb_name}<br>"
yield log_content

emb = embedding_models_manager.get(selected_emb_name)
# Parse content & init model
info = deepcopy(embedding_models_manager.info()[selected_emb_name])

# Parse content & create dummy embedding
spec = yaml.load(selected_spec, Loader=YAMLNoDateSafeLoader)
info["spec"].update(spec)

emb = deserialize(info["spec"], safe=False)

if emb is None:
raise Exception(f"Cannot find model: {selected_emb_name}")
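The updated `check_connection` no longer tests the stored model as-is: it deep-copies the saved spec, overlays whatever is currently in the edit box, and instantiates from the merged dict, so unsaved edits are what actually get tested. A sketch of that merge, with JSON standing in for the YAML spec and all values hypothetical:

```python
# Overlay the user's in-progress edits onto a copy of the stored spec,
# leaving the manager's saved configuration untouched.
import json
from copy import deepcopy

stored = {
    "spec": {
        "__type__": "kotaemon.embeddings.OpenAIEmbeddings",
        "model": "text-embedding-3-small",
        "api_key": "",
    }
}

# What the user typed in the edit box (JSON standing in for YAML):
edited = '{"api_key": "sk-test", "model": "text-embedding-3-large"}'

info = deepcopy(stored)                  # don't mutate the saved copy
info["spec"].update(json.loads(edited))  # edited fields win
```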
18 changes: 15 additions & 3 deletions libs/ktem/ktem/llms/manager.py
@@ -54,9 +54,21 @@ def load(self):
self._default = item.name

def load_vendors(self):
from kotaemon.llms import AzureChatOpenAI, ChatOpenAI, LlamaCppChat

self._vendors = [ChatOpenAI, AzureChatOpenAI, LlamaCppChat]
from kotaemon.llms import (
AzureChatOpenAI,
ChatOpenAI,
LCAnthropicChat,
LCGeminiChat,
LlamaCppChat,
)

self._vendors = [
ChatOpenAI,
AzureChatOpenAI,
LCAnthropicChat,
LCGeminiChat,
LlamaCppChat,
]

for extra_vendor in getattr(flowsettings, "KH_LLM_EXTRA_VENDORS", []):
self._vendors.append(import_dotted_string(extra_vendor, safe=False))
