Commit 10f8053

chore: add llm hybrid inference use case
1 parent 4ec4e65 commit 10f8053

1 file changed: 265 additions & 0 deletions
@@ -0,0 +1,265 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dfccd8e6",
   "metadata": {},
   "source": [
    "# Fine-Tuning GPT-2 on Encrypted Data with LoRA and Concrete ML\n",
    "\n",
    "In this notebook, we fine-tune a GPT-2 model using LoRA and Concrete ML."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "eca73e44",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch._C.Generator at 0x779ae136e650>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import necessary libraries\n",
    "import math\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import torch\n",
    "from datasets import Dataset\n",
    "from peft import LoraConfig, get_peft_model\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments\n",
    "\n",
    "from concrete.ml.torch.hybrid_model import HybridFHEModel\n",
    "\n",
    "# Set random seed for reproducibility\n",
    "SEED = 0\n",
    "torch.manual_seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c082411e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_and_print(prompt, model, tokenizer, seed=None, max_new_tokens=30):\n",
    "    \"\"\"\n",
    "    Generates text based on the provided prompt and prints both the prompt and the generated text.\n",
    "\n",
    "    Args:\n",
    "        prompt (str): The input prompt to generate text from.\n",
    "        model: The pre-trained language model.\n",
    "        tokenizer: The tokenizer associated with the model.\n",
    "        seed (int, optional): Seed for random number generators to ensure reproducibility.\n",
    "        max_new_tokens (int, optional): Maximum number of tokens to generate. Defaults to 30.\n",
    "\n",
    "    Returns:\n",
    "        str: The generated text (response only, without the prompt).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Set the environment variable for CuBLAS deterministic behavior\n",
    "        os.environ[\"CUBLAS_WORKSPACE_CONFIG\"] = \":4096:8\"\n",
    "\n",
    "        # Set the random seed for reproducibility\n",
    "        if seed is not None:\n",
    "            random.seed(seed)\n",
    "            np.random.seed(seed)\n",
    "            torch.manual_seed(seed)\n",
    "            if torch.cuda.is_available():\n",
    "                torch.cuda.manual_seed_all(seed)\n",
    "\n",
    "        # Encode the input prompt\n",
    "        inputs = tokenizer.encode_plus(prompt, return_tensors=\"pt\")\n",
    "\n",
    "        # Move inputs to the same device as the model\n",
    "        device = next(model.parameters()).device\n",
    "        inputs = {k: v.to(device) for k, v in inputs.items()}\n",
    "\n",
    "        # Generate text\n",
    "        with torch.no_grad():\n",
    "            output = model.generate(\n",
    "                input_ids=inputs[\"input_ids\"],\n",
    "                attention_mask=inputs[\"attention_mask\"],\n",
    "                max_new_tokens=max_new_tokens,\n",
    "                top_p=0.9,\n",
    "                temperature=0.6,\n",
    "                do_sample=True,\n",
    "                pad_token_id=tokenizer.eos_token_id,\n",
    "            )\n",
    "\n",
    "        # Get only the newly generated tokens\n",
    "        input_length = inputs[\"input_ids\"].shape[1]\n",
    "        generated_ids = output[0, input_length:]\n",
    "        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()\n",
    "\n",
    "        # Print the prompt and generated text\n",
    "        print(f\"Prompt: {prompt}\")\n",
    "        print(f\"Response: {generated_text}\\n\")\n",
    "\n",
    "        return generated_text\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error in generation: {str(e)}\")\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8b965a1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load pre-trained GPT-2 model and tokenizer\n",
    "model_name = \"gpt2\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name)\n",
    "\n",
    "# Ensure tokenizer has a pad token\n",
    "if tokenizer.pad_token is None:\n",
    "    tokenizer.pad_token = tokenizer.eos_token\n",
    "model.config.pad_token_id = model.config.eos_token_id\n",
    "\n",
    "# Freeze model weights\n",
    "for param in model.parameters():\n",
    "    param.requires_grad = False"
   ]
  },
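  {
   "cell_type": "markdown",
   "id": "f0a1b2c3",
   "metadata": {},
   "source": [
    "The imports above bring in `LoraConfig` and `get_peft_model` from `peft`. Below is a minimal sketch of how LoRA adapters could be attached to the frozen GPT-2; the rank, scaling factor, and target modules are illustrative assumptions, not values from this notebook, and the cell is shown for reference only. The rest of the notebook continues with the frozen base model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1b2c3d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: attach LoRA adapters to the frozen GPT-2.\n",
    "# The rank, alpha, and target modules below are illustrative assumptions.\n",
    "peft_config = LoraConfig(\n",
    "    r=8,  # assumed low-rank dimension\n",
    "    lora_alpha=32,  # assumed scaling factor\n",
    "    lora_dropout=0.05,\n",
    "    bias=\"none\",\n",
    "    task_type=\"CAUSAL_LM\",\n",
    "    target_modules=[\"c_attn\"],  # GPT-2's fused attention projection\n",
    ")\n",
    "peft_model = get_peft_model(model, peft_config)\n",
    "peft_model.print_trainable_parameters()"
   ]
  },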
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2337a6b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prompt: Programming is\n",
      "Response: a skill you need to learn to master.\n",
      "\n",
      "Learn to code\n",
      "\n",
      "There are a lot of different ways to learn programming.\n",
      "\n",
      "The\n",
      "\n"
     ]
    }
   ],
   "source": [
    "_ = generate_and_print(prompt=\"Programming is\", model=model, tokenizer=tokenizer, seed=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a138d226",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch import nn\n",
    "\n",
    "try:\n",
    "    from transformers import Conv1D as TransformerConv1D\n",
    "except ImportError:  # pragma: no cover\n",
    "    TransformerConv1D = None\n",
    "\n",
    "# Create a tuple of linear layer classes to check against\n",
    "LINEAR_LAYERS: tuple = (nn.Linear,)\n",
    "if TransformerConv1D is not None:\n",
    "    LINEAR_LAYERS = LINEAR_LAYERS + (TransformerConv1D,)\n",
    "\n",
    "# Collect the names of all linear-like modules to run remotely\n",
    "remote_names = []\n",
    "for name, module in model.named_modules():\n",
    "    if isinstance(module, LINEAR_LAYERS):\n",
    "        remote_names.append(name)"
   ]
  },
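  {
   "cell_type": "markdown",
   "id": "b2c3d4e5",
   "metadata": {},
   "source": [
    "As a quick sanity check (this cell is an illustrative addition), we can count the selected modules. For GPT-2, 49 layers are expected: 4 linear-like `Conv1D` modules per transformer block across 12 blocks, plus the LM head, matching the compilation progress bar below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3d4e5f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sanity check: 49 modules are expected for GPT-2\n",
    "print(f\"{len(remote_names)} linear/Conv1D modules selected for remote execution\")\n",
    "print(remote_names[:3])"
   ]
  },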
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ae2094a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the HybridFHEModel with the specified remote modules\n",
    "hybrid_model = HybridFHEModel(model, module_names=remote_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "20dfe2d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c7db65de06c84890a25a4eb44d662bd1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Compiling FHE layers:   0%|          | 0/49 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "BLOCK_SIZE = 32\n",
    "# Prepare input data for calibration\n",
    "input_tensor = torch.randint(0, tokenizer.vocab_size, (256, BLOCK_SIZE), dtype=torch.long)\n",
    "\n",
    "# Calibrate and compile the model\n",
    "hybrid_model.compile_model(input_tensor, n_bits=8, use_dynamic_quantization=True)"
   ]
  },
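  {
   "cell_type": "markdown",
   "id": "c4d5e6f7",
   "metadata": {},
   "source": [
    "Before generating text, the compiled layers can be sanity-checked without a server round-trip. This is a minimal sketch, assuming the installed Concrete ML version supports the \"simulate\" value for `set_fhe_mode`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5e6f7a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: run one calibration sample with FHE simulation,\n",
    "# assuming \"simulate\" is a supported mode in this Concrete ML version.\n",
    "hybrid_model.set_fhe_mode(\"simulate\")\n",
    "with torch.no_grad():\n",
    "    logits = hybrid_model.model(input_tensor[:1]).logits\n",
    "print(logits.shape)  # expected: (1, BLOCK_SIZE, vocab_size)"
   ]
  },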
  {
   "cell_type": "markdown",
   "id": "65d448c8",
   "metadata": {},
   "source": [
    "Note that our goal is to showcase the use of FHE for encrypted fine-tuning. The dataset consists of 68 examples and a total of 2,386 tokens, which is relatively small. Despite this limited size, which gives the model little material to learn from, fine-tuning still produces interesting results."
   ]
  },
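  {
   "cell_type": "markdown",
   "id": "e6f7a8b9",
   "metadata": {},
   "source": [
    "The dataset-loading and training cells are not part of this commit. As a hedged sketch of what the setup could look like with the already-imported `Dataset`, `Trainer`, and `TrainingArguments` (the `raw_texts` placeholder and every hyperparameter below are assumptions, and this is a standard non-FHE Hugging Face training loop shown only to illustrate the data format, using the `peft_model` from the LoRA sketch above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7a8b9c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: raw_texts is a hypothetical placeholder for the\n",
    "# 68-example dataset; all hyperparameters are assumptions.\n",
    "raw_texts = [\"Programming is fun.\"]  # hypothetical placeholder\n",
    "\n",
    "def tokenize(batch):\n",
    "    tokens = tokenizer(\n",
    "        batch[\"text\"], truncation=True, max_length=BLOCK_SIZE, padding=\"max_length\"\n",
    "    )\n",
    "    # For causal LM fine-tuning, the labels are the input ids themselves\n",
    "    tokens[\"labels\"] = tokens[\"input_ids\"].copy()\n",
    "    return tokens\n",
    "\n",
    "train_dataset = Dataset.from_dict({\"text\": raw_texts}).map(\n",
    "    tokenize, batched=True, remove_columns=[\"text\"]\n",
    ")\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"checkpoints\",  # assumed output path\n",
    "    num_train_epochs=10,  # assumed\n",
    "    per_device_train_batch_size=4,  # assumed\n",
    "    learning_rate=2e-4,  # assumed\n",
    "    report_to=\"none\",\n",
    ")\n",
    "trainer = Trainer(model=peft_model, args=training_args, train_dataset=train_dataset)\n",
    "# trainer.train()  # left commented: training is outside the scope of this commit"
   ]
  },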
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e91ad0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set FHE mode to disable for text generation\n",
    "hybrid_model.set_fhe_mode(\"disable\")\n",
    "\n",
    "_ = generate_and_print(\n",
    "    prompt=\"Programming is\", model=hybrid_model.model, tokenizer=tokenizer, seed=SEED\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "execution": {
   "timeout": 10800
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
