Commit
Minor Issues Fixed
AllianceSoftech committed Dec 9, 2024
1 parent 136817a commit 66d0729
Showing 6 changed files with 901 additions and 319 deletions.
35 changes: 13 additions & 22 deletions part_1/01_main-code/part_1.ipynb
@@ -18,19 +18,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "4d1305cf-12d5-46fe-a2c9-36fb71c5b3d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch version: 2.4.0\n",
"tiktoken version: 0.8.0\n"
]
}
],
"outputs": [],
"source": [
"from importlib.metadata import version\n",
"\n",
@@ -138,7 +129,7 @@
"import os\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(\"the-verdict.txt\"):\n",
"if not os.path.exists(\"wizard_of_oz.txt\"):\n",
" url = (\"https://raw.githubusercontent.com/Sangwan70/Building-an-LLM-From-Scratch/refs/heads/main/part_1/01_main-code/wizard_of_oz.txt\")\n",
" file_path = \"wizard_of_oz.txt\"\n",
" urllib.request.urlretrieve(url, file_path)"
@@ -159,7 +150,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
"with open(\"wizard_of_oz.txt\", \"r\", encoding=\"utf-8\") as f:\n",
" raw_text = f.read()\n",
" \n",
"print(\"Total number of character:\", len(raw_text))\n",
@@ -462,8 +453,8 @@
"source": [
"tokenizer = SimpleTokenizerV1(vocab)\n",
"\n",
"text = \"\"\"\"It's the last he painted, you know,\" \n",
" Mrs. Gisburn said with pardonable pride.\"\"\"\n",
"text = \"\"\"Princess Ozma, whom I love as much as my readers do, is again introduced\n",
"in this story, and so are several of our old friends of Oz.\"\"\"\n",
"ids = tokenizer.encode(text)\n",
"print(ids)"
]
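The cell above only prints the token IDs. Assuming SimpleTokenizerV1 also exposes a decode method (it is not visible in this diff), the round trip can be sanity-checked like so:

    # Sketch only: decode is assumed, not shown in the changed lines above.
    ids = tokenizer.encode(text)
    print(ids)
    # Should reproduce the Princess Ozma passage, up to whitespace
    # normalization around punctuation.
    print(tokenizer.decode(ids))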
@@ -572,7 +563,7 @@
"source": [
"tokenizer = SimpleTokenizerV1(vocab)\n",
"\n",
"text = \"Hello, do you like tea. Is this-- a test?\"\n",
"text = \"Hello, have you completed OCI Generative AI Professional Certification Course?\"\n",
"\n",
"tokenizer.encode(text)"
]
@@ -582,7 +573,7 @@
"id": "dc53ee0c-fe2b-4cd8-a946-5471f7651acf",
"metadata": {},
"source": [
"- The above produces an error because the word \"Hello\" is not contained in the vocabulary\n",
"- The above produces an error because the word \"OCI\" is not contained in the vocabulary\n",
"- To deal with such cases, we can add special tokens like `\"<|unk|>\"` to the vocabulary to represent unknown words\n",
"- Since we are already extending the vocabulary, let's add another token called `\"<|endoftext|>\"` which is used in GPT-2 training to denote the end of a text (and it's also used between concatenated text, like if our training datasets consists of multiple articles, books, etc.)"
]
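The SimpleTokenizerV2 implementation itself lies outside the lines shown in this diff. A minimal, self-contained sketch of the behavior the cell describes (the repository's actual class may differ) is to extend the vocabulary with the two special tokens and fall back to <|unk|> for out-of-vocabulary words; the toy vocabulary below is an assumption standing in for the notebook's Oz-derived one:

    import re

    def tokenize(text):
        # Same basic split strategy the notebook's simple tokenizers use
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        return [t.strip() for t in tokens if t.strip()]

    # Toy vocabulary standing in for the notebook's full one
    all_tokens = sorted(set(tokenize("Hello, have you completed the course?")))
    all_tokens.extend(["<|endoftext|>", "<|unk|>"])
    vocab = {token: i for i, token in enumerate(all_tokens)}

    class SimpleTokenizerV2:
        def __init__(self, vocab):
            self.str_to_int = vocab
            self.int_to_str = {i: s for s, i in vocab.items()}

        def encode(self, text):
            tokens = tokenize(text)
            # Out-of-vocabulary words map to <|unk|> instead of raising KeyError
            tokens = [t if t in self.str_to_int else "<|unk|>" for t in tokens]
            return [self.str_to_int[t] for t in tokens]

        def decode(self, ids):
            text = " ".join(self.int_to_str[i] for i in ids)
            # Remove the space the join inserts before punctuation
            return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

    tok = SimpleTokenizerV2(vocab)
    print(tok.decode(tok.encode("Hello, have you completed OCI?")))
    # -> Hello, have you completed <|unk|>?   ("OCI" is not in the toy vocab)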
@@ -676,8 +667,8 @@
"source": [
"tokenizer = SimpleTokenizerV2(vocab)\n",
"\n",
"text1 = \"Hello, do you like tea?\"\n",
"text2 = \"In the sunlit terraces of the palace.\"\n",
"text1 = \"Hello, have you completed OCI Generative AI\"\n",
"text2 = \"Professional Certification Course?\"\n",
"\n",
"text = \" <|endoftext|> \".join((text1, text2))\n",
"\n",
@@ -766,8 +757,8 @@
"outputs": [],
"source": [
"text = (\n",
" \"Hello, do you like tea? <|endoftext|> In the sunlit terraces\"\n",
" \"of someunknownPlace.\"\n",
" \"Hello, have you completed <|unk|> <|unk|> <|unk|> <|endoftext|> <|unk|> <|unk|> <|unk|>?\"\n",
" \"Professional Certification Course?\"\n",
")\n",
"\n",
"integers = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
@@ -1034,7 +1025,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
"with open(\"wizard_of_oz.txt\", \"r\", encoding=\"utf-8\") as f:\n",
" raw_text = f.read()"
]
},
