Commit
Minor Issues Fixed
AllianceSoftech committed Dec 9, 2024
1 parent 136817a commit 66d0729
Showing 6 changed files with 901 additions and 319 deletions.
35 changes: 13 additions & 22 deletions part_1/01_main-code/part_1.ipynb
@@ -18,19 +18,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "4d1305cf-12d5-46fe-a2c9-36fb71c5b3d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch version: 2.4.0\n",
"tiktoken version: 0.8.0\n"
]
}
],
"outputs": [],
"source": [
"from importlib.metadata import version\n",
"\n",
@@ -138,7 +129,7 @@
"import os\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(\"the-verdict.txt\"):\n",
"if not os.path.exists(\"wizard_of_oz.txt\"):\n",
" url = (\"https://raw.githubusercontent.com/Sangwan70/Building-an-LLM-From-Scratch/refs/heads/main/part_1/01_main-code/wizard_of_oz.txt\")\n",
" file_path = \"wizard_of_oz.txt\"\n",
" urllib.request.urlretrieve(url, file_path)"
@@ -159,7 +150,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
"with open(\"wizard_of_oz.txt\", \"r\", encoding=\"utf-8\") as f:\n",
" raw_text = f.read()\n",
" \n",
"print(\"Total number of character:\", len(raw_text))\n",
@@ -462,8 +453,8 @@
"source": [
"tokenizer = SimpleTokenizerV1(vocab)\n",
"\n",
"text = \"\"\"\"It's the last he painted, you know,\" \n",
" Mrs. Gisburn said with pardonable pride.\"\"\"\n",
"text = \"\"\"Princess Ozma, whom I love as much as my readers do, is again introduced\n",
"in this story, and so are several of our old friends of Oz.\"\"\"\n",
"ids = tokenizer.encode(text)\n",
"print(ids)"
]
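The cell above only prints the token IDs. Assuming SimpleTokenizerV1 also exposes a decode method (it is not visible in this diff), the round trip can be sanity-checked like so:

    # Sketch only: decode is assumed, not shown in the changed lines above.
    ids = tokenizer.encode(text)
    print(ids)
    # Should reproduce the Princess Ozma passage, up to whitespace
    # normalization around punctuation.
    print(tokenizer.decode(ids))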
@@ -572,7 +563,7 @@
"source": [
"tokenizer = SimpleTokenizerV1(vocab)\n",
"\n",
"text = \"Hello, do you like tea. Is this-- a test?\"\n",
"text = \"Hello, have you completed OCI Generative AI Professional Certification Course?\"\n",
"\n",
"tokenizer.encode(text)"
]
@@ -582,7 +573,7 @@
"id": "dc53ee0c-fe2b-4cd8-a946-5471f7651acf",
"metadata": {},
"source": [
"- The above produces an error because the word \"Hello\" is not contained in the vocabulary\n",
"- The above produces an error because the word \"OCI\" is not contained in the vocabulary\n",
"- To deal with such cases, we can add special tokens like `\"<|unk|>\"` to the vocabulary to represent unknown words\n",
"- Since we are already extending the vocabulary, let's add another token called `\"<|endoftext|>\"` which is used in GPT-2 training to denote the end of a text (and it's also used between concatenated text, like if our training datasets consists of multiple articles, books, etc.)"
]
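The SimpleTokenizerV2 implementation itself lies outside the lines shown in this diff. A minimal, self-contained sketch of the behavior the cell describes (the repository's actual class may differ) is to extend the vocabulary with the two special tokens and fall back to <|unk|> for out-of-vocabulary words; the toy vocabulary below is an assumption standing in for the notebook's Oz-derived one:

    import re

    def tokenize(text):
        # Same basic split strategy the notebook's simple tokenizers use
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        return [t.strip() for t in tokens if t.strip()]

    # Toy vocabulary standing in for the notebook's full one
    all_tokens = sorted(set(tokenize("Hello, have you completed the course?")))
    all_tokens.extend(["<|endoftext|>", "<|unk|>"])
    vocab = {token: i for i, token in enumerate(all_tokens)}

    class SimpleTokenizerV2:
        def __init__(self, vocab):
            self.str_to_int = vocab
            self.int_to_str = {i: s for s, i in vocab.items()}

        def encode(self, text):
            tokens = tokenize(text)
            # Out-of-vocabulary words map to <|unk|> instead of raising KeyError
            tokens = [t if t in self.str_to_int else "<|unk|>" for t in tokens]
            return [self.str_to_int[t] for t in tokens]

        def decode(self, ids):
            text = " ".join(self.int_to_str[i] for i in ids)
            # Remove the space the join inserts before punctuation
            return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

    tok = SimpleTokenizerV2(vocab)
    print(tok.decode(tok.encode("Hello, have you completed OCI?")))
    # -> Hello, have you completed <|unk|>?   ("OCI" is not in the toy vocab)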
@@ -676,8 +667,8 @@
"source": [
"tokenizer = SimpleTokenizerV2(vocab)\n",
"\n",
"text1 = \"Hello, do you like tea?\"\n",
"text2 = \"In the sunlit terraces of the palace.\"\n",
"text1 = \"Hello, have you completed OCI Generative AI\"\n",
"text2 = \"Professional Certification Course?\"\n",
"\n",
"text = \" <|endoftext|> \".join((text1, text2))\n",
"\n",
@@ -766,8 +757,8 @@
"outputs": [],
"source": [
"text = (\n",
" \"Hello, do you like tea? <|endoftext|> In the sunlit terraces\"\n",
" \"of someunknownPlace.\"\n",
" \"Hello, have you completed <|unk|> <|unk|> <|unk|> <|endoftext|> <|unk|> <|unk|> <|unk|>?\"\n",
" \"Professional Certification Course?\"\n",
")\n",
"\n",
"integers = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
@@ -1034,7 +1025,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
"with open(\"wizard_of_oz.txt\", \"r\", encoding=\"utf-8\") as f:\n",
" raw_text = f.read()"
]
},
