Commit

Merge pull request #121 from lta155/main
Bug fixes for Chapters 3 and 4
logan-zou authored Jun 29, 2024
2 parents 2ddeb21 + 4e43c26 commit 0ce94e8
Showing 10 changed files with 91 additions and 1,055 deletions.
Binary file modified data_base/vector_db/chroma/chroma.sqlite3
Binary file not shown.
2 changes: 1 addition & 1 deletion notebook/C3 搭建知识库/2.使用 Embedding API.ipynb
@@ -10,7 +10,7 @@
"GPT provides ready-made, well-encapsulated interfaces, so a thin wrapper is all we need. There are currently three GPT embedding models, whose performance is as follows:\n",
"|Model | Pages per dollar | [MTEB](https://github.com/embeddings-benchmark/mteb) score | [MIRACL](https://github.com/project-miracl/miracl) score|\n",
"| --- | --- | --- | --- |\n",
"|text-embedding-3-large|9,615|54.9|64.6|\n",
"|text-embedding-3-large|9,615|64.6|54.9|\n",
"|text-embedding-3-small|62,500|62.3|44.0|\n",
"|text-embedding-ada-002|12,500|61.0|31.4|\n",
"* The MTEB score is the average over eight embedding tasks, including classification, clustering, and pair matching.\n",
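Benchmark scores like MTEB ultimately rest on comparing embedding vectors, most often by cosine similarity. As a minimal, self-contained sketch of that comparison (plain Python, no API calls; the toy vectors are made up for illustration and are far shorter than the 1,536+ dimensions real models return):

```python
import math

def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

# Toy 3-dimensional "embeddings": the query is closer to doc_close than doc_far.
query = [0.1, 0.9, 0.2]
doc_close = [0.2, 0.8, 0.1]
doc_far = [0.9, 0.1, 0.0]

print(cosine_similarity(query, doc_close) > cosine_similarity(query, doc_far))  # True
```

Identical directions score 1.0, orthogonal ones 0.0, which is why a higher score means a semantically closer match.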
37 changes: 20 additions & 17 deletions notebook/C3 搭建知识库/4.搭建并使用向量数据库.ipynb
Expand Up @@ -18,7 +18,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['../../data_base/knowledge_db/prompt_engineering/6. 文本转换 Transforming.md', '../../data_base/knowledge_db/prompt_engineering/4. 文本概括 Summarizing.md', '../../data_base/knowledge_db/prompt_engineering/5. 推断 Inferring.md']\n"
"['../../data_base/knowledge_db/.DS_Store', '../../data_base/knowledge_db/prompt_engineering/6. 文本转换 Transforming.md', '../../data_base/knowledge_db/prompt_engineering/4. 文本概括 Summarizing.md']\n"
]
}
],
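The new output above shows a stray macOS `.DS_Store` entry in the file listing. One common way to keep such hidden files out of the knowledge base is to skip any name starting with a dot while walking the directory; a minimal sketch (the helper name and directory layout are illustrative, not from the repo):

```python
import os

def list_knowledge_files(root: str) -> list[str]:
    """Recursively collect file paths, skipping hidden files such as .DS_Store."""
    paths = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune hidden directories in place so os.walk does not descend into them.
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]
        for name in filenames:
            if not name.startswith("."):
                paths.append(os.path.join(dirpath, name))
    return paths
```

Applied to `knowledge_db`, this keeps the markdown sources while dropping entries like `.DS_Store`.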
@@ -525,7 +525,7 @@
"from langchain.vectorstores.chroma import Chroma\n",
"\n",
"vectordb = Chroma.from_documents(\n",
"    documents=split_docs[:20], # for speed, embed only the first 20 split docs; with Qianfan, QPS limits make the first 5 advisable\n",
" documents=split_docs,\n",
" embedding=embedding,\n",
"    persist_directory=persist_directory # lets us save the persist_directory directory to disk\n",
")"
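The removed comment mentions Qianfan's QPS limit. A common way to embed a full document list under such a limit is to batch the inputs and pause between batches; a hedged sketch (the batch size and delay are illustrative assumptions, not Qianfan's documented limits, and `embed_fn` stands in for any embedding call):

```python
import time
from typing import Callable

def embed_in_batches(texts: list[str],
                     embed_fn: Callable[[list[str]], list[list[float]]],
                     batch_size: int = 5,
                     delay_s: float = 1.0) -> list[list[float]]:
    """Embed texts in small batches, pausing between calls to respect a QPS cap."""
    vectors: list[list[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        vectors.extend(embed_fn(batch))
        if i + batch_size < len(texts):
            time.sleep(delay_s)
    return vectors
```

With this in place, passing all of `split_docs` becomes a matter of waiting rather than truncating the corpus.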
@@ -544,7 +544,16 @@
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lta/anaconda3/envs/llm_universe_2.x/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n",
" warn_deprecated(\n"
]
}
],
"source": [
"vectordb.persist()"
]
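The stderr output added above is LangChain noting that since Chroma 0.4.x documents are persisted automatically, so the manual `vectordb.persist()` call is deprecated. A defensive pattern is to call it only when the installed version still needs it; a minimal version-check sketch (pure string parsing, no Chroma import, and the helper name is hypothetical):

```python
def needs_manual_persist(chroma_version: str) -> bool:
    """Return True for Chroma < 0.4.0, where persist() must be called manually."""
    major, minor = (int(part) for part in chroma_version.split(".")[:2])
    return (major, minor) < (0, 4)

# Example: gate the deprecated call on the installed version.
print(needs_manual_persist("0.3.29"))  # True
print(needs_manual_persist("0.4.15"))  # False
```

On 0.4.x and later the call can simply be dropped, which also silences the `LangChainDeprecationWarning`.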
@@ -558,7 +567,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Number of entries stored in the vector store: 20\n"
"Number of entries stored in the vector store: 925\n"
]
}
],
@@ -692,21 +701,15 @@
"Mastering the skill of calling large language model APIs for text transformation is an important step in developing all kinds of language applications. ...\n",
"--------------\n",
"MMR 检索到的第1个内容: \n",
"\"This phrase is to cherck chatGPT for spelling abilitty\" # spelling\n",
"]\n",
"Unlike base language models, instruction-tuned LLMs are specially trained to better understand and follow instructions. For example, when asked \"What is the capital of France?\", such a model will most likely answer directly: \"The capital of France is Paris.\" Instruction-tuned LLMs are usually built on a pretrained language model: first pretrained on large-scale text data to learn the basic patterns of language, then further trained and fine-tuned with instructions as input and correct responses to those instructions as output. Sometimes RLHF (reinforce\n",
"--------------\n",
"MMR 检索到的第2个内容: \n",
"room.\n",
"\n",
"room. Yes, adults also like pandas\n",
"\n",
"too.\n",
"\n",
"too. She takes it everywhere with her, and it's super soft and\n",
"\n",
"cute. One\n",
"\n",
"cute. However, one of the ears is a bit lower than the other, and I don't t\n",
"Every sample in the same dataset has the same number of features. Suppose each sample in this dataset has d features; then the i-th sample is represented mathematically as a d-dimensional vector: xi = (xi1; xi2; ...; xid), where xij denotes the value of sample xi on the j-th attribute.\n",
"Model: the general machine learning workflow is as follows: first collect some samples (say 100 of them), then split them into training samples (80) and test samples (20), where the set formed by the 80 training samples is called the \"\n",
"--------------\n"
]
}
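The output above comes from MMR (maximal marginal relevance) retrieval, which trades off similarity to the query against diversity among the results already selected. A minimal pure-Python sketch of the greedy selection loop (the λ weight and toy similarity values are illustrative assumptions, not LangChain's internals):

```python
def mmr_select(query_sim, doc_sims, k=2, lambda_mult=0.5):
    """Greedy MMR: pick k doc indices balancing query relevance and diversity.

    query_sim[i]   -- similarity of doc i to the query
    doc_sims[i][j] -- similarity between docs i and j
    """
    selected = []
    candidates = list(range(len(query_sim)))
    while candidates and len(selected) < k:
        def score(i):
            # Penalize a candidate by its closest already-selected neighbor.
            redundancy = max((doc_sims[i][j] for j in selected), default=0.0)
            return lambda_mult * query_sim[i] - (1 - lambda_mult) * redundancy
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

# Doc 1 is nearly a duplicate of doc 0, so MMR prefers the more diverse doc 2.
query_sim = [0.9, 0.85, 0.5]
doc_sims = [[1.0, 0.95, 0.1],
            [0.95, 1.0, 0.1],
            [0.1, 0.1, 1.0]]
print(mmr_select(query_sim, doc_sims, k=2))  # [0, 2]
```

This is why the two MMR hits above can differ in topic: pure similarity search would return near-duplicates, while MMR deliberately spreads the results.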