)`."
- ]
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0e2bbfb7283745f2a93d8436c99e5e8b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Map: 0%| | 0/12 [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
"# 准备数据\n",
- "# datapath = '../../data/alpaca_gpt4_data_dev.json' # dev数据只有少量数据,用于开发,实际训练时请使用full数据集\n",
- "datapath = '../../data/alpaca_gpt4_data.json' # full数据集\n",
+ "datapath = 'data/alpaca_gpt4_data_dev.json' # dev数据只有少量数据,用于开发,实际训练时请使用full数据集\n",
+ "# datapath = 'data/alpaca_gpt4_data.json' # full数据集\n",
"\n",
"# 定义tokenizer\n",
"from transformers import GPT2Tokenizer\n",
@@ -104,20 +148,426 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "epoch: 0, step: 0, loss: 8.52311897277832\n",
- "val step: 0, loss: 5.500304698944092\n",
- "epoch: 1, step: 0, loss: 5.841464042663574\n",
- "val step: 0, loss: 4.260898113250732\n",
- "epoch: 2, step: 0, loss: 4.249709129333496\n",
- "val step: 0, loss: 3.278724193572998\n"
+ "epoch: 0, step: 0, loss: 7.793501853942871\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+ "│ in <module>:39 │\n",
+ "│ │\n",
+ "│ 36 │ │ torch.save(model.state_dict(), f\"model_{epoch}.pt\") │\n",
+ "│ 37 │ │ val() │\n",
+ "│ 38 │\n",
+ "│ ❱ 39 train() │\n",
+ "│ 40 │\n",
+ "│ │\n",
+ "│ in train:30 │\n",
+ "│ │\n",
+ "│ 27 │ optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) │\n",
+ "│ 28 │ for epoch in range(3): │\n",
+ "│ 29 │ │ for step, batch in enumerate(dataloader): │\n",
+ "│ ❱ 30 │ │ │ loss = train_step(batch) │\n",
+ "│ 31 │ │ │ loss.backward() │\n",
+ "│ 32 │ │ │ optimizer.step() │\n",
+ "│ 33 │ │ │ optimizer.zero_grad() │\n",
+ "│ │\n",
+ "│ in train_step:13 │\n",
+ "│ │\n",
+ "│ 10 │ │ \"attention_mask\": batch[\"attention_mask\"], │\n",
+ "│ 11 │ │ \"labels\": batch[\"labels\"], │\n",
+ "│ 12 │ } │\n",
+ "│ ❱ 13 │ res = model(**kwargs)[\"loss\"] │\n",
+ "│ 14 │ return res │\n",
+ "│ 15 │\n",
+ "│ 16 def val(): │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1518 in _wrapped_call_impl │\n",
+ "│ │\n",
+ "│ 1515 │ │ if self._compiled_call_impl is not None: │\n",
+ "│ 1516 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] │\n",
+ "│ 1517 │ │ else: │\n",
+ "│ ❱ 1518 │ │ │ return self._call_impl(*args, **kwargs) │\n",
+ "│ 1519 │ │\n",
+ "│ 1520 │ def _call_impl(self, *args, **kwargs): │\n",
+ "│ 1521 │ │ forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.fo │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1527 in _call_impl │\n",
+ "│ │\n",
+ "│ 1524 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
+ "│ 1525 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
+ "│ 1526 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
+ "│ ❱ 1527 │ │ │ return forward_call(*args, **kwargs) │\n",
+ "│ 1528 │ │ │\n",
+ "│ 1529 │ │ try: │\n",
+ "│ 1530 │ │ │ result = None │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py:1075 in forward │\n",
+ "│ │\n",
+ "│ 1072 │ │ \"\"\" │\n",
+ "│ 1073 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │\n",
+ "│ 1074 │ │ │\n",
+ "│ ❱ 1075 │ │ transformer_outputs = self.transformer( │\n",
+ "│ 1076 │ │ │ input_ids, │\n",
+ "│ 1077 │ │ │ past_key_values=past_key_values, │\n",
+ "│ 1078 │ │ │ attention_mask=attention_mask, │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1518 in _wrapped_call_impl │\n",
+ "│ │\n",
+ "│ 1515 │ │ if self._compiled_call_impl is not None: │\n",
+ "│ 1516 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] │\n",
+ "│ 1517 │ │ else: │\n",
+ "│ ❱ 1518 │ │ │ return self._call_impl(*args, **kwargs) │\n",
+ "│ 1519 │ │\n",
+ "│ 1520 │ def _call_impl(self, *args, **kwargs): │\n",
+ "│ 1521 │ │ forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.fo │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1527 in _call_impl │\n",
+ "│ │\n",
+ "│ 1524 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
+ "│ 1525 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
+ "│ 1526 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
+ "│ ❱ 1527 │ │ │ return forward_call(*args, **kwargs) │\n",
+ "│ 1528 │ │ │\n",
+ "│ 1529 │ │ try: │\n",
+ "│ 1530 │ │ │ result = None │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py:899 in forward │\n",
+ "│ │\n",
+ "│ 896 │ │ │ │ │ encoder_attention_mask, │\n",
+ "│ 897 │ │ │ │ ) │\n",
+ "│ 898 │ │ │ else: │\n",
+ "│ ❱ 899 │ │ │ │ outputs = block( │\n",
+ "│ 900 │ │ │ │ │ hidden_states, │\n",
+ "│ 901 │ │ │ │ │ layer_past=layer_past, │\n",
+ "│ 902 │ │ │ │ │ attention_mask=attention_mask, │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1518 in _wrapped_call_impl │\n",
+ "│ │\n",
+ "│ 1515 │ │ if self._compiled_call_impl is not None: │\n",
+ "│ 1516 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] │\n",
+ "│ 1517 │ │ else: │\n",
+ "│ ❱ 1518 │ │ │ return self._call_impl(*args, **kwargs) │\n",
+ "│ 1519 │ │\n",
+ "│ 1520 │ def _call_impl(self, *args, **kwargs): │\n",
+ "│ 1521 │ │ forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.fo │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1527 in _call_impl │\n",
+ "│ │\n",
+ "│ 1524 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
+ "│ 1525 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
+ "│ 1526 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
+ "│ ❱ 1527 │ │ │ return forward_call(*args, **kwargs) │\n",
+ "│ 1528 │ │ │\n",
+ "│ 1529 │ │ try: │\n",
+ "│ 1530 │ │ │ result = None │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py:389 in forward │\n",
+ "│ │\n",
+ "│ 386 │ ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor │\n",
+ "│ 387 │ │ residual = hidden_states │\n",
+ "│ 388 │ │ hidden_states = self.ln_1(hidden_states) │\n",
+ "│ ❱ 389 │ │ attn_outputs = self.attn( │\n",
+ "│ 390 │ │ │ hidden_states, │\n",
+ "│ 391 │ │ │ layer_past=layer_past, │\n",
+ "│ 392 │ │ │ attention_mask=attention_mask, │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1518 in _wrapped_call_impl │\n",
+ "│ │\n",
+ "│ 1515 │ │ if self._compiled_call_impl is not None: │\n",
+ "│ 1516 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] │\n",
+ "│ 1517 │ │ else: │\n",
+ "│ ❱ 1518 │ │ │ return self._call_impl(*args, **kwargs) │\n",
+ "│ 1519 │ │\n",
+ "│ 1520 │ def _call_impl(self, *args, **kwargs): │\n",
+ "│ 1521 │ │ forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.fo │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1527 in _call_impl │\n",
+ "│ │\n",
+ "│ 1524 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
+ "│ 1525 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
+ "│ 1526 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
+ "│ ❱ 1527 │ │ │ return forward_call(*args, **kwargs) │\n",
+ "│ 1528 │ │ │\n",
+ "│ 1529 │ │ try: │\n",
+ "│ 1530 │ │ │ result = None │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py:330 in forward │\n",
+ "│ │\n",
+ "│ 327 │ │ if self.reorder_and_upcast_attn: │\n",
+ "│ 328 │ │ │ attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, valu │\n",
+ "│ 329 │ │ else: │\n",
+ "│ ❱ 330 │ │ │ attn_output, attn_weights = self._attn(query, key, value, attention_mask, he │\n",
+ "│ 331 │ │ │\n",
+ "│ 332 │ │ attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) │\n",
+ "│ 333 │ │ attn_output = self.c_proj(attn_output) │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py:211 in _attn │\n",
+ "│ │\n",
+ "│ 208 │ │ │\n",
+ "│ 209 │ │ # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op oth │\n",
+ "│ 210 │ │ attn_weights = attn_weights.type(value.dtype) │\n",
+ "│ ❱ 211 │ │ attn_weights = self.attn_dropout(attn_weights) │\n",
+ "│ 212 │ │ │\n",
+ "│ 213 │ │ # Mask heads if we want to │\n",
+ "│ 214 │ │ if head_mask is not None: │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1518 in _wrapped_call_impl │\n",
+ "│ │\n",
+ "│ 1515 │ │ if self._compiled_call_impl is not None: │\n",
+ "│ 1516 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] │\n",
+ "│ 1517 │ │ else: │\n",
+ "│ ❱ 1518 │ │ │ return self._call_impl(*args, **kwargs) │\n",
+ "│ 1519 │ │\n",
+ "│ 1520 │ def _call_impl(self, *args, **kwargs): │\n",
+ "│ 1521 │ │ forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.fo │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py:1527 in _call_impl │\n",
+ "│ │\n",
+ "│ 1524 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
+ "│ 1525 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
+ "│ 1526 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
+ "│ ❱ 1527 │ │ │ return forward_call(*args, **kwargs) │\n",
+ "│ 1528 │ │ │\n",
+ "│ 1529 │ │ try: │\n",
+ "│ 1530 │ │ │ result = None │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\dropout.py:58 in forward │\n",
+ "│ │\n",
+ "│ 55 │ \"\"\" │\n",
+ "│ 56 │ │\n",
+ "│ 57 │ def forward(self, input: Tensor) -> Tensor: │\n",
+ "│ ❱ 58 │ │ return F.dropout(input, self.p, self.training, self.inplace) │\n",
+ "│ 59 │\n",
+ "│ 60 │\n",
+ "│ 61 class Dropout1d(_DropoutNd): │\n",
+ "│ │\n",
+ "│ e:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\functional.py:1266 in dropout │\n",
+ "│ │\n",
+ "│ 1263 │ │ return handle_torch_function(dropout, (input,), input, p=p, training=training, i │\n",
+ "│ 1264 │ if p < 0.0 or p > 1.0: │\n",
+ "│ 1265 │ │ raise ValueError(f\"dropout probability has to be between 0 and 1, but got {p}\") │\n",
+ "│ ❱ 1266 │ return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, traini │\n",
+ "│ 1267 │\n",
+ "│ 1268 │\n",
+ "│ 1269 def alpha_dropout(input: Tensor, p: float = 0.5, training: bool = False, inplace: bool = │\n",
+ "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+ "KeyboardInterrupt\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+ "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m39\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m36 \u001b[0m\u001b[2m│ │ \u001b[0mtorch.save(model.state_dict(), \u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mmodel_\u001b[0m\u001b[33m{\u001b[0mepoch\u001b[33m}\u001b[0m\u001b[33m.pt\u001b[0m\u001b[33m\"\u001b[0m) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m37 \u001b[0m\u001b[2m│ │ \u001b[0mval() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m38 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m39 train() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m40 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m in \u001b[92mtrain\u001b[0m:\u001b[94m30\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m27 \u001b[0m\u001b[2m│ \u001b[0moptimizer = torch.optim.Adam(model.parameters(), lr=\u001b[94m1e-5\u001b[0m) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m28 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mfor\u001b[0m epoch \u001b[95min\u001b[0m \u001b[96mrange\u001b[0m(\u001b[94m3\u001b[0m): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m29 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mfor\u001b[0m step, batch \u001b[95min\u001b[0m \u001b[96menumerate\u001b[0m(dataloader): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m30 \u001b[2m│ │ │ \u001b[0mloss = train_step(batch) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m31 \u001b[0m\u001b[2m│ │ │ \u001b[0mloss.backward() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m32 \u001b[0m\u001b[2m│ │ │ \u001b[0moptimizer.step() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m33 \u001b[0m\u001b[2m│ │ │ \u001b[0moptimizer.zero_grad() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m in \u001b[92mtrain_step\u001b[0m:\u001b[94m13\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m10 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33mattention_mask\u001b[0m\u001b[33m\"\u001b[0m: batch[\u001b[33m\"\u001b[0m\u001b[33mattention_mask\u001b[0m\u001b[33m\"\u001b[0m], \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m11 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33mlabels\u001b[0m\u001b[33m\"\u001b[0m: batch[\u001b[33m\"\u001b[0m\u001b[33mlabels\u001b[0m\u001b[33m\"\u001b[0m], \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m12 \u001b[0m\u001b[2m│ \u001b[0m} \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m13 \u001b[2m│ \u001b[0mres = model(**kwargs)[\u001b[33m\"\u001b[0m\u001b[33mloss\u001b[0m\u001b[33m\"\u001b[0m] \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m14 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m res \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m15 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m16 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mval\u001b[0m(): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1518\u001b[0m in \u001b[92m_wrapped_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1515 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1516 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl(*args, **kwargs) \u001b[2m# type: ignore[misc]\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1517 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1518 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._call_impl(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1519 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1520 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_call_impl\u001b[0m(\u001b[96mself\u001b[0m, *args, **kwargs): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1521 \u001b[0m\u001b[2m│ │ \u001b[0mforward_call = (\u001b[96mself\u001b[0m._slow_forward \u001b[94mif\u001b[0m torch._C._get_tracing_state() \u001b[94melse\u001b[0m \u001b[96mself\u001b[0m.fo \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1527\u001b[0m in \u001b[92m_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1524 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m (\u001b[96mself\u001b[0m._backward_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._backward_pre_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._forward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1525 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_backward_pre_hooks \u001b[95mor\u001b[0m _global_backward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1526 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_forward_hooks \u001b[95mor\u001b[0m _global_forward_pre_hooks): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1527 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m forward_call(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1528 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1529 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1530 \u001b[0m\u001b[2m│ │ │ \u001b[0mresult = \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py\u001b[0m:\u001b[94m1075\u001b[0m in \u001b[92mforward\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1072 \u001b[0m\u001b[2;33m│ │ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1073 \u001b[0m\u001b[2m│ │ \u001b[0mreturn_dict = return_dict \u001b[94mif\u001b[0m return_dict \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m \u001b[94melse\u001b[0m \u001b[96mself\u001b[0m.config.use_return \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1074 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1075 \u001b[2m│ │ \u001b[0mtransformer_outputs = \u001b[96mself\u001b[0m.transformer( \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1076 \u001b[0m\u001b[2m│ │ │ \u001b[0minput_ids, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1077 \u001b[0m\u001b[2m│ │ │ \u001b[0mpast_key_values=past_key_values, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1078 \u001b[0m\u001b[2m│ │ │ \u001b[0mattention_mask=attention_mask, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1518\u001b[0m in \u001b[92m_wrapped_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1515 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1516 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl(*args, **kwargs) \u001b[2m# type: ignore[misc]\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1517 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1518 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._call_impl(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1519 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1520 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_call_impl\u001b[0m(\u001b[96mself\u001b[0m, *args, **kwargs): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1521 \u001b[0m\u001b[2m│ │ \u001b[0mforward_call = (\u001b[96mself\u001b[0m._slow_forward \u001b[94mif\u001b[0m torch._C._get_tracing_state() \u001b[94melse\u001b[0m \u001b[96mself\u001b[0m.fo \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1527\u001b[0m in \u001b[92m_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1524 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m (\u001b[96mself\u001b[0m._backward_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._backward_pre_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._forward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1525 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_backward_pre_hooks \u001b[95mor\u001b[0m _global_backward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1526 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_forward_hooks \u001b[95mor\u001b[0m _global_forward_pre_hooks): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1527 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m forward_call(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1528 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1529 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1530 \u001b[0m\u001b[2m│ │ │ \u001b[0mresult = \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py\u001b[0m:\u001b[94m899\u001b[0m in \u001b[92mforward\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 896 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mencoder_attention_mask, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 897 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 898 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 899 \u001b[2m│ │ │ │ \u001b[0moutputs = block( \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 900 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mhidden_states, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 901 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mlayer_past=layer_past, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 902 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mattention_mask=attention_mask, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1518\u001b[0m in \u001b[92m_wrapped_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1515 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1516 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl(*args, **kwargs) \u001b[2m# type: ignore[misc]\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1517 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1518 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._call_impl(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1519 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1520 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_call_impl\u001b[0m(\u001b[96mself\u001b[0m, *args, **kwargs): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1521 \u001b[0m\u001b[2m│ │ \u001b[0mforward_call = (\u001b[96mself\u001b[0m._slow_forward \u001b[94mif\u001b[0m torch._C._get_tracing_state() \u001b[94melse\u001b[0m \u001b[96mself\u001b[0m.fo \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1527\u001b[0m in \u001b[92m_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1524 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m (\u001b[96mself\u001b[0m._backward_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._backward_pre_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._forward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1525 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_backward_pre_hooks \u001b[95mor\u001b[0m _global_backward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1526 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_forward_hooks \u001b[95mor\u001b[0m _global_forward_pre_hooks): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1527 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m forward_call(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1528 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1529 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1530 \u001b[0m\u001b[2m│ │ │ \u001b[0mresult = \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py\u001b[0m:\u001b[94m389\u001b[0m in \u001b[92mforward\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 386 \u001b[0m\u001b[2m│ \u001b[0m) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 387 \u001b[0m\u001b[2m│ │ \u001b[0mresidual = hidden_states \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 388 \u001b[0m\u001b[2m│ │ \u001b[0mhidden_states = \u001b[96mself\u001b[0m.ln_1(hidden_states) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 389 \u001b[2m│ │ \u001b[0mattn_outputs = \u001b[96mself\u001b[0m.attn( \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 390 \u001b[0m\u001b[2m│ │ │ \u001b[0mhidden_states, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 391 \u001b[0m\u001b[2m│ │ │ \u001b[0mlayer_past=layer_past, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 392 \u001b[0m\u001b[2m│ │ │ \u001b[0mattention_mask=attention_mask, \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1518\u001b[0m in \u001b[92m_wrapped_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1515 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1516 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl(*args, **kwargs) \u001b[2m# type: ignore[misc]\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1517 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1518 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._call_impl(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1519 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1520 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_call_impl\u001b[0m(\u001b[96mself\u001b[0m, *args, **kwargs): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1521 \u001b[0m\u001b[2m│ │ \u001b[0mforward_call = (\u001b[96mself\u001b[0m._slow_forward \u001b[94mif\u001b[0m torch._C._get_tracing_state() \u001b[94melse\u001b[0m \u001b[96mself\u001b[0m.fo \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1527\u001b[0m in \u001b[92m_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1524 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m (\u001b[96mself\u001b[0m._backward_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._backward_pre_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._forward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1525 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_backward_pre_hooks \u001b[95mor\u001b[0m _global_backward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1526 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_forward_hooks \u001b[95mor\u001b[0m _global_forward_pre_hooks): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1527 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m forward_call(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1528 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1529 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1530 \u001b[0m\u001b[2m│ │ │ \u001b[0mresult = \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py\u001b[0m:\u001b[94m330\u001b[0m in \u001b[92mforward\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 327 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.reorder_and_upcast_attn: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 328 \u001b[0m\u001b[2m│ │ │ \u001b[0mattn_output, attn_weights = \u001b[96mself\u001b[0m._upcast_and_reordered_attn(query, key, valu \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 329 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 330 \u001b[2m│ │ │ \u001b[0mattn_output, attn_weights = \u001b[96mself\u001b[0m._attn(query, key, value, attention_mask, he \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 331 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 332 \u001b[0m\u001b[2m│ │ \u001b[0mattn_output = \u001b[96mself\u001b[0m._merge_heads(attn_output, \u001b[96mself\u001b[0m.num_heads, \u001b[96mself\u001b[0m.head_dim) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 333 \u001b[0m\u001b[2m│ │ \u001b[0mattn_output = \u001b[96mself\u001b[0m.c_proj(attn_output) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\transformers\\models\\gpt2\\modeling_gpt2.py\u001b[0m:\u001b[94m211\u001b[0m in \u001b[92m_attn\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 208 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 209 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op oth\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 210 \u001b[0m\u001b[2m│ │ \u001b[0mattn_weights = attn_weights.type(value.dtype) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 211 \u001b[2m│ │ \u001b[0mattn_weights = \u001b[96mself\u001b[0m.attn_dropout(attn_weights) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 212 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 213 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# Mask heads if we want to\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 214 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m head_mask \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1518\u001b[0m in \u001b[92m_wrapped_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1515 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1516 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._compiled_call_impl(*args, **kwargs) \u001b[2m# type: ignore[misc]\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1517 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1518 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._call_impl(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1519 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1520 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m_call_impl\u001b[0m(\u001b[96mself\u001b[0m, *args, **kwargs): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1521 \u001b[0m\u001b[2m│ │ \u001b[0mforward_call = (\u001b[96mself\u001b[0m._slow_forward \u001b[94mif\u001b[0m torch._C._get_tracing_state() \u001b[94melse\u001b[0m \u001b[96mself\u001b[0m.fo \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m:\u001b[94m1527\u001b[0m in \u001b[92m_call_impl\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1524 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m (\u001b[96mself\u001b[0m._backward_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._backward_pre_hooks \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m._forward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1525 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_backward_pre_hooks \u001b[95mor\u001b[0m _global_backward_hooks \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1526 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[95mor\u001b[0m _global_forward_hooks \u001b[95mor\u001b[0m _global_forward_pre_hooks): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1527 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m forward_call(*args, **kwargs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1528 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1529 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1530 \u001b[0m\u001b[2m│ │ │ \u001b[0mresult = \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\modules\\dropout.py\u001b[0m:\u001b[94m58\u001b[0m in \u001b[92mforward\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 55 \u001b[0m\u001b[2;33m│ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 56 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 57 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mforward\u001b[0m(\u001b[96mself\u001b[0m, \u001b[96minput\u001b[0m: Tensor) -> Tensor: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 58 \u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m F.dropout(\u001b[96minput\u001b[0m, \u001b[96mself\u001b[0m.p, \u001b[96mself\u001b[0m.training, \u001b[96mself\u001b[0m.inplace) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 59 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 60 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 61 \u001b[0m\u001b[94mclass\u001b[0m \u001b[4;92mDropout1d\u001b[0m(_DropoutNd): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[33me:\\anaconda\\envs\\dl2\\lib\\site-packages\\torch\\nn\\functional.py\u001b[0m:\u001b[94m1266\u001b[0m in \u001b[92mdropout\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1263 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m handle_torch_function(dropout, (\u001b[96minput\u001b[0m,), \u001b[96minput\u001b[0m, p=p, training=training, i \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1264 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mif\u001b[0m p < \u001b[94m0.0\u001b[0m \u001b[95mor\u001b[0m p > \u001b[94m1.0\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1265 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mValueError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mdropout probability has to be between 0 and 1, but got \u001b[0m\u001b[33m{\u001b[0mp\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1266 \u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m _VF.dropout_(\u001b[96minput\u001b[0m, p, training) \u001b[94mif\u001b[0m inplace \u001b[94melse\u001b[0m _VF.dropout(\u001b[96minput\u001b[0m, p, traini \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1267 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1268 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m1269 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92malpha_dropout\u001b[0m(\u001b[96minput\u001b[0m: Tensor, p: \u001b[96mfloat\u001b[0m = \u001b[94m0.5\u001b[0m, training: \u001b[96mbool\u001b[0m = \u001b[94mFalse\u001b[0m, inplace: \u001b[96mbool\u001b[0m = \u001b[31m│\u001b[0m\n",
+ "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+ "\u001b[1;91mKeyboardInterrupt\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
diff --git a/docs/chapter2/code/env.yml b/docs/chapter2/code/env.yml
new file mode 100644
index 0000000..26f5ba4
--- /dev/null
+++ b/docs/chapter2/code/env.yml
@@ -0,0 +1,180 @@
+name: distill
+channels:
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - ca-certificates=2024.11.26=h06a4308_0
+ - ld_impl_linux-64=2.40=h12ee557_0
+ - libffi=3.4.4=h6a678d5_1
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - ncurses=6.4=h6a678d5_0
+ - openssl=3.0.15=h5eee18b_0
+ - pip=24.2=py39h06a4308_0
+ - python=3.9.20=he870216_1
+ - readline=8.2=h5eee18b_0
+ - setuptools=75.1.0=py39h06a4308_0
+ - sqlite=3.45.3=h5eee18b_0
+ - tk=8.6.14=h39e8969_0
+ - wheel=0.44.0=py39h06a4308_0
+ - xz=5.4.6=h5eee18b_1
+ - zlib=1.2.13=h5eee18b_1
+ - pip:
+ - accelerate==1.2.0
+ - aiohappyeyeballs==2.4.4
+ - aiohttp==3.11.11
+ - aiosignal==1.3.2
+ - annotated-types==0.7.0
+ - anyio==4.7.0
+ - argon2-cffi==23.1.0
+ - argon2-cffi-bindings==21.2.0
+ - arrow==1.3.0
+ - asttokens==3.0.0
+ - async-lru==2.0.4
+ - async-timeout==5.0.1
+ - attrs==24.2.0
+ - babel==2.16.0
+ - beautifulsoup4==4.12.3
+ - bleach==6.2.0
+ - certifi==2024.8.30
+ - cffi==1.17.1
+ - charset-normalizer==3.4.0
+ - click==8.1.7
+ - comm==0.2.2
+ - datasets==3.2.0
+ - debugpy==1.8.9
+ - decorator==5.1.1
+ - defusedxml==0.7.1
+ - dill==0.3.8
+ - docker-pycreds==0.4.0
+ - eval-type-backport==0.2.0
+ - exceptiongroup==1.2.2
+ - executing==2.1.0
+ - fastjsonschema==2.21.1
+ - filelock==3.16.1
+ - fqdn==1.5.1
+ - frozenlist==1.5.0
+ - fsspec==2024.9.0
+ - gitdb==4.0.11
+ - gitpython==3.1.43
+ - h11==0.14.0
+ - httpcore==1.0.7
+ - httpx==0.28.1
+ - huggingface-hub==0.26.5
+ - idna==3.10
+ - importlib-metadata==8.5.0
+ - ipykernel==6.29.5
+ - ipython==8.18.1
+ - ipywidgets==8.1.5
+ - isoduration==20.11.0
+ - jedi==0.19.2
+ - jinja2==3.1.4
+ - json5==0.10.0
+ - jsonpointer==3.0.0
+ - jsonschema==4.23.0
+ - jsonschema-specifications==2024.10.1
+ - jupyter==1.1.1
+ - jupyter-client==8.6.3
+ - jupyter-console==6.6.3
+ - jupyter-core==5.7.2
+ - jupyter-events==0.10.0
+ - jupyter-lsp==2.2.5
+ - jupyter-server==2.14.2
+ - jupyter-server-terminals==0.5.3
+ - jupyterlab==4.3.3
+ - jupyterlab-pygments==0.3.0
+ - jupyterlab-server==2.27.3
+ - jupyterlab-widgets==3.0.13
+ - markupsafe==3.0.2
+ - matplotlib-inline==0.1.7
+ - mistune==3.0.2
+ - mpmath==1.3.0
+ - multidict==6.1.0
+ - multiprocess==0.70.16
+ - nbclient==0.10.1
+ - nbconvert==7.16.4
+ - nbformat==5.10.4
+ - nest-asyncio==1.6.0
+ - networkx==3.2.1
+ - notebook==7.3.1
+ - notebook-shim==0.2.4
+ - numpy==2.0.2
+ - nvidia-cublas-cu12==12.4.5.8
+ - nvidia-cuda-cupti-cu12==12.4.127
+ - nvidia-cuda-nvrtc-cu12==12.4.127
+ - nvidia-cuda-runtime-cu12==12.4.127
+ - nvidia-cudnn-cu12==9.1.0.70
+ - nvidia-cufft-cu12==11.2.1.3
+ - nvidia-curand-cu12==10.3.5.147
+ - nvidia-cusolver-cu12==11.6.1.9
+ - nvidia-cusparse-cu12==12.3.1.170
+ - nvidia-nccl-cu12==2.21.5
+ - nvidia-nvjitlink-cu12==12.4.127
+ - nvidia-nvtx-cu12==12.4.127
+ - overrides==7.7.0
+ - packaging==24.2
+ - pandas==2.2.3
+ - pandocfilters==1.5.1
+ - parso==0.8.4
+ - pathlib==1.0.1
+ - pexpect==4.9.0
+ - platformdirs==4.3.6
+ - prometheus-client==0.21.1
+ - prompt-toolkit==3.0.48
+ - propcache==0.2.1
+ - protobuf==5.29.1
+ - psutil==6.1.0
+ - ptyprocess==0.7.0
+ - pure-eval==0.2.3
+ - pyarrow==18.1.0
+ - pycparser==2.22
+ - pydantic==2.10.3
+ - pydantic-core==2.27.1
+ - pygments==2.18.0
+ - python-dateutil==2.9.0.post0
+ - python-json-logger==3.2.0
+ - pytz==2024.2
+ - pyyaml==6.0.2
+ - pyzmq==26.2.0
+ - referencing==0.35.1
+ - regex==2024.11.6
+ - requests==2.32.3
+ - rfc3339-validator==0.1.4
+ - rfc3986-validator==0.1.1
+ - rpds-py==0.22.3
+ - safetensors==0.4.5
+ - send2trash==1.8.3
+ - sentry-sdk==2.19.2
+ - setproctitle==1.3.4
+ - six==1.17.0
+ - smmap==5.0.1
+ - sniffio==1.3.1
+ - soupsieve==2.6
+ - stack-data==0.6.3
+ - sympy==1.13.1
+ - terminado==0.18.1
+ - tinycss2==1.4.0
+ - tokenizers==0.13.3
+ - tomli==2.2.1
+ - torch==2.5.1
+ - tornado==6.4.2
+ - tqdm==4.67.1
+ - traitlets==5.14.3
+ - transformers==4.28.1
+ - triton==3.1.0
+ - types-python-dateutil==2.9.0.20241206
+ - typing-extensions==4.12.2
+ - tzdata==2024.2
+ - uri-template==1.3.0
+ - urllib3==2.2.3
+ - wcwidth==0.2.13
+ - webcolors==24.11.1
+ - webencodings==0.5.1
+ - websocket-client==1.8.0
+ - widgetsnbextension==4.0.13
+ - xxhash==3.5.0
+ - yarl==1.18.3
+ - zipp==3.21.0
+prefix: /home/PJLAB/gaoyufei/anaconda3/envs/babyllama
diff --git a/docs/chapter2/code/requirements.txt b/docs/chapter2/code/requirements.txt
deleted file mode 100644
index 0bc857d..0000000
--- a/docs/chapter2/code/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-datasets==2.14.5
-torch==2.1.2
\ No newline at end of file
diff --git a/docs/chapter2/models/models_download.sh b/docs/chapter2/models/models_download.sh
index ed3de8a..1bb56da 100644
--- a/docs/chapter2/models/models_download.sh
+++ b/docs/chapter2/models/models_download.sh
@@ -2,5 +2,4 @@
# GPT-2
-huggingface-cli download --resume-download openai-community/gpt2 --local-dir docs/chapter2/models/GPT-2
-
+huggingface-cli download --resume-download openai-community/gpt2 --local-dir docs/chapter2/models/GPT-2
\ No newline at end of file