"""
Prompt strategy for finetuning Llama2 chat models.
See https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for a reference implementation.

This implementation is based on the Vicuna PR and the fastchat repo, see also:
https://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847

Use dataset type "llama2_chat" in your config.yml to use this prompt style.

E.g. in the config.yml:
```
datasets:
  - path: llama_finetune_train.jsonl
    type: llama2_chat
```

The dataset itself should be a jsonl file in which each line looks like this:
```
{"conversations": [{"from": "human", "value": "Who are you?"}, {"from": "gpt", "value": "I am Vicuna"}, ...]}
```
The first message should be from the human, the second from gpt.
For a custom system message, the first "from" can be "system" (followed by alternating "human" and "gpt" turns).
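For example, a line with a custom system message could look like this (illustrative values):
```
{"conversations": [{"from": "system", "value": "You are a pirate."}, {"from": "human", "value": "Who are you?"}, {"from": "gpt", "value": "Arr, I am Vicuna."}]}
```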

Important: Don't use "special_tokens:" in your config.yml unless you are sure what you are doing!
"""

import logging
from dataclasses import dataclass, field
from typing import Generator, List, Sequence

from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import IGNORE_TOKEN_ID


@dataclass
class Llama2ChatConversation:
    """A class that manages prompt templates and keeps all conversation history.
    copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py"""

    name: str = "llama2"
    # The system prompt
    system: str = (
        "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n"
    )
    roles: Sequence[str] = ("[INST]", "[/INST]")
    messages: List[List[str]] = field(default_factory=list)
    offset: int = 0
    sep = " "  # appended after follow-up user messages
    sep2 = " </s><s>"  # closes each assistant turn and starts the next one
    stop_token_ids = [2]  # llama </s> token id

    def get_prompt(self) -> str:
        """Get the prompt for generation."""
        seps = [self.sep, self.sep2]
        ret = ""
        for i, (role, message) in enumerate(self.messages):
            if (i == len(self.messages) - 1) and (role == self.roles[0]):
                # last message is from user (due to length),
                # return prompt without it for training
                return ret
            if i == 0:
                # the system prompt already starts with "[INST] <<SYS>>...",
                # so the first user message is appended to it directly
                ret += self.system + message.strip()
            else:
                ret += role + " " + message.strip() + seps[i % 2]
        return ret

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

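# Illustrative sketch (not executed, made-up values): how a short conversation
# renders into a single training prompt via Llama2ChatConversation.
#
#   conv = Llama2ChatConversation()
#   conv.append_message("[INST]", "Who are you?")
#   conv.append_message("[/INST]", "I am Vicuna")
#   conv.get_prompt()
#   # -> "[INST] <<SYS>>\n<default system prompt>\n<</SYS>>\n\n"
#   #    "Who are you?[/INST] I am Vicuna </s><s>"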

class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for Llama2 chat prompts (ShareGPT-style conversations).
    Adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # note: the sequence length is hardcoded to 4096 here, regardless of the
        # configured sequence_len
        self.sequence_len = 4096
        self.tokenizer.add_special_tokens({"pad_token": "<pad>"})
        # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/added_tokens.json

    def tokenize_prompt(self, prompt):
        conv = next(self.prompter.build_prompt(prompt))
        conversation_str = conv.get_prompt()

        # Tokenize conversations
        input_ids = self.tokenizer(
            conversation_str,
            return_tensors="pt",
            padding="max_length",
            max_length=self.sequence_len,
            truncation=True,
        ).input_ids[0]
        target = input_ids.clone()

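        # Rough picture of the masking below (illustrative, not taken from real data):
        #   tokens: <s> [INST] <<SYS>> ... <</SYS>> question [/INST] answer </s><s> <pad> ...
        #   labels: only the assistant answers are kept; user/system tokens and padding
        #   become IGNORE_TOKEN_ID.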
        # Mask targets. Only compute loss on the assistant outputs.
        sep = conv.roles[1]  # "[/INST]" separates instruction and response within a turn

        # number of non-padding tokens
        total_len = int(target.ne(self.tokenizer.pad_token_id).sum())

        # each assistant turn ends with sep2 (" </s><s>"), so splitting yields one chunk per turn
        turns = conversation_str.split(conv.sep2)
        cur_len = 1
        target[:cur_len] = IGNORE_TOKEN_ID  # mask the BOS token
        for turn in turns:
            if turn == "":
                break
            turn_len = len(self.tokenizer(turn).input_ids)

            parts = turn.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep
            # "-1" is hardcoded for the LLaMA tokenizer to make the offset correct.
            instruction_len = len(self.tokenizer(parts[0]).input_ids) - 1

            # Ignore the user instructions
            target[cur_len - 1 : cur_len + instruction_len] = IGNORE_TOKEN_ID
            cur_len += turn_len + 2  # due to length of role token

        # mask everything after the last processed turn (padding / truncated remainder)
        target[cur_len:] = IGNORE_TOKEN_ID

        # sanity check: if the reconstructed length does not match, mask the whole sample
        if cur_len < self.sequence_len:
            if cur_len != total_len:
                target[:] = IGNORE_TOKEN_ID
                logging.warning(
                    f"tokenization mismatch: {cur_len} vs. {total_len}. (ignored)"
                )

        attention_mask = input_ids.ne(self.tokenizer.pad_token_id).tolist()
        input_ids = input_ids.tolist()
        target = target.tolist()
        # fix for the tokenizer, which encodes "[" as a different token id (29961
        # instead of 518) after the eos tokens; remap it to follow the original
        # llama implementation
        for i in range(2, total_len - 2):
            if input_ids[i] == 29961:
                input_ids[i] = 518
            if target[i] == 29961:
                target[i] = 518
        return {
            "input_ids": input_ids,
            "labels": target,
            "attention_mask": attention_mask,
        }


class Llama2ChatPrompter:  # pylint: disable=too-few-public-methods
    """
    A prompter that generates prompts for Llama2 models.
    """

    system_prompt = (
        "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n"
    )

    def build_prompt(self, source) -> Generator[Llama2ChatConversation, None, None]:
        # see https://github.com/lm-sys/FastChat/blob/da0641e567cf93756b0978ab5a6b092e96f06240/fastchat/train/train.py#L78
        source = source["conversations"]  # unwrap the "conversations" list from the dataset row

        # if a system prompt is provided, use it instead of the default one
        if source[0]["from"] == "system":
            system = f"[INST] <<SYS>>\n{source[0]['value']}\n<</SYS>>\n\n"
            source = source[1:]
        else:
            system = self.system_prompt

        conv = Llama2ChatConversation(system=system)

        if len(source) < 2:
            # if there isn't a back-and-forth conversation, ignore it
            # (this can also happen when data splitting leaves empty conversations)
            raise IndexError

        roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

        if roles[source[0]["from"]] != conv.roles[0]:
            # skip the first message if it is not from the human
            source = source[1:]

        conv.messages = []  # pylint: disable=R0801
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], "conversation roles must alternate human/gpt"
            if sentence["value"]:
                conv.append_message(role, sentence["value"])
        yield conv

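# Illustrative sketch (not executed, made-up values): build_prompt on a row that
# carries a custom system turn.
#
#   prompter = Llama2ChatPrompter()
#   conv = next(prompter.build_prompt({"conversations": [
#       {"from": "system", "value": "You are a pirate."},
#       {"from": "human", "value": "Who are you?"},
#       {"from": "gpt", "value": "Arr, I am Vicuna."},
#   ]}))
#   conv.system  # -> "[INST] <<SYS>>\nYou are a pirate.\n<</SYS>>\n\n"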

def load(tokenizer, cfg) -> LLama2ChatTokenizingStrategy:
    """Entry point used by axolotl for the "llama2_chat" dataset type."""
    return LLama2ChatTokenizingStrategy(
        Llama2ChatPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
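

# Minimal usage sketch (assumptions: an axolotl-style cfg with train_on_inputs and
# sequence_len attributes, and a Llama-2 tokenizer; names below are illustrative):
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
#   strategy = load(tokenizer, cfg)
#   sample = {"conversations": [
#       {"from": "human", "value": "Who are you?"},
#       {"from": "gpt", "value": "I am Vicuna"},
#   ]}
#   tokenized = strategy.tokenize_prompt(sample)
#   # tokenized["input_ids"], tokenized["labels"] and tokenized["attention_mask"]
#   # are lists padded/truncated to 4096 tokens; labels are IGNORE_TOKEN_ID
#   # everywhere except the assistant outputs.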