
Commit aec85b9

feat: support r1 distill llama. (#161)
1 parent caea6eb commit aec85b9

9 files changed: +135 -99 lines

README.md

+13-21
@@ -4,7 +4,9 @@
 
 [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/b4rtaz/distributed-llama/.github%2Fworkflows%2Fmain.yml?style=flat-square)](https://github.com/b4rtaz/distributed-llama/actions) [![License: MIT](https://img.shields.io/github/license/mashape/apistatus.svg?style=flat-square)](/LICENSE) [![Support this project](https://img.shields.io/github/sponsors/b4rtaz?style=flat-square&label=support%20this%20project&color=green)](https://github.com/sponsors/b4rtaz) [![Discord](https://discordapp.com/api/guilds/1245814812353495070/widget.png?style=shield)](https://discord.com/widget?id=1245814812353495070&theme=dark)
 
-Tensor parallelism is all you need. Run LLMs on weak devices or make powerful devices even more powerful by distributing the workload and dividing the RAM usage. This project proves that it's possible split the workload of LLMs across multiple devices and achieve a significant speedup. Distributed Llama allows you to run huge LLMs in-house. The project uses TCP sockets to synchronize the state. You can easily configure your AI cluster by using a home router.
+Connect home devices into a powerful cluster to accelerate LLM inference. More devices mean faster performance, leveraging tensor parallelism and high-speed synchronization over Ethernet.
+
+Supports Linux, macOS, and Windows. Optimized for ARM and x86_64 AVX2 CPUs.
 
 **News**
 - 12 Feb 2025 - 🚧 Merged the [fundamental codebase refactor](https://github.com/b4rtaz/distributed-llama/releases/tag/v0.12.0)
@@ -16,36 +18,26 @@ Tensor parallelism is all you need. Run LLMs on weak devices or make powerful de
 
 Python 3 and C++ compiler required. The command will download the model and the tokenizer.
 
-| Model                       | Purpose   | Size    | Command                                       |
-| --------------------------- | --------- | ------- | --------------------------------------------- |
-| Llama 3.1 8B Instruct Q40   | Chat, API | 6.32 GB | `python launch.py llama3_1_8b_instruct_q40`   |
-| Llama 3.1 405B Instruct Q40 | Chat, API | 238 GB  | `python launch.py llama3_1_405b_instruct_q40` |
-| Llama 3.2 1B Instruct Q40   | Chat, API | 1.7 GB  | `python launch.py llama3_2_1b_instruct_q40`   |
-| Llama 3.2 3B Instruct Q40   | Chat, API | 3.4 GB  | `python launch.py llama3_2_3b_instruct_q40`   |
-| Llama 3.3 70B Instruct Q40  | Chat, API | 40 GB   | `python launch.py llama3_3_70b_instruct_q40`  |
+| Model                            | Purpose   | Size    | Command                                             |
+| -------------------------------- | --------- | ------- | --------------------------------------------------- |
+| Llama 3.1 8B Instruct Q40        | Chat, API | 6.32 GB | `python launch.py llama3_1_8b_instruct_q40`         |
+| Llama 3.1 405B Instruct Q40      | Chat, API | 238 GB  | `python launch.py llama3_1_405b_instruct_q40`       |
+| Llama 3.2 1B Instruct Q40        | Chat, API | 1.7 GB  | `python launch.py llama3_2_1b_instruct_q40`         |
+| Llama 3.2 3B Instruct Q40        | Chat, API | 3.4 GB  | `python launch.py llama3_2_3b_instruct_q40`         |
+| Llama 3.3 70B Instruct Q40       | Chat, API | 40 GB   | `python launch.py llama3_3_70b_instruct_q40`        |
+| DeepSeek R1 Distill Llama 8B Q40 | Chat, API | 6.32 GB | `python launch.py deepseek_r1_distill_llama_8b_q40` |
 
 ### 🛠️ Convert Model Manually
 
-Supported architectures: Llama, Mixtral
+Supported architectures: Llama
 
-* [How to Convert Llama 2, Llama 3, Llama 3.1](./docs/LLAMA.md)
+* [How to Convert Llama 3.1](./docs/LLAMA.md)
 * [How to Convert Hugging Face Model](./docs/HUGGINGFACE.md)
 
 ### 🚧 Known Limitations
 
 * You can run Distributed Llama only on 1, 2, 4... 2^n nodes.
 * The maximum number of nodes is equal to the number of KV heads in the model [#70](https://github.com/b4rtaz/distributed-llama/issues/70).
-* CPU support only, GPU support is planned, optimized for (weights format × buffer format):
-  * ARM CPUs
-    * ✅ F32 × F32
-    * ❌ F16 × F32
-    * ✅ Q40 × F32
-    * ✅ Q40 × Q80
-  * x86_64 AVX2 CPUs
-    * ✅ F32 × F32
-    * ❌ F16 × F32
-    * ✅ Q40 × F32
-    * ✅ Q40 × Q80
 
 ### 👷 Architecture

converter/convert-hf.py

-2
@@ -7,7 +7,6 @@
 
 class ArchType:
     LLAMA = 0xABCD00
-    MIXTRAL = 0xABCD02
 
 def permute(tensor, nHeads: int, nKvHeads: int):
     if nHeads != nKvHeads:
@@ -128,7 +127,6 @@ def parseArchType(type: str):
     archType = {
         'llama': ArchType.LLAMA,
         'mistral': ArchType.LLAMA,
-        'mixtral': ArchType.MIXTRAL,
     }.get(type)
     if (archType is None):
         raise Exception(f'Unsupported arch type: {type}')

converter/convert-tokenizer-hf.py

+39-19
@@ -2,12 +2,26 @@
 import json
 import os
 from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizerFast
 writer = __import__('tokenizer-writer')
 
 def openJson(path):
     with open(path, 'r', encoding='utf-8') as file:
         return json.load(file)
 
+def unicodeToBytes():
+    # https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
+    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    cs = bs[:]
+    n = 0
+    for b in range(2 ** 8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2 ** 8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(cs, bs))
+
 class TokensResolver:
     def __init__(self, dirPath, tokenizerConfig):
         self.dirPath = dirPath
@@ -18,25 +32,28 @@ def __init__(self, dirPath, tokenizerConfig):
         self.scores = []
 
     def resolvePreTrainedTokenizerFast(self):
-        tokenizer = openJson(os.path.join(self.dirPath, 'tokenizer.json'))
-        assert(tokenizer['model']['type'] == 'BPE')
-
-        i = 0
-        for token in tokenizer['model']['vocab'].keys():
-            assert(tokenizer['model']['vocab'][token] == i)
-            self.tokens.append(token.encode('utf8'))
+        utb = unicodeToBytes()
+        tokenizer = PreTrainedTokenizerFast(tokenizer_file = os.path.join(self.dirPath, 'tokenizer.json'))
+        vocabLen = len(tokenizer.get_vocab())
+        for i in range(vocabLen):
+            tokenChars = list(tokenizer.convert_ids_to_tokens([i])[0])
+            tokenBytes = []
+            for chr in tokenChars:
+                if (chr in utb):
+                    tokenBytes.append(utb[chr])
+                else:
+                    tokenBytes += list(chr.encode('utf-8'))
+            self.tokens.append(bytes(tokenBytes))
             self.scores.append(-float(i))
-            i += 1
-        if ('added_tokens' in tokenizer):
-            for at in tokenizer['added_tokens']:
-                assert(at['id'] == i)
-                self.tokens.append(at['content'].encode('utf8'))
-                self.scores.append(-float(i))
-                if (at['content'] == self.tokenizerConfig['bos_token']):
-                    self.bosId = i
-                if (at['content'] == self.tokenizerConfig['eos_token']):
-                    self.eosId = i
-                i += 1
+
+        self.bosId = tokenizer.bos_token_id
+        self.eosId = tokenizer.eos_token_id
+        if (self.bosId is None or self.eosId is None):
+            config = openJson(os.path.join(self.dirPath, 'config.json'))
+            if (self.bosId is None):
+                self.bosId = config['bos_token_id']
+            if (self.eosId is None):
+                self.eosId = config['eos_token_id']
 
     def resolveLlamaTokenizer(self):
         modelPath = os.path.join(self.dirPath, 'tokenizer.model')
@@ -57,12 +74,13 @@ def resolveLlamaTokenizer(self):
 
     def resolve(self):
         cls = self.tokenizerConfig['tokenizer_class']
-        if (cls == 'PreTrainedTokenizerFast'):
+        if (cls == 'PreTrainedTokenizerFast' or cls == 'LlamaTokenizerFast'):
            return self.resolvePreTrainedTokenizerFast()
         if (cls == 'LlamaTokenizer'):
            return self.resolveLlamaTokenizer()
         raise Exception(f'Tokenizer {cls} is not supported')
 
+
 def printUsage():
     print('Usage: python convert-tokenizer-hf.py <tokenizerFolderPath> <name>')
     print()
@@ -82,6 +100,8 @@ def printUsage():
 resolver = TokensResolver(dirPath, tokenizerConfig)
 resolver.resolve()
 
+if (resolver.bosId is None or resolver.eosId is None):
+    raise Exception('Cannot resolve bosId or eosId')
 print(f'bosId: {resolver.bosId} ({resolver.tokens[resolver.bosId]})')
 print(f'eosId: {resolver.eosId} ({resolver.tokens[resolver.eosId]})')
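
Note on the conversion change above: `resolvePreTrainedTokenizerFast` no longer reads the raw `vocab` map from `tokenizer.json`; it asks `PreTrainedTokenizerFast` for each token string and maps the byte-level BPE stand-in characters back to raw bytes via `unicodeToBytes()`, the inverse of GPT-2's `bytes_to_unicode` table. A minimal, self-contained sketch of that reversal (the token string is a made-up example, not taken from a real tokenizer file):

```python
# Sketch of the byte-level BPE reversal used by resolvePreTrainedTokenizerFast above.
# The token value below is illustrative only.

def unicodeToBytes():
    # Inverse of GPT-2's bytes_to_unicode: printable stand-in character -> original byte
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    return dict(zip([chr(c) for c in cs], bs))

utb = unicodeToBytes()
token = "ĠHello"  # byte-level BPE renders a leading space as 'Ġ' (U+0120)
tokenBytes = []
for ch in token:
    if ch in utb:
        tokenBytes.append(utb[ch])              # stand-in character -> original byte
    else:
        tokenBytes += list(ch.encode('utf-8'))  # anything unmapped passes through as UTF-8
print(bytes(tokenBytes))  # b' Hello'
```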

launch.py

+5
@@ -38,6 +38,11 @@ def parts(length):
         'https://huggingface.co/b4rtaz/Llama-3_3-70B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama-3.3-70b.t?download=true',
         'q40', 'q80', 'chat', '--max-seq-len 4096'
     ],
+    'deepseek_r1_distill_llama_8b_q40': [
+        ['https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_model_deepseek-r1-distill-llama-8b_q40.m?download=true'],
+        'https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_tokenizer_deepseek-r1-distill-llama-8b.t?download=true',
+        'q40', 'q80', 'chat', '--max-seq-len 4096'
+    ],
 }
 
 def confirm(message: str):
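
Reading the new registry entry, the list fields appear to be, in order: the model part URL(s), the tokenizer URL, the weights float type, the buffer float type, the default mode, and extra CLI arguments. The sketch below only illustrates that reading; the field names and the `describe` helper are not part of launch.py.

```python
# Illustrative reading of the registry entry added above; the real launch.py
# logic may unpack these fields differently.
DEEPSEEK_ENTRY = [
    ['https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_model_deepseek-r1-distill-llama-8b_q40.m?download=true'],
    'https://huggingface.co/b4rtaz/DeepSeek-R1-Distill-Llama-8B-Distributed-Llama/resolve/main/dllama_tokenizer_deepseek-r1-distill-llama-8b.t?download=true',
    'q40', 'q80', 'chat', '--max-seq-len 4096'
]

def describe(entry):
    modelUrls, tokenizerUrl, weightsType, bufferType, mode, extraArgs = entry
    print(f'{len(modelUrls)} model part(s), weights={weightsType}, buffer={bufferType}, mode={mode}, args={extraArgs}')

describe(DEEPSEEK_ENTRY)  # 1 model part(s), weights=q40, buffer=q80, mode=chat, args=--max-seq-len 4096
```

Per the README table above, the user-facing entry point is unchanged: `python launch.py deepseek_r1_distill_llama_8b_q40`.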

src/app.cpp

+1-2
@@ -14,8 +14,7 @@ static NnFloatType parseFloatType(char *val) {
 static ChatTemplateType parseChatTemplateType(char *val) {
     if (std::strcmp(val, "llama2") == 0) return TEMPLATE_LLAMA2;
     if (std::strcmp(val, "llama3") == 0) return TEMPLATE_LLAMA3;
-    if (std::strcmp(val, "zephyr") == 0) return TEMPLATE_ZEPHYR;
-    if (std::strcmp(val, "chatml") == 0) return TEMPLATE_CHATML;
+    if (std::strcmp(val, "deepSeek3") == 0) return TEMPLATE_DEEP_SEEK3;
     throw std::runtime_error("Invalid chat template type: " + std::string(val));
 }

src/dllama-api.cpp

+11-11
@@ -345,14 +345,13 @@ class ApiServer {
             inputItems[i].message = deltaPrompt[i].content;
         }
 
-        std::string inputPrompt = chatTemplate->generate(nInputItems, inputItems, true);
-        printf("🔹%s🔸", inputPrompt.c_str());
+        GeneratedChat inputPrompt = chatTemplate->generate(nInputItems, inputItems, true);
+        printf("🔹%s🔸", inputPrompt.content);
 
-        size_t promptLength = inputPrompt.size();
         int nPromptTokens;
-        std::unique_ptr<int[]> promptTokensPtr(new int[promptLength + 2]);
+        std::unique_ptr<int[]> promptTokensPtr(new int[inputPrompt.length + 2]);
         int *promptTokens = promptTokensPtr.get();
-        tokenizer->encode((char*)inputPrompt.c_str(), promptTokens, &nPromptTokens, true, true);
+        tokenizer->encode((char*)inputPrompt.content, promptTokens, &nPromptTokens, true, true);
 
         pos_t promptEndPos = startPos + nPromptTokens - 1;
         if (promptEndPos > header->seqLen)
@@ -366,13 +365,16 @@ class ApiServer {
             naiveCache.push(NaiveCacheItem(promptEndPos, deltaPrompt[j]));
         }
 
-        if (params.stream) {
+        std::string buffer;
+
+        if (params.stream)
             request.writeStreamStartChunk();
+        if (inputPrompt.publicPrompt != nullptr) {
+            if (params.stream)
+                writeChatCompletionChunk(request, inputPrompt.publicPrompt, false);
+            buffer += inputPrompt.publicPrompt;
         }
 
-        std::string buffer;
-        size_t nStops = params.stop.size();
-
         NnSize pos = startPos;
         int token;
         for (NnSize i = 0; ;) {
@@ -400,8 +402,6 @@ class ApiServer {
         tokenizer->resetDecoder();
 
         for (; pos < maxPredPos;) {
-            int prevToken = token;
-
             inference->setPosition(pos);
             inference->setToken(0, token);
             inference->forward();

src/dllama.cpp

+6-4
@@ -142,12 +142,12 @@ static void chat(AppInferenceContext *context) {
 
         deltaItems.push_back(ChatItem{"user", prompt});
 
-        std::string inputPrompt = chatTemplate.generate(deltaItems.size(), deltaItems.data(), true);
-        std::unique_ptr<int[]> inputTokensPtr(new int[inputPrompt.size() + 2]);
+        GeneratedChat inputPrompt = chatTemplate.generate(deltaItems.size(), deltaItems.data(), true);
+        std::unique_ptr<int[]> inputTokensPtr(new int[inputPrompt.length + 2]);
         int *inputTokens = inputTokensPtr.get();
 
         bool addBos = pos == 0;
-        context->tokenizer->encode((char*)inputPrompt.c_str(), inputTokens, &nInputTokens, addBos, true);
+        context->tokenizer->encode((char*)inputPrompt.content, inputTokens, &nInputTokens, addBos, true);
 
         NnSize userPromptEndPos = (NnSize)std::min<unsigned int>(seqLen, pos + nInputTokens - 1);
         for (NnSize i = 0; ;) {
@@ -174,7 +174,9 @@ static void chat(AppInferenceContext *context) {
         context->tokenizer->resetDecoder();
 
         printf("\n🤖 Assistant\n");
-        std::string answer;
+        if (inputPrompt.publicPrompt != nullptr)
+            printf("%s", inputPrompt.publicPrompt);
+
         while (pos < seqLen) {
             context->inference->setPosition(pos);
             context->inference->setToken(0, token);

src/tokenizer.cpp

+51-37
@@ -471,70 +471,84 @@ TokenizerChatStops::~TokenizerChatStops() {
     delete[] stops;
 }
 
-ChatTemplate::ChatTemplate(const ChatTemplateType type, const char* chatTemplate, const char* eos) {
+static const char *chatTemplateTypeToString(const ChatTemplateType type) {
+    if (type == TEMPLATE_LLAMA2) return "llama2";
+    if (type == TEMPLATE_LLAMA3) return "llama3";
+    if (type == TEMPLATE_DEEP_SEEK3) return "deepSeek3";
+    return "unknown";
+}
+
+ChatTemplate::ChatTemplate(const ChatTemplateType type, const char* chatTemplate, const char* eos)
+    : buffer()
+{
     if (type == TEMPLATE_UNKNOWN) {
         if (chatTemplate == NULL)
             throw std::runtime_error("The tokenizer does not include chat template");
         if (strstr(chatTemplate, "[INST]") != NULL) {
             this->type = TEMPLATE_LLAMA2;
         } else if (strstr(chatTemplate, "<|start_header_id|>") != NULL) {
             this->type = TEMPLATE_LLAMA3;
-        } else if (strstr(chatTemplate, "<|user|>") != NULL) {
-            this->type = TEMPLATE_ZEPHYR;
-        } else if (strstr(chatTemplate, "<|im_start|>") != NULL) {
-            this->type = TEMPLATE_CHATML;
+        } else if (strstr(chatTemplate, "<|Assistant|>") != NULL) {
+            this->type = TEMPLATE_DEEP_SEEK3;
         } else {
-            throw new std::runtime_error("Not supported chat template");
+            throw std::runtime_error("Not supported chat template");
         }
     } else {
        this->type = type;
    }
    this->eos = eos;
 
-    printf("⭐ Chat template: ");
-    if (this->type == TEMPLATE_LLAMA2) {
-        printf("llama2\n");
-    } else if (this->type == TEMPLATE_LLAMA3) {
-        printf("llama3\n");
-    } else if (this->type == TEMPLATE_ZEPHYR) {
-        printf("zephyr\n");
-    } else if (this->type == TEMPLATE_CHATML) {
-        printf("chatml\n");
-    }
+    printf("⭐ Chat template: %s\n", chatTemplateTypeToString(this->type));
 }
 
-std::string ChatTemplate::generate(unsigned int nMessages, ChatItem* items, bool appendGenerationPrompt) {
-    std::ostringstream buffer;
+GeneratedChat ChatTemplate::generate(unsigned int nItems, ChatItem* items, bool appendGenerationPrompt) {
+    buffer.clear();
+
+    size_t publicPromptSize = 0;
+
     if (type == TEMPLATE_LLAMA2) {
         unsigned int i = 0;
-        if (nMessages >= 2 && items[0].role == "system" && items[1].role == "user") {
-            buffer << "[INST] <<SYS>>\n" << items[0].message << "\n<</SYS>>\n\n" << items[1].message << " [/INST]" << eos;
+        if (nItems >= 2 && items[0].role == "system" && items[1].role == "user") {
+            buffer += "[INST] <<SYS>>\n" + items[0].message + "\n<</SYS>>\n\n" + items[1].message + " [/INST]" + eos;
             i += 2;
         }
-        for (; i < nMessages; i++) {
+        for (; i < nItems; i++) {
             if (items[i].role == "assistant") {
-                buffer << items[i].message << eos;
+                buffer += items[i].message + eos;
             } else if (items[i].role == "user") {
-                buffer << "[INST] " << items[i].message << " [/INST]" << eos;
+                buffer += "[INST] " + items[i].message + " [/INST]" + eos;
             }
         }
     } else if (type == TEMPLATE_LLAMA3) {
-        for (unsigned int i = 0; i < nMessages; i++)
-            buffer << "<|start_header_id|>" << items[i].role << "<|end_header_id|>\n\n" << items[i].message << eos;
-        if (appendGenerationPrompt)
-            buffer << "<|start_header_id|>assistant<|end_header_id|>\n\n";
-    } else if (type == TEMPLATE_CHATML) {
-        for (unsigned int i = 0; i < nMessages; i++)
-            buffer << "<|im_start|>" << items[i].role << "\n" << items[i].message << "<|im_end|>\n";
-        if (appendGenerationPrompt)
-            buffer << "<|im_start|>assistant\n";
-    } else if (type == TEMPLATE_ZEPHYR) {
-        for (unsigned int i = 0; i < nMessages; i++)
-            buffer << "<|" << items[i].role << "|>\n" << items[i].message << eos << "\n";
+        for (unsigned int i = 0; i < nItems; i++)
+            buffer += "<|start_header_id|>" + items[i].role + "<|end_header_id|>\n\n" + items[i].message + eos;
         if (appendGenerationPrompt)
-            buffer << "<|assistant|>\n";
+            buffer += "<|start_header_id|>assistant<|end_header_id|>\n\n";
+    } else if (type == TEMPLATE_DEEP_SEEK3) {
+        unsigned int i = 0;
+        if (nItems > 0 && items[0].role == "system") {
+            buffer += items[0].message;
+            i++;
+        }
+        for (; i < nItems; i++) {
+            if (items[i].role == "user") {
+                buffer += "<|User|>" + items[i].message;
+            } else if (items[i].role == "assistant") {
+                buffer += "<|Assistant|>" + items[i].message;
+            }
+        }
+        if (appendGenerationPrompt) {
+            buffer += "<|Assistant|><think>\n";
+            publicPromptSize = 8;
+        }
     }
-    return buffer.str();
+
+    const char *content = buffer.c_str();
+    size_t length = buffer.size();
+    const char *publicPrompt = publicPromptSize > 0
+        ? &content[length - publicPromptSize]
+        : nullptr;
+    return {content, length, publicPrompt};
 }
 
 EosDetector::EosDetector(size_t nTokens, const int *tokens, const char** pieces, int paddingLeft, int paddingRight) {
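
`ChatTemplate::generate` now returns a `GeneratedChat` (content, length, publicPrompt) instead of a `std::string`. For the new `deepSeek3` template the generation prompt ends with `<|Assistant|><think>\n`, and `publicPromptSize = 8` marks the trailing `<think>\n` as the public part that the CLI and API callers above print or stream back to the user. A rough Python mock-up of the rendering (illustrative only; the authoritative logic is the C++ above):

```python
# Rough mock-up of the deepSeek3 template rendering from ChatTemplate::generate.
# The helper and the dict-based message format are illustrative, not the real API.
def render_deepseek3(items, append_generation_prompt=True):
    buffer = ''
    i = 0
    if items and items[0]['role'] == 'system':
        buffer += items[0]['message']   # system message is emitted verbatim
        i += 1
    for item in items[i:]:
        if item['role'] == 'user':
            buffer += '<|User|>' + item['message']
        elif item['role'] == 'assistant':
            buffer += '<|Assistant|>' + item['message']
    public_prompt = None
    if append_generation_prompt:
        buffer += '<|Assistant|><think>\n'
        public_prompt = buffer[-8:]     # the trailing "<think>\n" (8 bytes)
    return buffer, public_prompt

content, public = render_deepseek3([
    {'role': 'system', 'message': 'You are a helpful assistant.'},
    {'role': 'user', 'message': 'Hello!'},
])
print(repr(content))  # 'You are a helpful assistant.<|User|>Hello!<|Assistant|><think>\n'
print(repr(public))   # '<think>\n'
```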

0 commit comments
