Skip to content

Commit b5283f7

Browse files
committed
fix: position calculation
1 parent 121bc8c commit b5283f7

File tree

2 files changed

+7
-6
lines changed

2 files changed

+7
-6
lines changed

src/dllama-api.cpp

+2-2
Original file line number | Diff line number | Diff line change
@@ -386,8 +386,8 @@ class ApiServer {
386386

387387
inference->setBatchSize(batchSize);
388388
inference->setPosition(pos);
389-
for (NnSize i = 0; i < batchSize; i++)
390-
inference->setToken(i, promptTokens[i]);
389+
for (NnSize j = 0; j < batchSize; j++)
390+
inference->setToken(j, promptTokens[i + j]);
391391

392392
inference->forward();
393393

src/dllama.cpp

+5-4
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,7 @@ static void chat(AppInferenceContext *context) {
152152
context->tokenizer->encode((char*)inputPrompt.c_str(), inputTokens, &nInputTokens, addBos, true);
153153

154154
NnSize userPromptEndPos = (NnSize)std::min<unsigned int>(seqLen, pos + nInputTokens - 1);
155-
for (;;) {
155+
for (NnSize i = 0; ;) {
156156
int remainingTokens = userPromptEndPos - pos;
157157
if (remainingTokens <= 0)
158158
break;
@@ -162,13 +162,14 @@ static void chat(AppInferenceContext *context) {
162162

163163
context->inference->setBatchSize(batchSize);
164164
context->inference->setPosition(pos);
165-
for (NnSize i = 0; i < batchSize; i++)
166-
context->inference->setToken(i, inputTokens[i]);
165+
for (NnSize j = 0; j < batchSize; j++)
166+
context->inference->setToken(j, inputTokens[i + j]);
167167

168168
context->inference->forward();
169169

170+
i += batchSize;
170171
pos += batchSize;
171-
token = inputTokens[pos + 1];
172+
token = inputTokens[i + 1];
172173
}
173174

174175
context->inference->setBatchSize(1);

0 commit comments

Comments (0)