Skip to content

Commit 31ff8f4

Browse files
authored
feat: vulkan. (#176)
1 parent ec2cb7f commit 31ff8f4

34 files changed

+2877
-267
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ run*.sh
1515
server
1616
/dllama
1717
/dllama-*
18-
*.exe
18+
*.exe
19+
*.spv

Makefile

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
CXX = g++
2-
CXXFLAGS = -std=c++11 -Werror -Wformat -Werror=format-security
2+
CXXFLAGS = -std=c++11 -Werror -Wformat -Werror=format-security
33

44
ifndef TERMUX_VERSION
55
CXXFLAGS += -march=native -mtune=native
@@ -15,11 +15,25 @@ ifdef WVLA
1515
CXXFLAGS += -Wvla-extension
1616
endif
1717

18+
ifdef DLLAMA_VULKAN
19+
CGLSLC = glslc
20+
21+
ifeq ($(OS),Windows_NT)
22+
LIBS += -L$(VK_SDK_PATH)\lib -lvulkan-1
23+
CXXFLAGS += -DDLLAMA_VULKAN -I$(VK_SDK_PATH)\include
24+
else
25+
LIBS += -lvulkan
26+
CXXFLAGS += -DDLLAMA_VULKAN
27+
endif
28+
29+
DEPS += nn-vulkan.o
30+
endif
31+
1832
ifeq ($(OS),Windows_NT)
19-
LIBS = -lws2_32
33+
LIBS += -lws2_32
2034
DELETE_CMD = del /f
2135
else
22-
LIBS = -lpthread
36+
LIBS += -lpthread
2337
DELETE_CMD = rm -fv
2438
endif
2539

@@ -47,6 +61,19 @@ nn-cpu-test: src/nn/nn-cpu-test.cpp nn-quants.o nn-core.o nn-executor.o llamafil
4761
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
4862
nn-cpu-ops-test: src/nn/nn-cpu-ops-test.cpp nn-quants.o nn-core.o nn-executor.o llamafile-sgemm.o nn-cpu.o
4963
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
64+
nn-vulkan.o: src/nn/nn-vulkan.cpp
65+
$(CXX) $(CXXFLAGS) -c $^ -o $@
66+
67+
ifdef DLLAMA_VULKAN
68+
VULKAN_SHADER_SRCS := $(wildcard src/nn/vulkan/*.comp)
69+
VULKAN_SHADER_BINS := $(VULKAN_SHADER_SRCS:.comp=.spv)
70+
DEPS += $(VULKAN_SHADER_BINS)
71+
72+
%.spv: %.comp
73+
$(CGLSLC) -c $< -o $@
74+
nn-vulkan-test: src/nn/nn-vulkan-test.cpp nn-quants.o nn-core.o nn-executor.o nn-vulkan.o ${DEPS}
75+
$(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
76+
endif
5077

5178
# llm
5279
tokenizer.o: src/tokenizer.cpp
@@ -57,7 +84,7 @@ app.o: src/app.cpp
5784
$(CXX) $(CXXFLAGS) -c $^ -o $@
5885
tokenizer-test: src/tokenizer-test.cpp nn-quants.o nn-core.o llamafile-sgemm.o nn-cpu-ops.o tokenizer.o
5986
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
60-
dllama: src/dllama.cpp nn-quants.o nn-core.o nn-executor.o nn-network.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o tokenizer.o llm.o app.o
61-
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
62-
dllama-api: src/dllama-api.cpp nn-quants.o nn-core.o nn-executor.o nn-network.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o tokenizer.o llm.o app.o
63-
$(CXX) $(CXXFLAGS) $^ -o $@ $(LIBS)
87+
dllama: src/dllama.cpp nn-quants.o nn-core.o nn-executor.o nn-network.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o tokenizer.o llm.o app.o ${DEPS}
88+
$(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
89+
dllama-api: src/dllama-api.cpp nn-quants.o nn-core.o nn-executor.o nn-network.o llamafile-sgemm.o nn-cpu-ops.o nn-cpu.o tokenizer.o llm.o app.o ${DEPS}
90+
$(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)

src/app.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
#include <cassert>
33
#include <cstring>
44
#include <stdexcept>
5+
#if defined(DLLAMA_VULKAN)
6+
#include "nn/nn-vulkan.hpp"
7+
#endif
58

69
static NnFloatType parseFloatType(char *val) {
710
if (std::strcmp(val, "f32") == 0) return F_32;
@@ -38,6 +41,7 @@ AppCliArgs AppCliArgs::parse(int argc, char* *argv, bool requireMode) {
3841
args.seed = (unsigned long long)time(nullptr);
3942
args.chatTemplateType = TEMPLATE_UNKNOWN;
4043
args.maxSeqLen = 0;
44+
args.gpuIndex = -1;
4145
int i = 1;
4246
if (requireMode && argc > 1) {
4347
args.mode = argv[1];
@@ -102,6 +106,8 @@ AppCliArgs AppCliArgs::parse(int argc, char* *argv, bool requireMode) {
102106
args.chatTemplateType = parseChatTemplateType(value);
103107
} else if (std::strcmp(name, "--max-seq-len") == 0) {
104108
args.maxSeqLen = (unsigned int)atoi(value);
109+
} else if (std::strcmp(name, "--gpu-index") == 0) {
110+
args.gpuIndex = atoi(value);
105111
} else {
106112
throw std::runtime_error("Unknown option: " + std::string(name));
107113
}
@@ -119,6 +125,17 @@ AppCliArgs::~AppCliArgs() {
119125
delete[] workerPorts;
120126
}
121127

128+
static NnDevice *createDevice(AppCliArgs *args, NnNetConfig *netConfig, NnNodeConfig *nodeConfig, NnNetExecution *netExecution) {
129+
if (args->gpuIndex >= 0) {
130+
#if defined(DLLAMA_VULKAN)
131+
return new NnVulkanDevice(args->gpuIndex, netConfig, nodeConfig, netExecution);
132+
#else
133+
throw std::runtime_error("This build does not support GPU");
134+
#endif
135+
}
136+
return new NnCpuDevice(netConfig, nodeConfig, netExecution);
137+
}
138+
122139
RootLlmInference::RootLlmInference(LlmNet *net, NnDevice *device, NnNetExecution *execution, NnExecutor *executor, NnNetwork *network) {
123140
this->header = net->header;
124141
this->tokenPipe = (float *)execution->pipes[net->tokenPipeIndex];
@@ -152,7 +169,6 @@ void RootLlmInference::setToken(NnUint batchIndex, NnUint token) {
152169
void RootLlmInference::forward() {
153170
if (network != nullptr)
154171
network->writeAll(&controlPacket, sizeof(LlmControlPacket));
155-
device->syncPointers();
156172
executor->forward();
157173
}
158174

@@ -226,13 +242,13 @@ void runInferenceApp(AppCliArgs *args, void (*handler)(AppInferenceContext *cont
226242
configWriter.writeToWorkers(&net.netConfig, net.nodeConfigs);
227243
}
228244

229-
NnCpuDevice cpu(&net.netConfig, rootNodeConfig, &execution);
230-
NnExecutor executor(&net.netConfig, rootNodeConfig, &cpu, &execution, synchronizer.get(), args->benchmark);
245+
std::unique_ptr<NnDevice> device(createDevice(args, &net.netConfig, rootNodeConfig, &execution));
246+
NnExecutor executor(&net.netConfig, rootNodeConfig, device.get(), &execution, synchronizer.get(), args->benchmark);
231247

232248
NnRootWeightLoader weightLoader(&executor, network, nNodes);
233249
loadLlmNetWeight(args->modelPath, &net, &weightLoader);
234250

235-
RootLlmInference inference(&net, &cpu, &execution, &executor, network);
251+
RootLlmInference inference(&net, device.get(), &execution, &executor, network);
236252

237253
if (network != nullptr) {
238254
network->resetStats();
@@ -268,9 +284,10 @@ void runWorkerApp(AppCliArgs *args) {
268284

269285
NnNetExecution execution(args->nThreads, &netConfig);
270286

287+
std::unique_ptr<NnDevice> device(createDevice(args, &netConfig, &nodeConfig, &execution));
288+
271289
NnNetworkNodeSynchronizer synchronizer(network, &execution, &netConfig, &nodeConfig);
272-
NnCpuDevice cpu(&netConfig, &nodeConfig, &execution);
273-
NnExecutor executor(&netConfig, &nodeConfig, &cpu, &execution, &synchronizer, false);
290+
NnExecutor executor(&netConfig, &nodeConfig, device.get(), &execution, &synchronizer, false);
274291

275292
NnWorkerWeightReader weightReader(&executor, network);
276293
weightReader.read();

src/app.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class AppCliArgs {
2929
unsigned long long seed;
3030
ChatTemplateType chatTemplateType;
3131
NnUint maxSeqLen;
32+
int gpuIndex;
3233

3334
// worker
3435
NnUint port;

src/dllama.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static void inference(AppInferenceContext *context) {
5454
if (context->network != nullptr)
5555
context->network->getStats(&sentBytes, &recvBytes);
5656

57-
NnUint evalTime = context->executor->getTotalTime(STEP_EXECUTE_OP) + context->executor->getTotalTime(STEP_SYNC_POINTERS);
57+
NnUint evalTime = context->executor->getTotalTime(STEP_EXECUTE_OP);
5858
NnUint syncTime = context->executor->getTotalTime(STEP_SYNC_NODES);
5959
printf("🔷️ Eval%5u ms Sync%5u ms | Sent%6zu kB Recv%6zu kB | (%d tokens)\n",
6060
evalTime / 1000,
@@ -83,7 +83,7 @@ static void inference(AppInferenceContext *context) {
8383
if (context->network != nullptr)
8484
context->network->getStats(&sentBytes, &recvBytes);
8585

86-
NnUint predTime = context->executor->getTotalTime(STEP_EXECUTE_OP) + context->executor->getTotalTime(STEP_SYNC_POINTERS);
86+
NnUint predTime = context->executor->getTotalTime(STEP_EXECUTE_OP);
8787
NnUint syncTime = context->executor->getTotalTime(STEP_SYNC_NODES);
8888
printf("🔶 Pred%5u ms Sync%5u ms | Sent%6zu kB Recv%6zu kB | %s\n",
8989
predTime / 1000,

0 commit comments

Comments (0)