Skip to content

Commit 9314286

Browse files
committed
a rough implementation of GGML_NUMA_MIRROR
Add `-DGGML_NUMA_MIRROR=ON` when configuring, e.g. ``` cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DGGML_NUMA_MIRROR=ON ``` How it works: 1. Use mmap to allocate model memory on NUMA node 0 at a specific virtual address (e.g. 0x200000000000ULL) 2. Mmap another memory region on NUMA node 1 and copy the model to that memory location (e.g. 0x400000000000ULL) 3. Change `tensor->data` to `tensor->__data[N_NUMA_NODES]` 4. Add two helper functions: 1) `tensor_data` to retrieve data for the current NUMA node 2) `tensor_set_data` to store a data pointer inside the tensor, and modify hundreds of lines of code to use these two helpers 5. When storing a pointer into the tensor, check whether it falls within the virtual address ranges reserved in steps 1 and 2, and if so store it in `__data[0]` or `__data[1]` accordingly. For example, if the address to store is `0x200000114514`, we know it lies between `0x200000000000` and `0x400000000000`, so we store `0x200000114514` in `__data[0]` and `0x200000114514 + 0x200000000000` in `__data[1]` 6. Add a thread-local variable that stores the NUMA node id of the current thread 7. Use `tensor->__data[node_id]` when retrieving data This patch also persists the model in hugepages memory, so the process of loading the model from disk to memory can be skipped on subsequent runs.
1 parent 2f5f09b commit 9314286

File tree

5 files changed

+217
-3
lines changed

5 files changed

+217
-3
lines changed

ggml/CMakeLists.txt

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,14 +320,39 @@ set(variable_set_statements
320320
set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
321321

322322
if (GGML_NUMA_MIRROR)
323+
find_library(NUMA_LIBRARY NAMES numa)
324+
if (!NUMA_LIBRARY)
325+
message(FATAL_ERROR "libnuma is not found")
326+
endif()
327+
message(STATUS "libnuma: ${NUMA_LIBRARY}")
328+
329+
if (NOT DEFINED GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET)
330+
set(GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET "0x200000000000ULL")
331+
endif()
332+
if (NOT DEFINED GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT)
333+
set(GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT "0x200000000000ULL")
334+
endif()
335+
if (NOT DEFINED GGML_MMAP_HUGEPAGESZ)
336+
set(GGML_MMAP_HUGEPAGESZ "1073741824ULL")
337+
endif()
338+
323339
message(STATUS
324340
"-----------------\n"
325-
"Enabling GGML_NUMA_MIRROR"
341+
"Enabling GGML_NUMA_MIRROR\n"
342+
"Hugepages must be reserved properly,\n"
343+
"and your program should have write access to /dev/hugepages\n"
344+
"GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET = ${GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET}\n"
345+
"GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT = ${GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT}\n"
346+
"GGML_MMAP_HUGEPAGESZ = ${GGML_MMAP_HUGEPAGESZ}")
326347
message(STATUS
327348
"-----------------")
328349

329350
foreach(lib "ggml" "ggml-base")
330351
target_compile_definitions(${lib} PUBLIC GGML_NUMA_MIRROR)
352+
target_compile_definitions(${lib} PUBLIC GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET=${GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET})
353+
target_compile_definitions(${lib} PUBLIC GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT=${GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT})
354+
target_compile_definitions(${lib} PUBLIC GGML_MMAP_HUGEPAGESZ=${GGML_MMAP_HUGEPAGESZ})
355+
target_link_libraries(${lib} PUBLIC ${NUMA_LIBRARY})
331356
endforeach()
332357
endif()
333358

ggml/include/ggml.h

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,9 @@
310310
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
311311
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
312312

313+
#define GGML_LIKELY (x) __builtin_expect(!!(x), 1)
314+
#define GGML_UNLIKELY(x) __builtin_expect(!!(x), 0)
315+
313316
#ifdef __cplusplus
314317
extern "C" {
315318
#endif
@@ -625,6 +628,8 @@ extern "C" {
625628
static inline void * tensor_data(const struct ggml_tensor * tensor) {
626629
#ifdef GGML_NUMA_MIRROR
627630
int n = ggml_current_numa_node;
631+
if (n == -1)
632+
n = 0;
628633
return tensor->__data[n];
629634
#else
630635
return tensor->data;
@@ -633,8 +638,24 @@ extern "C" {
633638

634639
static inline void tensor_set_data(struct ggml_tensor * tensor, void * data) {
635640
#ifdef GGML_NUMA_MIRROR
641+
if ((uint64_t)data >= \
642+
GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \
643+
GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT && \
644+
(uint64_t)data < GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \
645+
2 * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT) {
646+
data = (void*) ((uint64_t)data - GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT);
647+
}
636648
tensor->__data[0] = data;
637-
tensor->__data[1] = data;
649+
if ((uint64_t)data >= \
650+
GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET && \
651+
(uint64_t)data < \
652+
GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \
653+
GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT) {
654+
tensor->__data[1] = (void*) ((uint64_t)data + \
655+
GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT);
656+
} else {
657+
tensor->__data[1] = data;
658+
}
638659
#else
639660
tensor->data = data;
640661
#endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
#include "ggml-threading.h"
1212
#include "ggml.h"
1313

14+
#ifdef GGML_NUMA_MIRROR
15+
#include <numa.h>
16+
#include <numaif.h>
17+
#endif
18+
1419
#if defined(_MSC_VER) || defined(__MINGW32__)
1520
#include <malloc.h> // using malloc.h with MSC/MINGW
1621
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -15180,6 +15185,11 @@ struct ggml_cplan ggml_graph_plan(
1518015185
return cplan;
1518115186
}
1518215187

15188+
#ifdef GGML_NUMA_MIRROR
15189+
static bool g_cpuset_isset = false;
15190+
static cpu_set_t g_cpuset;
15191+
#endif
15192+
1518315193
static thread_ret_t ggml_graph_compute_thread(void * data) {
1518415194
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
1518515195
struct ggml_threadpool * tp = state->threadpool;
@@ -15197,6 +15207,51 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1519715207
/*.threadpool=*/ tp,
1519815208
};
1519915209

15210+
#ifdef GGML_NUMA_MIRROR
15211+
if (GGML_UNLIKELY(ggml_current_numa_node == -1)) {
15212+
int thread_id = state->ith;
15213+
int total_threads = tp->n_threads_max;
15214+
15215+
ggml_current_numa_node = !!!(thread_id < (total_threads / 2));
15216+
15217+
struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
15218+
numa_bitmask_setbit(mask, ggml_current_numa_node);
15219+
numa_bind(mask);
15220+
15221+
bool cpumask[GGML_MAX_N_THREADS];
15222+
memset(cpumask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
15223+
for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
15224+
if (CPU_ISSET(i, &g_cpuset)) {
15225+
cpumask[i] = true;
15226+
}
15227+
}
15228+
15229+
int cpuid = -1;
15230+
bool local_mask[GGML_MAX_N_THREADS];
15231+
int iter = 0;
15232+
for (int j = 0; j < thread_id; ++j) {
15233+
ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
15234+
}
15235+
memset(local_mask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
15236+
ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
15237+
for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
15238+
if (local_mask[i]) {
15239+
cpuid = i;
15240+
break;
15241+
}
15242+
}
15243+
15244+
if (cpuid != -1) {
15245+
cpu_set_t cpuset;
15246+
CPU_ZERO(&cpuset);
15247+
CPU_SET(cpuid, &cpuset);
15248+
sched_setaffinity(gettid(), sizeof(cpuset), &cpuset);
15249+
}
15250+
15251+
GGML_LOG_INFO("thread_id = %02d, node = %d, cpuid = %02d\n", thread_id, ggml_current_numa_node, cpuid);
15252+
}
15253+
#endif // GGML_NUMA_MIRROR
15254+
1520015255
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
1520115256
struct ggml_tensor * node = cgraph->nodes[node_n];
1520215257

@@ -15464,6 +15519,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1546415519
threadpool->ec = GGML_STATUS_SUCCESS;
1546515520
}
1546615521

15522+
#ifdef GGML_NUMA_MIRROR
15523+
if (!g_cpuset_isset) {
15524+
CPU_ZERO(&g_cpuset);
15525+
sched_getaffinity(getpid(), sizeof(g_cpuset), &g_cpuset);
15526+
g_cpuset_isset = true;
15527+
}
15528+
#endif
15529+
1546715530
#ifdef GGML_USE_OPENMP
1546815531
if (n_threads > 1) {
1546915532
#pragma omp parallel num_threads(n_threads)

ggml/src/ggml.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
#endif
6262

6363
#ifdef GGML_NUMA_MIRROR
64-
__thread int ggml_current_numa_node = 0;
64+
__thread int ggml_current_numa_node = -1;
6565
#endif
6666

6767
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)

src/llama-mmap.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
#include <cerrno>
1111
#include <algorithm>
1212

13+
#ifdef GGML_NUMA_MIRROR
14+
#include <numa.h>
15+
#include <numaif.h>
16+
#endif
17+
1318
#ifdef __has_include
1419
#if __has_include(<unistd.h>)
1520
#include <unistd.h>
@@ -269,13 +274,24 @@ void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
269274

270275
// llama_mmap
271276

277+
#ifdef GGML_NUMA_MIRROR
278+
static uintptr_t base_address_offset = 0;
279+
static int file_name_offset = 0;
280+
#endif
281+
272282
struct llama_mmap::impl {
273283
#ifdef _POSIX_MAPPED_FILES
274284
std::vector<std::pair<size_t, size_t>> mapped_fragments;
275285

276286
impl(struct llama_file * file, size_t prefetch, bool numa) {
287+
#ifdef GGML_NUMA_MIRROR
288+
GGML_UNUSED(prefetch);
289+
GGML_UNUSED(numa);
290+
#endif
291+
277292
size = file->size();
278293
int fd = file->file_id();
294+
#ifndef GGML_NUMA_MIRROR
279295
int flags = MAP_SHARED;
280296
if (numa) { prefetch = 0; }
281297
#ifdef __linux__
@@ -285,6 +301,92 @@ struct llama_mmap::impl {
285301
}
286302
if (prefetch) { flags |= MAP_POPULATE; }
287303
#endif
304+
#endif // ifndef GGML_NUMA_MIRROR
305+
306+
#ifdef GGML_NUMA_MIRROR
307+
int oldpolicy;
308+
struct bitmask* oldmask = numa_allocate_nodemask();
309+
if (get_mempolicy(&oldpolicy, oldmask->maskp,
310+
oldmask->size + 1, 0, 0) < 0) {
311+
LLAMA_LOG_WARN("get_mempolicy failed, errno=%d %s\n", errno, strerror(errno));
312+
oldpolicy = MPOL_DEFAULT;
313+
}
314+
315+
size_t total_size = file->size();
316+
char path[128];
317+
bool is_new_mem[] = { false, false };
318+
int i;
319+
for (int node = 0; node < 2; ++node) {
320+
numa_set_preferred(node);
321+
LLAMA_LOG_INFO("numa_set_preferred(%d)\n", node);
322+
323+
for (i = 0; i * GGML_MMAP_HUGEPAGESZ < total_size; ++i) {
324+
sprintf(path, "/dev/hugepages/llama-node%d-%d", node, file_name_offset + i);
325+
if (!is_new_mem[node]) {
326+
is_new_mem[node] = access(path, F_OK) != 0;
327+
}
328+
int hugefd = open(path, O_CREAT | O_RDWR, 0600);
329+
if (hugefd < 0) {
330+
LLAMA_LOG_WARN("failed to open hugepage fd %s: %d %s\n",
331+
path, errno, strerror(errno));
332+
throw std::runtime_error(format("failed to open hugepage fd: %s", strerror(errno)));
333+
}
334+
uintptr_t address = GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET \
335+
+ node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT + \
336+
base_address_offset + i * GGML_MMAP_HUGEPAGESZ;
337+
void* mm = mmap((void*)address, GGML_MMAP_HUGEPAGESZ, PROT_READ | PROT_WRITE,
338+
MAP_SHARED | MAP_HUGETLB | MAP_POPULATE,
339+
hugefd, 0);
340+
close(hugefd);
341+
LLAMA_LOG_INFO("mmap(%s) desire=%p size=%llu result=%p is_new_mem[%d]=%s\n",
342+
path, (void*)address, GGML_MMAP_HUGEPAGESZ, mm, node, is_new_mem[node] ? "yes" : "no");
343+
if (((uintptr_t)mm) != address) {
344+
LLAMA_LOG_WARN("unable to mmap memory: %d %s\n", errno, strerror(errno));
345+
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
346+
}
347+
if (is_new_mem[node]) {
348+
memset(mm, 0, GGML_MMAP_HUGEPAGESZ);
349+
}
350+
}
351+
if (node == 0) {
352+
addr = (void*)(GGML_MMAP_VIRTUAL_MEMORY_BASE_OFFSET + \
353+
node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT + \
354+
base_address_offset);
355+
}
356+
}
357+
base_address_offset += i * GGML_MMAP_HUGEPAGESZ;
358+
file_name_offset += i;
359+
if (is_new_mem[0]) {
360+
LLAMA_LOG_INFO("begin to copy from disk to mem ...\n");
361+
size_t n = 0;
362+
while (n < total_size) {
363+
int nn = read(fd, (void*)((uintptr_t)addr + n), 1024 * 1024);
364+
if (nn < 0) {
365+
LLAMA_LOG_WARN("unable to read from file: %d %s\n", errno, strerror(errno));
366+
throw std::runtime_error(format("read failed: %s", strerror(errno)));
367+
}
368+
n += nn;
369+
}
370+
}
371+
for (int node = 1; node < 2; ++node) {
372+
if (is_new_mem[node]) {
373+
LLAMA_LOG_INFO("begin to copy from numa0 to numa%d ...\n", node);
374+
memcpy((void*)((uintptr_t)addr + \
375+
node * GGML_MMAP_VIRTUAL_MEMORY_NUMA_INCREMENT), \
376+
addr, total_size);
377+
}
378+
}
379+
380+
if (oldpolicy == MPOL_DEFAULT) {
381+
numa_set_localalloc();
382+
} else {
383+
set_mempolicy(oldpolicy, oldmask->maskp,
384+
oldmask->size + 1);
385+
}
386+
numa_free_cpumask(oldmask);
387+
#endif // GGML_NUMA_MIRROR
388+
389+
#ifndef GGML_NUMA_MIRROR
288390
addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0);
289391
if (addr == MAP_FAILED) {
290392
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
@@ -302,6 +404,7 @@ struct llama_mmap::impl {
302404
strerror(errno));
303405
}
304406
}
407+
#endif // ifndef GGML_NUMA_MIRROR
305408

306409
mapped_fragments.emplace_back(0, file->size());
307410
}
@@ -355,11 +458,13 @@ struct llama_mmap::impl {
355458
}
356459

357460
~impl() {
461+
#ifndef GGML_NUMA_MIRROR
358462
for (const auto & frag : mapped_fragments) {
359463
if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
360464
LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
361465
}
362466
}
467+
#endif
363468
}
364469
#elif defined(_WIN32)
365470
impl(struct llama_file * file, size_t prefetch, bool numa) {

0 commit comments

Comments
 (0)