From 1289c18faf9a68e988f90b868956df48ec981255 Mon Sep 17 00:00:00 2001 From: Officeyutong Date: Sat, 8 Jun 2024 16:04:00 +0800 Subject: [PATCH] add: ubpf jit (#7) --- .../workflows/run-execution-context-tests.yml | 4 +- .github/workflows/run-vm-tests.yml | 37 + CMakeLists.txt | 16 +- execution-test/test_with_ebpf.cpp | 20 +- include/libebpf.h | 20 + include/libebpf_insn.h | 17 + include/libebpf_vm.h | 13 +- src/jit/x86_64/helper_adaptor.c | 11 + src/jit/x86_64/ubpf_jit_x86_64.c | 872 ++++++++++++++++++ src/jit/x86_64/ubpf_jit_x86_64.h | 369 ++++++++ src/libebpf.c | 110 ++- src/libebpf_internal.h | 36 +- src/libebpf_vm.c | 2 +- vm-test/test-cases/err-stack-oob.data | 2 +- vm-test/test_framework/test_jit.py | 111 +++ vm-test/test_runner/test_runner.c | 38 +- 16 files changed, 1638 insertions(+), 40 deletions(-) create mode 100644 .github/workflows/run-vm-tests.yml create mode 100644 src/jit/x86_64/helper_adaptor.c create mode 100644 src/jit/x86_64/ubpf_jit_x86_64.c create mode 100644 src/jit/x86_64/ubpf_jit_x86_64.h create mode 100644 vm-test/test_framework/test_jit.py diff --git a/.github/workflows/run-execution-context-tests.yml b/.github/workflows/run-execution-context-tests.yml index 2126d3f..85e89ce 100644 --- a/.github/workflows/run-execution-context-tests.yml +++ b/.github/workflows/run-execution-context-tests.yml @@ -5,11 +5,13 @@ on: branches: "master" pull_request: branches: "master" + workflow_dispatch: + concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: - build: + test: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/run-vm-tests.yml b/.github/workflows/run-vm-tests.yml new file mode 100644 index 0000000..d7c6721 --- /dev/null +++ b/.github/workflows/run-vm-tests.yml @@ -0,0 +1,37 @@ +name: Run ubpf vm tests + +on: + push: + branches: "master" + pull_request: + branches: "master" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true +jobs: + test: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v2 + with: + submodules: 'recursive' + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install cmake make clang-15 ninja-build + - name: Build targets + run: | + CC=clang-15 CXX=clang++-15 cmake -S. 
-Bbuild -DCMAKE_BUILD_TYPE:STRING=Release -G Ninja + CC=clang-15 CXX=clang++-15 cmake --build ./build --config Release --target libebpf_test_runner + - uses: actions/setup-python@v5 + with: + python-version: '3.8' + - run: python --version + - run: | + cd vm-test + python -m venv env + source ./env/bin/activate + pip install -r requirements.txt + pytest diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d2e9f7..4a2e218 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,7 @@ project( VERSION 0.1.0 LANGUAGES C CXX ) - -add_library( - libebpf_objects - OBJECT +set(LIBEBPF_SOURCE_LIST src/libebpf.c src/libebpf_vm.c src/libebpf_vm_verify.c @@ -22,12 +19,21 @@ add_library( src/libebpf_execution_helpers.c src/libebpf_ffi_functions.c src/utils/hashmap.c + src/jit/x86_64/ubpf_jit_x86_64.c + src/jit/x86_64/helper_adaptor.c + ) + + +add_library( + libebpf_objects + OBJECT + ${LIBEBPF_SOURCE_LIST} ) target_compile_definitions(libebpf_objects PRIVATE -D_GNU_SOURCE) target_include_directories( libebpf_objects INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src ) set_target_properties(libebpf_objects PROPERTIES C_STANDARD 11 C_EXTENSIONS TRUE) diff --git a/execution-test/test_with_ebpf.cpp b/execution-test/test_with_ebpf.cpp index 2f34428..862767c 100644 --- a/execution-test/test_with_ebpf.cpp +++ b/execution-test/test_with_ebpf.cpp @@ -1,3 +1,4 @@ +#include "catch2/catch_message.hpp" #include "catch2/internal/catch_stdstreams.hpp" #include "libebpf.h" #include "libebpf_insn.h" @@ -10,8 +11,7 @@ #include TEST_CASE("Test map operations with ebpf programs") { - std::unique_ptr ctx(ebpf_state__create(), - ebpf_state__destroy); + std::unique_ptr ctx(ebpf_state__create(), ebpf_state__destroy); REQUIRE(ctx != nullptr); std::unique_ptr vm(ebpf_vm_create(), ebpf_vm_destroy); REQUIRE(vm != nullptr); @@ -75,3 +75,19 @@ TEST_CASE("Test map operations with ebpf programs") { REQUIRE(ebpf_state__map_elem_lookup(ctx.get(), hash_map_id, &key, &value) == 0); REQUIRE(value == 233 + 456); } + +TEST_CASE("Test execution with JIT") { + std::unique_ptr ctx(ebpf_state__create(), ebpf_state__destroy); + REQUIRE(ctx != nullptr); + std::unique_ptr vm(ebpf_vm_create(), ebpf_vm_destroy); + struct libebpf_insn insns[] = { // r1 += r2 + BPF_RAW_INSN(BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_ADD, 1, 2, 0, 0), + // r0 = r1 + BPF_RAW_INSN(BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_MOV_MOVSX, 0, 1, 0, 0) + }; + REQUIRE(ebpf_vm_load_instructions(vm.get(), insns, std::size(insns)) == 0); + auto func = ebpf_vm_compile(vm.get()); + INFO(ebpf_error_string()); + REQUIRE(func); + REQUIRE(func((void*)100, (size_t)5000) == 5000 + 100); +} diff --git a/include/libebpf.h b/include/libebpf.h index 153edff..6fd041f 100644 --- a/include/libebpf.h +++ b/include/libebpf.h @@ -10,6 +10,18 @@ extern "C" { #endif +/** + * @brief Function prototype for allocating a piece of executable memory and copy the given buffer onto it + * + */ +typedef void *(*ebpf_allocate_execuable_memory_and_copy)(void *buffer, size_t bufsize); + +/** + * @brief Function prototype for releasing a piece of executable memory + * + */ +typedef int (*ebpf_release_executable_memory)(void *mem, size_t len); + /** * @brief Function prototype for the global custom memory allocator * @@ -38,6 +50,14 @@ typedef void *(*ebpf_realloc)(void *, size_t); */ void ebpf_set_global_memory_allocator(ebpf_malloc malloc, ebpf_free free, 
ebpf_realloc realloc); +/** + * @brief Set functions which will be used for JIT + * + * @param allocate Function to allocate a piece of executable memory and copy buffer to it + * @param release Function to release a piece of executable memory + */ +void ebpf_set_executable_memory_allocator(ebpf_allocate_execuable_memory_and_copy *allocate, ebpf_release_executable_memory *release); + /** * @brief Get the global error string * diff --git a/include/libebpf_insn.h b/include/libebpf_insn.h index 268c4ab..a1beccd 100644 --- a/include/libebpf_insn.h +++ b/include/libebpf_insn.h @@ -115,4 +115,21 @@ struct libebpf_insn { #define BPF_RAW_INSN_IMM64(SRC, DST, IMM1, IMM2) BPF_RAW_INSN(0x18, DST, SRC, 0, IMM1), BPF_RAW_INSN(0, 0, 0, 0, IMM2) +enum bpf_register { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + _BPF_REG_MAX, +}; + + + #endif diff --git a/include/libebpf_vm.h b/include/libebpf_vm.h index e913c60..364de81 100644 --- a/include/libebpf_vm.h +++ b/include/libebpf_vm.h @@ -13,6 +13,7 @@ extern "C" { #define EBPF_STACK_SIZE ((size_t)512) #define MAX_LOCAL_FUNCTION_LEVEL 20 +#define LIBEBPF_MAX_INSTRUCTION_COUNT 65536 /** @@ -27,7 +28,7 @@ typedef struct ebpf_vm ebpf_vm_t; * @brief Function prototype for a jitted ebpf program * */ -typedef int (*ebpf_jit_fn)(void *mem, size_t mem_len, uint64_t *return_value); +typedef uint64_t (*ebpf_jit_fn)(void *mem, size_t mem_len); /** * @brief Function prototype for external helper @@ -143,6 +144,16 @@ int ebpf_vm_run(ebpf_vm_t *vm, void *mem, size_t mem_len, uint64_t *return_value */ ebpf_jit_fn ebpf_vm_compile(ebpf_vm_t *vm); +/** + * @brief Translate the loaded eBPF byte code to native code + * + * @param vm The VM instance + * @param buffer buffer to output. ebpf_translate will allocate it. Use _libebpf_global_free to free it + * @param size size of the generated code, in bytes + * @return int 0 if succeeded + */ +int ebpf_translate(struct ebpf_vm *vm, uint8_t **buffer, size_t *size); + #ifdef __cplusplus } diff --git a/src/jit/x86_64/helper_adaptor.c b/src/jit/x86_64/helper_adaptor.c new file mode 100644 index 0000000..e87430b --- /dev/null +++ b/src/jit/x86_64/helper_adaptor.c @@ -0,0 +1,11 @@ +#include "libebpf_vm.h" +#include +#include +#include +uint64_t ebpf_ubpf_jit_dispatcher_adaptor(uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, unsigned int index, + ebpf_vm_t *vm) { + assert(index < MAX_EXTERNAL_HELPER); + struct ebpf_external_helper_definition *helper_def = &vm->helpers[index]; + assert(helper_def->fn); + return helper_def->fn(arg1, arg2, arg3, arg4, arg5); +} diff --git a/src/jit/x86_64/ubpf_jit_x86_64.c b/src/jit/x86_64/ubpf_jit_x86_64.c new file mode 100644 index 0000000..39f2c37 --- /dev/null +++ b/src/jit/x86_64/ubpf_jit_x86_64.c @@ -0,0 +1,872 @@ +// Copyright (c) 2015 Big Switch Networks, Inc +// SPDX-License-Identifier: Apache-2.0 + +/* + * Copyright 2015 Big Switch Networks, Inc + * Copyright 2017 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "libebpf_internal.h" +#include "libebpf_vm.h" +#include +#include +#include +#include +#include +#include "ubpf_jit_x86_64.h" +#include +#if !defined(_countof) +#define _countof(array) (sizeof(array) / sizeof(array[0])) +#endif + +static void muldivmod(struct jit_state *state, uint8_t opcode, int src, int dst, int32_t imm); + +#define REGISTER_MAP_SIZE 11 + +/* + * There are two common x86-64 calling conventions, as discussed at + * https://en.wikipedia.org/wiki/X86_calling_conventions#x86-64_calling_conventions + * + * Please Note: R12 is special and we are *not* using it. As a result, it is omitted + * from the list of non-volatile registers for both platforms (even though it is, in + * fact, non-volatile). + * + * BPF R0-R4 are "volatile" + * BPF R5-R10 are "non-volatile" + * In general, we attempt to map BPF volatile registers to x64 volatile and BPF non- + * volatile to x64 non-volatile. + */ +#define RCX_ALT R9 +static int platform_nonvolatile_registers[] = { RBP, RBX, R13, R14, R15 }; +static int platform_parameter_registers[] = { RDI, RSI, RDX, RCX, R8, R9 }; +static int register_map[REGISTER_MAP_SIZE] = { + RAX, RDI, RSI, RDX, R9, R8, RBX, R13, R14, R15, RBP, +}; + +/* Return the x86 register for the given eBPF register */ +static int map_register(int r) { + assert(r < _BPF_REG_MAX); + return register_map[r % _BPF_REG_MAX]; +} + +static inline void emit_local_call(struct jit_state *state, uint32_t target_pc) { + /* + * Pushing 4 * 8 = 32 bytes will maintain the invariant + * that the stack is 16-byte aligned. + */ + emit_push(state, map_register(BPF_REG_6)); + emit_push(state, map_register(BPF_REG_7)); + emit_push(state, map_register(BPF_REG_8)); + emit_push(state, map_register(BPF_REG_9)); + + emit1(state, 0xe8); // e8 is the opcode for a CALL + emit_jump_target_address(state, target_pc); + + emit_pop(state, map_register(BPF_REG_9)); + emit_pop(state, map_register(BPF_REG_8)); + emit_pop(state, map_register(BPF_REG_7)); + emit_pop(state, map_register(BPF_REG_6)); +} +extern uint64_t ebpf_ubpf_jit_dispatcher_adaptor(uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, unsigned int index, + ebpf_vm_t *vm); +static uint32_t emit_dispatched_external_helper_address(struct jit_state *state, struct ebpf_vm *vm) { + uint32_t external_helper_address_target = state->offset; + emit8(state, (uint64_t)ebpf_ubpf_jit_dispatcher_adaptor); + return external_helper_address_target; +} + +static uint32_t emit_retpoline(struct jit_state *state) { + /* + * Using retpolines to mitigate spectre/meltdown. 
Adapting the approach + * from + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/retpoline-branch-target-injection-mitigation.html + */ + + /* label0: */ + /* call label1 */ + uint32_t retpoline_target = state->offset; + emit1(state, 0xe8); + uint32_t label1_call_offset = state->offset; + emit4(state, 0x00); + + /* capture_ret_spec: */ + /* pause */ + uint32_t capture_ret_spec = state->offset; + emit1(state, 0xf3); + emit1(state, 0x90); + /* jmp capture_ret_spec */ + emit1(state, 0xe9); + emit_jump_target_offset(state, state->offset, capture_ret_spec); + emit4(state, 0x00); + + /* label1: */ + /* mov rax, (rsp) */ + uint32_t label1 = state->offset; + emit1(state, 0x48); + emit1(state, 0x89); + emit1(state, 0x04); + emit1(state, 0x24); + + /* ret */ + emit1(state, 0xc3); + + emit_jump_target_offset(state, label1_call_offset, label1); + + return retpoline_target; +} + +/* For testing, this changes the mapping between x86 and eBPF registers */ +void ubpf_set_register_offset(int x) { + int i; + if (x < REGISTER_MAP_SIZE) { + int tmp[REGISTER_MAP_SIZE]; + memcpy(tmp, register_map, sizeof(register_map)); + for (i = 0; i < REGISTER_MAP_SIZE; i++) { + register_map[i] = tmp[(i + x) % REGISTER_MAP_SIZE]; + } + } else { + /* Shuffle array */ + unsigned int seed = x; + for (i = 0; i < REGISTER_MAP_SIZE - 1; i++) { + int j = i + (rand_r(&seed) % (REGISTER_MAP_SIZE - i)); + int tmp = register_map[j]; + register_map[j] = register_map[i]; + register_map[i] = tmp; + } + } +} + +static int translate(struct ebpf_vm *vm, struct jit_state *state) { + int i; + + /* Save platform non-volatile registers */ + for (i = 0; i < _countof(platform_nonvolatile_registers); i++) { + emit_push(state, platform_nonvolatile_registers[i]); + } + + /* Move first platform parameter register into register 1 */ + if (map_register(1) != platform_parameter_registers[0]) { + emit_mov(state, platform_parameter_registers[0], map_register(BPF_REG_1)); + } + + /* + * Assuming that the stack is 16-byte aligned right before + * the call insn that brought us to this code, when + * we start executing the jit'd code, we need to regain a 16-byte + * alignment. The UBPF_STACK_SIZE is guaranteed to be + * divisible by 16. However, if we pushed an even number of + * registers on the stack when we are saving state (see above), + * then we have to add an additional 8 bytes to get back + * to a 16-byte alignment. + */ + if (!(_countof(platform_nonvolatile_registers) % 2)) { + emit_alu64_imm32(state, 0x81, 5, RSP, 0x8); + } + + /* + * Set BPF R10 (the way to access the frame in eBPF) to match RSP. + */ + emit_mov(state, RSP, map_register(BPF_REG_10)); + + /* Allocate stack space */ + emit_alu64_imm32(state, 0x81, 5, RSP, EBPF_STACK_SIZE); + + /* + * Use a call to set up a place where we can land after eBPF program's + * final EXIT call. This makes it appear to the ebpf programs + * as if they are called like a function. It is their responsibility + * to deal with the non-16-byte aligned stack pointer that goes along + * with this pretense. + */ + emit1(state, 0xe8); + emit4(state, 5); + /* + * We jump over this instruction in the first place; return here + * after the eBPF program is finished executing. 
+ */ + emit_jmp(state, TARGET_PC_EXIT); + + for (i = 0; i < vm->insn_cnt; i++) { + struct libebpf_insn *inst = &vm->insns[i]; + state->pc_locs[i] = state->offset; + + int dst = map_register(inst->dst_reg); + int src = map_register(inst->src_reg); + uint32_t target_pc = i + inst->offset + 1; + + if (i == 0 || vm->begin_of_local_function[i]) { + /* When we are the subject of a call, we have to properly align our + * stack pointer. + */ + emit_alu64_imm32(state, 0x81, 5, RSP, 8); + } + + switch (inst->code) { + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_ADD: + emit_alu32_imm32(state, 0x81, 0, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_ADD: + emit_alu32(state, 0x01, src, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_SUB: + emit_alu32_imm32(state, 0x81, 5, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_SUB: + emit_alu32(state, 0x29, src, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_MUL: + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_MUL: + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_DIV_SDIV: + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_DIV_SDIV: + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_MOD_SMOD: + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_MOD_SMOD: + muldivmod(state, inst->code, src, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_OR: + emit_alu32_imm32(state, 0x81, 1, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_OR: + emit_alu32(state, 0x09, src, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_AND: + emit_alu32_imm32(state, 0x81, 4, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_AND: + emit_alu32(state, 0x21, src, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_LSH: + emit_alu32_imm8(state, 0xc1, 4, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_LSH: + emit_mov(state, src, RCX); + emit_alu32(state, 0xd3, 4, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_RSH: + emit_alu32_imm8(state, 0xc1, 5, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_RSH: + emit_mov(state, src, RCX); + emit_alu32(state, 0xd3, 5, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_NEG: + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_NEG: + emit_alu32(state, 0xf7, 3, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_XOR: + emit_alu32_imm32(state, 0x81, 6, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_XOR: + emit_alu32(state, 0x31, src, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_MOV_MOVSX: + emit_alu32_imm32(state, 0xc7, 0, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_MOV_MOVSX: + emit_mov(state, src, dst); + break; + case BPF_CLASS_ALU | BPF_SOURCE_IMM | BPF_ALU_ARSH: + emit_alu32_imm8(state, 0xc1, 7, dst, inst->imm); + break; + case BPF_CLASS_ALU | BPF_SOURCE_REG | BPF_ALU_ARSH: + emit_mov(state, src, RCX); + emit_alu32(state, 0xd3, 7, dst); + break; + + case BPF_CLASS_ALU | BPF_END_TO_LE | BPF_ALU_END: + /* No-op */ + break; + case BPF_CLASS_ALU | BPF_END_TO_BE | BPF_ALU_END: + if (inst->imm == 16) { + /* rol */ + emit1(state, 0x66); /* 16-bit override */ + emit_alu32_imm8(state, 0xc1, 0, dst, 8); + /* and */ + emit_alu32_imm32(state, 0x81, 4, dst, 0xffff); + } else if (inst->imm == 32 || inst->imm == 64) { + /* bswap */ + emit_basic_rex(state, inst->imm == 64, 0, dst); + emit1(state, 0x0f); + emit1(state, 0xc8 | (dst & 7)); + } + 
break; + + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_ADD: + emit_alu64_imm32(state, 0x81, 0, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_ADD: + emit_alu64(state, 0x01, src, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_SUB: + emit_alu64_imm32(state, 0x81, 5, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_SUB: + emit_alu64(state, 0x29, src, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_MUL: + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_MUL: + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_DIV_SDIV: + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_DIV_SDIV: + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_MOD_SMOD: + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_MOD_SMOD: + muldivmod(state, inst->code, src, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_OR: + emit_alu64_imm32(state, 0x81, 1, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_OR: + emit_alu64(state, 0x09, src, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_AND: + emit_alu64_imm32(state, 0x81, 4, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_AND: + emit_alu64(state, 0x21, src, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_LSH: + emit_alu64_imm8(state, 0xc1, 4, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_LSH: + emit_mov(state, src, RCX); + emit_alu64(state, 0xd3, 4, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_RSH: + emit_alu64_imm8(state, 0xc1, 5, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_RSH: + emit_mov(state, src, RCX); + emit_alu64(state, 0xd3, 5, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_NEG: + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_NEG: + emit_alu64(state, 0xf7, 3, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_XOR: + emit_alu64_imm32(state, 0x81, 6, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_XOR: + emit_alu64(state, 0x31, src, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_MOV_MOVSX: + emit_load_imm(state, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_MOV_MOVSX: + emit_mov(state, src, dst); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_IMM | BPF_ALU_ARSH: + emit_alu64_imm8(state, 0xc1, 7, dst, inst->imm); + break; + case BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_ARSH: + emit_mov(state, src, RCX); + emit_alu64(state, 0xd3, 7, dst); + break; + + /* TODO use 8 bit immediate when possible */ + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JA: + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JA: + emit_jmp(state, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JEQ: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x84, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JEQ: + emit_cmp(state, src, dst); + emit_jcc(state, 0x84, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JGT: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x87, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JGT: + emit_cmp(state, src, dst); + emit_jcc(state, 0x87, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JGE: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x83, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JGE: + emit_cmp(state, src, dst); + 
emit_jcc(state, 0x83, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JLT: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x82, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JLT: + emit_cmp(state, src, dst); + emit_jcc(state, 0x82, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JLE: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x86, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JLE: + emit_cmp(state, src, dst); + emit_jcc(state, 0x86, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JSET: + emit_alu64_imm32(state, 0xf7, 0, dst, inst->imm); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JSET: + emit_alu64(state, 0x85, src, dst); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JNE: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JNE: + emit_cmp(state, src, dst); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JSGT: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8f, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JSGT: + emit_cmp(state, src, dst); + emit_jcc(state, 0x8f, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JSGE: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8d, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JSGE: + emit_cmp(state, src, dst); + emit_jcc(state, 0x8d, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JSLT: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8c, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JSLT: + emit_cmp(state, src, dst); + emit_jcc(state, 0x8c, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_JSLE: + emit_cmp_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8e, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_JSLE: + emit_cmp(state, src, dst); + emit_jcc(state, 0x8e, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JEQ: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x84, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JEQ: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x84, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JGT: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x87, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JGT: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x87, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JGE: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x83, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JGE: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x83, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JLT: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x82, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JLT: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x82, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JLE: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x86, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JLE: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x86, 
target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JSET: + emit_alu32_imm32(state, 0xf7, 0, dst, inst->imm); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JSET: + emit_alu32(state, 0x85, src, dst); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JNE: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JNE: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x85, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JSGT: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8f, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JSGT: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x8f, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JSGE: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8d, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JSGE: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x8d, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JSLT: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8c, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JSLT: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x8c, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_JSLE: + emit_cmp32_imm32(state, dst, inst->imm); + emit_jcc(state, 0x8e, target_pc); + break; + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_JSLE: + emit_cmp32(state, src, dst); + emit_jcc(state, 0x8e, target_pc); + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_CALL: + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_CALL: + case BPF_CLASS_JMP32 | BPF_SOURCE_IMM | BPF_JMP_CALL: + case BPF_CLASS_JMP32 | BPF_SOURCE_REG | BPF_JMP_CALL: + /* We reserve RCX for shifts */ + if (inst->src_reg == 0) { + emit_mov(state, RCX_ALT, RCX); + emit_dispatched_external_helper_call(state, vm, inst->imm); + // if (inst->imm == vm->unwind_stack_extension_index) { + // emit_cmp_imm32(state, map_register(BPF_REG_0), 0); + // emit_jcc(state, 0x84, TARGET_PC_EXIT); + // } + } else if (inst->src_reg == 1) { + target_pc = i + inst->imm + 1; + emit_local_call(state, target_pc); + } + break; + case BPF_CLASS_JMP | BPF_SOURCE_IMM | BPF_JMP_EXIT: + case BPF_CLASS_JMP | BPF_SOURCE_REG | BPF_JMP_EXIT: + /* On entry to every local function we add an additional 8 bytes. + * Undo that here! 
+ */ + emit_alu64_imm32(state, 0x81, 0, RSP, 8); + emit_ret(state); + break; + + case BPF_CLASS_LDX | BPF_LS_SIZE_W | BPF_LS_MODE_MEM: + emit_load(state, S32, src, dst, inst->offset); + break; + case BPF_CLASS_LDX | BPF_LS_SIZE_H | BPF_LS_MODE_MEM: + emit_load(state, S16, src, dst, inst->offset); + break; + case BPF_CLASS_LDX | BPF_LS_SIZE_B | BPF_LS_MODE_MEM: + emit_load(state, S8, src, dst, inst->offset); + break; + case BPF_CLASS_LDX | BPF_LS_SIZE_DW | BPF_LS_MODE_MEM: + emit_load(state, S64, src, dst, inst->offset); + break; + + case BPF_CLASS_ST | BPF_LS_SIZE_W | BPF_LS_MODE_MEM: + emit_store_imm32(state, S32, dst, inst->offset, inst->imm); + break; + case BPF_CLASS_ST | BPF_LS_SIZE_H | BPF_LS_MODE_MEM: + emit_store_imm32(state, S16, dst, inst->offset, inst->imm); + break; + case BPF_CLASS_ST | BPF_LS_SIZE_B | BPF_LS_MODE_MEM: + emit_store_imm32(state, S8, dst, inst->offset, inst->imm); + break; + case BPF_CLASS_ST | BPF_LS_SIZE_DW | BPF_LS_MODE_MEM: + emit_store_imm32(state, S64, dst, inst->offset, inst->imm); + break; + + case BPF_CLASS_STX | BPF_LS_SIZE_W | BPF_LS_MODE_MEM: + emit_store(state, S32, src, dst, inst->offset); + break; + case BPF_CLASS_STX | BPF_LS_SIZE_H | BPF_LS_MODE_MEM: + emit_store(state, S16, src, dst, inst->offset); + break; + case BPF_CLASS_STX | BPF_LS_SIZE_B | BPF_LS_MODE_MEM: + emit_store(state, S8, src, dst, inst->offset); + break; + case BPF_CLASS_STX | BPF_LS_SIZE_DW | BPF_LS_MODE_MEM: + emit_store(state, S64, src, dst, inst->offset); + break; + + case BPF_CLASS_LD | BPF_LS_SIZE_DW | BPF_LS_MODE_IMM: { + struct libebpf_insn *inst2 = &vm->insns[++i]; + uint64_t imm = (uint32_t)inst->imm | ((uint64_t)inst2->imm << 32); + emit_load_imm(state, dst, imm); + break; + } + + default: + ebpf_set_error_string("Unknown instruction at PC %d: opcode %02x", i, inst->code); + return -1; + } + } + + /* Epilogue */ + state->exit_loc = state->offset; + + /* Move register 0 into rax */ + if (map_register(BPF_REG_0) != RAX) { + emit_mov(state, map_register(BPF_REG_0), RAX); + } + + /* Deallocate stack space by restoring RSP from BPF R10. */ + emit_mov(state, map_register(BPF_REG_10), RSP); + + if (!(_countof(platform_nonvolatile_registers) % 2)) { + emit_alu64_imm32(state, 0x81, 0, RSP, 0x8); + } + + /* Restore platform non-volatile registers */ + for (i = 0; i < _countof(platform_nonvolatile_registers); i++) { + emit_pop(state, platform_nonvolatile_registers[_countof(platform_nonvolatile_registers) - i - 1]); + } + + emit1(state, 0xc3); /* ret */ + + state->retpoline_loc = emit_retpoline(state); + state->dispatcher_loc = emit_dispatched_external_helper_address(state, vm); + + return 0; +} + +static void muldivmod(struct jit_state *state, uint8_t opcode, int src, int dst, int32_t imm) { + bool mul = (opcode & BPF_ALU_CODE_MASK) == BPF_ALU_MUL; + bool div = (opcode & BPF_ALU_CODE_MASK) == BPF_ALU_DIV_SDIV; + bool mod = (opcode & BPF_ALU_CODE_MASK) == BPF_ALU_MOD_SMOD; + bool is64 = (opcode & BPF_ALU_CLASS_MASK) == BPF_CLASS_ALU64; + bool reg = (opcode & BPF_ALU_SOURCE_MASK) == BPF_SOURCE_REG; + + // Short circuit for imm == 0. + if (!reg && imm == 0) { + if (div || mul) { + // For division and multiplication, set result to zero. + emit_alu32(state, 0x31, dst, dst); + } else { + // For modulo, set result to dividend. + emit_mov(state, dst, dst); + } + return; + } + + if (dst != RAX) { + emit_push(state, RAX); + } + + if (dst != RDX) { + emit_push(state, RDX); + } + + // Load the divisor into RCX. 
+ if (imm) { + emit_load_imm(state, RCX, imm); + } else { + emit_mov(state, src, RCX); + } + + // Load the dividend into RAX. + emit_mov(state, dst, RAX); + + // BPF has two different semantics for division and modulus. For division + // if the divisor is zero, the result is zero. For modulus, if the divisor + // is zero, the result is the dividend. To handle this we set the divisor + // to 1 if it is zero and then set the result to zero if the divisor was + // zero (for division) or set the result to the dividend if the divisor was + // zero (for modulo). + + if (div || mod) { + // Check if divisor is zero. + if (is64) { + emit_alu64(state, 0x85, RCX, RCX); + } else { + emit_alu32(state, 0x85, RCX, RCX); + } + + // Save the dividend for the modulo case. + if (mod) { + emit_push(state, RAX); // Save dividend. + } + + // Save the result of the test. + emit1(state, 0x9c); /* pushfq */ + + // Set the divisor to 1 if it is zero. + emit_load_imm(state, RDX, 1); + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xca); /* cmove rcx,rdx */ + + /* xor %edx,%edx */ + emit_alu32(state, 0x31, RDX, RDX); + } + + if (is64) { + emit_rex(state, 1, 0, 0, 0); + } + + // Multiply or divide. + emit_alu32(state, 0xf7, mul ? 4 : 6, RCX); + + // Division operation stores the remainder in RDX and the quotient in RAX. + if (div || mod) { + // Restore the result of the test. + emit1(state, 0x9d); /* popfq */ + + // If zero flag is set, then the divisor was zero. + + if (div) { + // Set the dividend to zero if the divisor was zero. + emit_load_imm(state, RCX, 0); + + // Store 0 in RAX if the divisor was zero. + // Use conditional move to avoid a branch. + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xc1); /* cmove rax,rcx */ + } else { + // Restore dividend to RCX. + emit_pop(state, RCX); + + // Store the dividend in RAX if the divisor was zero. + // Use conditional move to avoid a branch. 
+ emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xd1); /* cmove rdx,rcx */ + } + } + + if (dst != RDX) { + if (mod) { + emit_mov(state, RDX, dst); + } + emit_pop(state, RDX); + } + if (dst != RAX) { + if (div || mul) { + emit_mov(state, RAX, dst); + } + emit_pop(state, RAX); + } +} + +static bool resolve_patchable_relatives(struct jit_state *state) { + int i; + for (i = 0; i < state->num_jumps; i++) { + struct patchable_relative jump = state->jumps[i]; + + int target_loc; + if (jump.target_offset != 0) { + target_loc = jump.target_offset; + } else if (jump.target_pc == TARGET_PC_EXIT) { + target_loc = state->exit_loc; + } else if (jump.target_pc == TARGET_PC_RETPOLINE) { + target_loc = state->retpoline_loc; + } else { + target_loc = state->pc_locs[jump.target_pc]; + } + + /* Assumes jump offset is at end of instruction */ + uint32_t rel = target_loc - (jump.offset_loc + sizeof(uint32_t)); + + uint8_t *offset_ptr = &state->buf[jump.offset_loc]; + memcpy(offset_ptr, &rel, sizeof(uint32_t)); + } + for (i = 0; i < state->num_loads; i++) { + struct patchable_relative load = state->loads[i]; + + int target_loc; + if (load.target_pc == TARGET_PC_EXTERNAL_DISPATCHER) { + target_loc = state->dispatcher_loc; + } else { + target_loc = -1; + return false; + } + + /* Assumes jump offset is at end of instruction */ + uint32_t rel = target_loc - (load.offset_loc + sizeof(uint32_t)); + + uint8_t *offset_ptr = &state->buf[load.offset_loc]; + memcpy(offset_ptr, &rel, sizeof(uint32_t)); + } + return true; +} + +int ebpf_translate(struct ebpf_vm *vm, uint8_t **buffer, size_t *size) { + struct jit_state state; + int result = -1; + state.offset = 0; + *size = state.size = LIBEBPF_MAX_INSTRUCTION_COUNT * 8; + state.buf = _libebpf_global_malloc(LIBEBPF_MAX_INSTRUCTION_COUNT * 8); + state.pc_locs = _libebpf_global_malloc((LIBEBPF_MAX_INSTRUCTION_COUNT + 1) * sizeof(state.pc_locs[0])); + state.jumps = _libebpf_global_malloc(LIBEBPF_MAX_INSTRUCTION_COUNT * sizeof(state.jumps[0])); + state.loads = _libebpf_global_malloc(LIBEBPF_MAX_INSTRUCTION_COUNT * sizeof(state.loads[0])); + state.num_jumps = 0; + state.num_loads = 0; + if (!state.pc_locs || !state.jumps) { + ebpf_set_error_string("Out of memory"); + goto err; + } + + if (translate(vm, &state) < 0) { + goto err; + } + + if (state.num_jumps == LIBEBPF_MAX_INSTRUCTION_COUNT) { + ebpf_set_error_string("Excessive number of jump targets"); + goto err; + } + + if (state.offset == state.size) { + ebpf_set_error_string("Target buffer too small"); + goto err; + } + + if (!resolve_patchable_relatives(&state)) { + ebpf_set_error_string("Could not patch the relative addresses in the JIT'd code."); + goto err; + } + + result = 0; + *size = state.offset; + *buffer = state.buf; + goto out; +err: + _libebpf_global_free(state.buf); +out: + _libebpf_global_free(state.pc_locs); + _libebpf_global_free(state.jumps); + _libebpf_global_free(state.loads); + return result; +} diff --git a/src/jit/x86_64/ubpf_jit_x86_64.h b/src/jit/x86_64/ubpf_jit_x86_64.h new file mode 100644 index 0000000..6b1941a --- /dev/null +++ b/src/jit/x86_64/ubpf_jit_x86_64.h @@ -0,0 +1,369 @@ +// Copyright (c) 2015 Big Switch Networks, Inc +// SPDX-License-Identifier: Apache-2.0 + +/* + * Copyright 2015 Big Switch Networks, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Generic x86-64 code generation functions + */ + +#ifndef UBPF_JIT_X86_64_H +#define UBPF_JIT_X86_64_H + +#include +#include +#include + +#include + +#define RAX 0 +#define RCX 1 +#define RDX 2 +#define RBX 3 +#define RSP 4 +#define RBP 5 +#define RIP 5 +#define RSI 6 +#define RDI 7 +#define R8 8 +#define R9 9 +#define R10 10 +#define R11 11 +#define R12 12 +#define R13 13 +#define R14 14 +#define R15 15 + +enum operand_size { + S8, + S16, + S32, + S64, +}; + +struct patchable_relative { + uint32_t offset_loc; + uint32_t target_pc; + uint32_t target_offset; +}; + +/* Special values for target_pc in struct jump */ +#define TARGET_PC_EXIT -1 +#define TARGET_PC_RETPOLINE -3 +#define TARGET_PC_EXTERNAL_DISPATCHER -4 + +struct jit_state { + uint8_t *buf; + uint32_t offset; + uint32_t size; + uint32_t *pc_locs; + uint32_t exit_loc; + uint32_t unwind_loc; + uint32_t retpoline_loc; + uint32_t dispatcher_loc; + struct patchable_relative *jumps; + struct patchable_relative *loads; + int num_jumps; + int num_loads; +}; + +static inline void emit_bytes(struct jit_state *state, void *data, uint32_t len) { + assert(state->offset <= state->size - len); + if ((state->offset + len) > state->size) { + state->offset = state->size; + return; + } + memcpy(state->buf + state->offset, data, len); + state->offset += len; +} + +static inline void emit1(struct jit_state *state, uint8_t x) { + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit2(struct jit_state *state, uint16_t x) { + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit4(struct jit_state *state, uint32_t x) { + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit8(struct jit_state *state, uint64_t x) { + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit_jump_target_address(struct jit_state *state, int32_t target_pc) { + if (state->num_jumps == LIBEBPF_MAX_INSTRUCTION_COUNT) { + return; + } + struct patchable_relative *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = state->offset; + jump->target_pc = target_pc; + emit4(state, 0); +} + +static inline void emit_jump_target_offset(struct jit_state *state, uint32_t jump_loc, uint32_t jump_state_offset) { + if (state->num_jumps == LIBEBPF_MAX_INSTRUCTION_COUNT) { + return; + } + struct patchable_relative *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = jump_loc; + jump->target_offset = jump_state_offset; +} + +static inline void emit_modrm(struct jit_state *state, int mod, int r, int m) { + assert(!(mod & ~0xc0)); + emit1(state, (mod & 0xc0) | ((r & 7) << 3) | (m & 7)); +} + +static inline void emit_modrm_reg2reg(struct jit_state *state, int r, int m) { + emit_modrm(state, 0xc0, r, m); +} + +static inline void emit_modrm_and_displacement(struct jit_state *state, int r, int m, int32_t d) { + if (d == 0 && (m & 7) != RBP) { + emit_modrm(state, 0x00, r, m); + } else if (d >= -128 && d <= 127) { + emit_modrm(state, 0x40, r, m); + emit1(state, d); + } else { + emit_modrm(state, 0x80, r, m); + emit4(state, d); + } +} + +static inline void emit_rex(struct jit_state *state, int w, int r, int x, int b) { + 
assert(!(w & ~1)); + assert(!(r & ~1)); + assert(!(x & ~1)); + assert(!(b & ~1)); + emit1(state, 0x40 | (w << 3) | (r << 2) | (x << 1) | b); +} + +/* + * Emits a REX prefix with the top bit of src and dst. + * Skipped if no bits would be set. + */ +static inline void emit_basic_rex(struct jit_state *state, int w, int src, int dst) { + if (w || (src & 8) || (dst & 8)) { + emit_rex(state, w, !!(src & 8), 0, !!(dst & 8)); + } +} + +static inline void emit_push(struct jit_state *state, int r) { + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x50 | (r & 7)); +} + +static inline void emit_pop(struct jit_state *state, int r) { + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x58 | (r & 7)); +} + +/* REX prefix and ModRM byte */ +/* We use the MR encoding when there is a choice */ +/* 'src' is often used as an opcode extension */ +static inline void emit_alu32(struct jit_state *state, int op, int src, int dst) { + emit_basic_rex(state, 0, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +} + +/* REX prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu32_imm32(struct jit_state *state, int op, int src, int dst, int32_t imm) { + emit_alu32(state, op, src, dst); + emit4(state, imm); +} + +/* REX prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu32_imm8(struct jit_state *state, int op, int src, int dst, int8_t imm) { + emit_alu32(state, op, src, dst); + emit1(state, imm); +} + +/* REX.W prefix and ModRM byte */ +/* We use the MR encoding when there is a choice */ +/* 'src' is often used as an opcode extension */ +static inline void emit_alu64(struct jit_state *state, int op, int src, int dst) { + emit_basic_rex(state, 1, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +} + +/* REX.W prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu64_imm32(struct jit_state *state, int op, int src, int dst, int32_t imm) { + emit_alu64(state, op, src, dst); + emit4(state, imm); +} + +/* REX.W prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu64_imm8(struct jit_state *state, int op, int src, int dst, int8_t imm) { + emit_alu64(state, op, src, dst); + emit1(state, imm); +} + +/* Register to register mov */ +static inline void emit_mov(struct jit_state *state, int src, int dst) { + emit_alu64(state, 0x89, src, dst); +} + +static inline void emit_cmp_imm32(struct jit_state *state, int dst, int32_t imm) { + emit_alu64_imm32(state, 0x81, 7, dst, imm); +} + +static inline void emit_cmp32_imm32(struct jit_state *state, int dst, int32_t imm) { + emit_alu32_imm32(state, 0x81, 7, dst, imm); +} + +static inline void emit_cmp(struct jit_state *state, int src, int dst) { + emit_alu64(state, 0x39, src, dst); +} + +static inline void emit_cmp32(struct jit_state *state, int src, int dst) { + emit_alu32(state, 0x39, src, dst); +} + +static inline void emit_jcc(struct jit_state *state, int code, int32_t target_pc) { + emit1(state, 0x0f); + emit1(state, code); + emit_jump_target_address(state, target_pc); +} + +/* Load [src + offset] into dst */ +static inline void emit_load(struct jit_state *state, enum operand_size size, int src, int dst, int32_t offset) { + emit_basic_rex(state, size == S64, dst, src); + + if (size == S8 || size == S16) { + /* movzx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 
0xb6 : 0xb7); + } else if (size == S32 || size == S64) { + /* mov */ + emit1(state, 0x8b); + } + + emit_modrm_and_displacement(state, dst, src, offset); +} + +/* Load sign-extended immediate into register */ +static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm) { + if (imm >= INT32_MIN && imm <= INT32_MAX) { + emit_alu64_imm32(state, 0xc7, 0, dst, imm); + } else { + /* movabs $imm,dst */ + emit_basic_rex(state, 1, 0, dst); + emit1(state, 0xb8 | (dst & 7)); + emit8(state, imm); + } +} + +/* Load sign-extended immediate into register */ +static inline void emit_load_relative(struct jit_state *state, int target_pc) { + if (state->num_loads == LIBEBPF_MAX_INSTRUCTION_COUNT) { + return; + } + emit1(state, 0x48); + emit1(state, 0x8b); + emit1(state, 0x05); + struct patchable_relative *load = &state->loads[state->num_loads++]; + load->offset_loc = state->offset; + load->target_pc = target_pc; + emit4(state, 0); +} + +/* Store register src to [dst + offset] */ +static inline void emit_store(struct jit_state *state, enum operand_size size, int src, int dst, int32_t offset) { + if (size == S16) { + emit1(state, 0x66); /* 16-bit override */ + } + int rexw = size == S64; + if (rexw || src & 8 || dst & 8 || size == S8) { + emit_rex(state, rexw, !!(src & 8), 0, !!(dst & 8)); + } + emit1(state, size == S8 ? 0x88 : 0x89); + emit_modrm_and_displacement(state, src, dst, offset); +} + +/* Store immediate to [dst + offset] */ +static inline void emit_store_imm32(struct jit_state *state, enum operand_size size, int dst, int32_t offset, int32_t imm) { + if (size == S16) { + emit1(state, 0x66); /* 16-bit override */ + } + emit_basic_rex(state, size == S64, 0, dst); + emit1(state, size == S8 ? 0xc6 : 0xc7); + emit_modrm_and_displacement(state, 0, dst, offset); + if (size == S32 || size == S64) { + emit4(state, imm); + } else if (size == S16) { + emit2(state, imm); + } else if (size == S8) { + emit1(state, imm); + } +} + +static inline void emit_ret(struct jit_state *state) { + emit1(state, 0xc3); +} + +static inline void emit_jmp(struct jit_state *state, uint32_t target_pc) { + emit1(state, 0xe9); + emit_jump_target_address(state, target_pc); +} + +static inline void emit_dispatched_external_helper_call(struct jit_state *state, const struct ebpf_vm *vm, unsigned int idx) { + /* + * When we enter here, our stack is 16-byte aligned. Keep + * it that way! + */ + + // Save r9 -- I need it for a parameter! + emit_push(state, R9); + + // Before it's a parameter, use it for a push. + // Set vm instance itself as cookie. so we can access the vm instance when helper executes + emit_load_imm(state, R9, (uintptr_t)vm); + emit_push(state, R9); + + emit_load_imm(state, R9, (uint64_t)idx); + emit_load_relative(state, TARGET_PC_EXTERNAL_DISPATCHER); + +#ifndef UBPF_DISABLE_RETPOLINES + emit1(state, 0xe8); // e8 is the opcode for a CALL + emit_jump_target_address(state, TARGET_PC_RETPOLINE); +#else + /* TODO use direct call when possible */ + /* callq *%rax */ + emit1(state, 0xff); + // ModR/M byte: b11010000b = xd + // ^ + // register-direct addressing. + // ^ + // opcode extension (2) + // ^ + // rax is register 0 + emit1(state, 0xd0); +#endif + + // The result is in RAX. Nothing to do there. + // Just rationalize the stack! + + emit_pop(state, R9); // First one is a throw away (it's where our parameter was!) + emit_pop(state, R9); // This one is real! 
+} + +#endif diff --git a/src/libebpf.c b/src/libebpf.c index ba3cd66..1ea8b8c 100644 --- a/src/libebpf.c +++ b/src/libebpf.c @@ -1,13 +1,43 @@ #include "libebpf_insn.h" -#include +#include "libebpf_vm.h" +#include +#include #include #include #include #include +#define IS_UNIX_LIKE defined(__unix__) || defined(__linux__) +#ifdef IS_UNIX_LIKE +#include +#endif ebpf_malloc _libebpf_global_malloc = &malloc; ebpf_free _libebpf_global_free = &free; ebpf_realloc _libebpf_global_realloc = &realloc; +#ifdef IS_UNIX_LIKE +static void *allocate_and_copy(void *buf, size_t bufsize) { + void *mem = mmap(0, bufsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ebpf_set_error_string("Unable to mmap"); + return NULL; + } + memcpy(mem, buf, bufsize); + if (mprotect(mem, bufsize, PROT_READ | PROT_EXEC) < 0) { + ebpf_set_error_string("Unable to set mprotect"); + munmap(mem, bufsize); + return NULL; + } + return mem; +} + +ebpf_allocate_execuable_memory_and_copy _libebpf_executable_allocator = &allocate_and_copy; +ebpf_release_executable_memory _libebpf_executor_release = &munmap; +#else + +ebpf_allocate_execuable_memory_and_copy _libebpf_executable_allocator = NULL; +ebpf_release_executable_memory _libebpf_executor_release = NULL; +#endif + char _libebpf_global_error_string[1024] = ""; void ebpf_set_global_memory_allocator(ebpf_malloc malloc, ebpf_free free, ebpf_realloc realloc) { @@ -40,6 +70,13 @@ void ebpf_vm_destroy(ebpf_vm_t *vm) { _libebpf_global_free(vm->helpers); if (vm->insns) _libebpf_global_free(vm->insns); + if (vm->begin_of_local_function) + _libebpf_global_free(vm->begin_of_local_function); + if (vm->translated_code) + _libebpf_global_free(vm->translated_code); + if (vm->jit_mapped_page) { + _libebpf_executor_release(vm->jit_mapped_page, vm->jit_size); + } _libebpf_global_free(vm); } @@ -63,19 +100,50 @@ int ebpf_vm_load_instructions(ebpf_vm_t *vm, const struct libebpf_insn *code, si return -EEXIST; } vm->insns = _libebpf_global_malloc(sizeof(struct libebpf_insn) * code_len); + vm->begin_of_local_function = _libebpf_global_malloc(sizeof(bool) * code_len); if (!vm->insns) { ebpf_set_error_string("Failed to call malloc"); return -ENOMEM; } + if (!vm->begin_of_local_function) { + ebpf_set_error_string("Failed to call malloc"); + _libebpf_global_free(vm->insns); + return -ENOMEM; + } + vm->insn_cnt = code_len; + memset(vm->begin_of_local_function, 0, sizeof(bool) * vm->insn_cnt); memcpy(vm->insns, code, sizeof(struct libebpf_insn) * code_len); + + for (size_t i = 0; i < vm->insn_cnt; i++) { + if (code[i].code == (BPF_CLASS_JMP | BPF_SOURCE_K | BPF_JMP_CALL) || code[i].code == (BPF_CLASS_JMP | BPF_SOURCE_X | BPF_JMP_CALL) || + code[i].code == (BPF_CLASS_JMP32 | BPF_SOURCE_K | BPF_JMP_CALL) || code[i].code == (BPF_CLASS_JMP32 | BPF_SOURCE_X | BPF_JMP_CALL)) { + if (code[i].src_reg == 1) { + // Call to a local function + uint32_t target = i + vm->insns[i].imm + 1; + vm->begin_of_local_function[target] = true; + } + } + } return 0; } void ebpf_vm_unload_instructions(ebpf_vm_t *vm) { if (vm->insns) { _libebpf_global_free(vm->insns); vm->insns = NULL; + _libebpf_global_free(vm->begin_of_local_function); + vm->begin_of_local_function = NULL; vm->insn_cnt = 0; } + if (vm->translated_code) { + _libebpf_global_free(vm->translated_code); + vm->translated_code = NULL; + vm->translated_code_size = 0; + } + if (vm->jit_mapped_page) { + _libebpf_executor_release(vm->jit_mapped_page, vm->jit_size); + vm->jit_mapped_page = NULL; + vm->jit_size = 0; + } } void 
ebpf_vm_set_ld64_helpers(ebpf_vm_t *vm, ebpf_map_by_fd_callback map_by_fd, ebpf_map_by_idx_callback map_by_idx, ebpf_map_val_callback map_val, @@ -86,3 +154,43 @@ void ebpf_vm_set_ld64_helpers(ebpf_vm_t *vm, ebpf_map_by_fd_callback map_by_fd, vm->map_by_idx = map_by_idx; vm->map_val = map_val; } + +static int prepare_translated_code(ebpf_vm_t *vm) { + if (vm->translated_code) + return 0; + return ebpf_translate(vm, &vm->translated_code, &vm->translated_code_size); +} + +static int prepare_executable_page(ebpf_vm_t *vm) { + int err; + err = prepare_translated_code(vm); + if (err < 0) { + goto out; + } + if (_libebpf_executable_allocator != NULL) { + void *page = _libebpf_executable_allocator(vm->translated_code, vm->translated_code_size); + if (!page) { + err = -1; + goto out; + } + vm->jit_mapped_page = page; + vm->jit_size = vm->translated_code_size; + } else { + ebpf_set_error_string("Executable page allocator has not been set"); + err = -1; + goto out; + } + +out: + return err; +} + +ebpf_jit_fn ebpf_vm_compile(ebpf_vm_t *vm) { + if (prepare_translated_code(vm) < 0) { + return NULL; + } + if (prepare_executable_page(vm) < 0) { + return NULL; + } + return (ebpf_jit_fn)vm->jit_mapped_page; +} diff --git a/src/libebpf_internal.h b/src/libebpf_internal.h index 85b411b..d255a96 100644 --- a/src/libebpf_internal.h +++ b/src/libebpf_internal.h @@ -13,15 +13,30 @@ struct ebpf_external_helper_definition { }; struct ebpf_vm { + // Helper definitions struct ebpf_external_helper_definition *helpers; + // Count of loaded instructions size_t insn_cnt; + // Loaded instructions. NULL means not loaded yet struct libebpf_insn *insns; + // LDDW helpers ebpf_map_by_fd_callback map_by_fd; ebpf_map_by_idx_callback map_by_idx; ebpf_map_val_callback map_val; ebpf_code_addr_callback code_addr; ebpf_var_addr_callback var_addr; + // Enable bounds check? bool bounds_check_enabled; + // Whether the pc marks the start of a local function + bool *begin_of_local_function; + + // Translated code + uint8_t *translated_code; + size_t translated_code_size; + + // Mapped pagefor execution + void *jit_mapped_page; + size_t jit_size; }; extern char _libebpf_global_error_string[1024]; @@ -51,15 +66,15 @@ static inline int bit_test_mask(uint64_t m, uint64_t msk, uint64_t pat) { /** * @brief Only for unit tests. 
Directly call a helper - * - * @param vm - * @param idx - * @param a - * @param b - * @param c - * @param d - * @param e - * @return uint64_t + * + * @param vm + * @param idx + * @param a + * @param b + * @param c + * @param d + * @param e + * @return uint64_t */ static inline uint64_t ebpf_vm_call_helper(ebpf_vm_t *vm, int idx, uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) { return vm->helpers[idx].fn(a, b, c, d, e); @@ -68,4 +83,7 @@ static inline uint64_t ebpf_vm_call_helper(ebpf_vm_t *vm, int idx, uint64_t a, u extern ebpf_malloc _libebpf_global_malloc; extern ebpf_free _libebpf_global_free; extern ebpf_realloc _libebpf_global_realloc; + +extern ebpf_allocate_execuable_memory_and_copy _libebpf_executable_allocator; +extern ebpf_release_executable_memory _libebpf_executor_release; #endif diff --git a/src/libebpf_vm.c b/src/libebpf_vm.c index 4fab41e..92f80a7 100644 --- a/src/libebpf_vm.c +++ b/src/libebpf_vm.c @@ -23,7 +23,7 @@ static inline bool ebpf_runtime_bound_check(const struct ebpf_vm *vm, void *addr } else if (addr >= stack && ((char *)addr + size) <= ((char *)stack + EBPF_STACK_SIZE)) { return true; } else { - ebpf_set_error_string("ebpf error: out of bounds memory %s at PC %u, addr %p, size %d\nmem %p/%zd stack %p/%d\n", type, cur_pc, addr, size, + ebpf_set_error_string("ebpf error: out of bounds memory %s at PC %u, addr %p, size %d, mem %p/%zd stack %p/%d", type, cur_pc, addr, size, mem, mem_len, stack, EBPF_STACK_SIZE); return false; } diff --git a/vm-test/test-cases/err-stack-oob.data b/vm-test/test-cases/err-stack-oob.data index 5ecd871..ca16714 100644 --- a/vm-test/test-cases/err-stack-oob.data +++ b/vm-test/test-cases/err-stack-oob.data @@ -2,7 +2,7 @@ stb [r10], 0 exit -- error pattern -ebpf error: out of bounds memory store at PC 0, addr .*, size 1 +ebpf error: out of bounds memory store at PC 0, addr .*, size 1, mem .* stack .* -- result 0xffffffffffffffff -- no jit diff --git a/vm-test/test_framework/test_jit.py b/vm-test/test_framework/test_jit.py new file mode 100644 index 0000000..5abd51c --- /dev/null +++ b/vm-test/test_framework/test_jit.py @@ -0,0 +1,111 @@ +import os +import platform +import tempfile +import struct +import re +from subprocess import Popen, PIPE +from nose.plugins.skip import Skip, SkipTest +import ebpf.assembler +import testdata +import pytest +VM = os.path.join(os.path.dirname(os.path.realpath(__file__)), + "../..", "build/vm-test/libebpf_test_runner") +_test_data_dir = os.path.join(os.path.dirname( + os.path.realpath(__file__)), "../test-cases") +try: + xrange +except NameError: + xrange = range + + +def jit_supported_platform(): + """Is the JIT supported on the current platform.""" + return platform.machine() in ['amd64', 'x86_64', 'arm64', 'aarch64'] + + +def check_datafile(filename): + print("Process", filename) + """ + Given assembly source code and an expected result, run the eBPF program and + verify that the result matches. Uses the JIT compiler. 
+ """ + if not jit_supported_platform(): + raise SkipTest("JIT is not supported on the current platform") + + data = testdata.read(_test_data_dir, filename) + if 'asm' not in data and 'raw' not in data: + raise SkipTest("no asm or raw section in datafile") + if 'result' not in data and 'error' not in data and 'error pattern' not in data: + raise SkipTest("no result or error section in datafile") + if not os.path.exists(VM): + raise SkipTest("VM not found") + if 'no jit' in data: + raise SkipTest("JIT disabled for this testcase (%s)" % data['no jit']) + + if 'raw' in data: + code = b''.join(struct.pack("=Q", x) for x in data['raw']) + else: + code = ebpf.assembler.assemble(data['asm']) + + memfile = None + + if 'mem' in data: + memfile = tempfile.NamedTemporaryFile() + memfile.write(data['mem']) + memfile.flush() + + num_register_offsets = 20 + if 'no register offset' in data: + # The JIT relies on a fixed register mapping for the call instruction + num_register_offsets = 1 + + try: + cmd = [VM] + if memfile: + cmd.extend(['-m', memfile.name]) + if 'reload' in data: + cmd.extend(['-R']) + if 'unload' in data: + cmd.extend(['-U']) + cmd.extend(['-j', '-']) + + vm = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) + + stdout, stderr = vm.communicate(code) + stdout = stdout.decode("utf-8") + stderr = stderr.decode("utf-8") + stderr = stderr.strip() + + if 'error' in data: + if data['error'] != stderr: + raise AssertionError( + "Expected error %r, got %r" % (data['error'], stderr)) + elif 'error pattern' in data: + if not re.search(data['error pattern'], stderr): + raise AssertionError("Expected error matching %r, got %r" % ( + data['error pattern'], stderr)) + else: + if stderr: + raise AssertionError("Unexpected error %r" % stderr) + + if 'result' in data: + if vm.returncode != 0: + raise AssertionError( + "VM exited with status %d, stderr=%r" % (vm.returncode, stderr)) + expected = int(data['result'], 0) + result = int(stdout, 0) + if expected != result: + raise AssertionError( + "Expected result 0x%x, got 0x%x, stderr=%r" % (expected, result, stderr)) + else: + if vm.returncode == 0: + raise AssertionError("Expected VM to exit with an error code") + finally: + if memfile: + memfile.close() + +@pytest.mark.parametrize("filename", testdata.list_files(_test_data_dir)) +# @pytest.mark.parametrize("filename", ["rsh32.data"]) +def test_datafiles(filename): + # This is now a regular test function that will be called once for each filename + check_datafile(filename) diff --git a/vm-test/test_runner/test_runner.c b/vm-test/test_runner/test_runner.c index d58539f..5fb85fa 100644 --- a/vm-test/test_runner/test_runner.c +++ b/vm-test/test_runner/test_runner.c @@ -59,14 +59,15 @@ int main(int argc, char **argv) { bool reload = false; int opt; - while ((opt = getopt_long(argc, argv, "hm:UR", longopts, NULL)) != -1) { + bool jit = false; + while ((opt = getopt_long(argc, argv, "hm:URj", longopts, NULL)) != -1) { switch (opt) { case 'm': mem_filename = optarg; break; - // case 'j': - // jit = true; - // break; + case 'j': + jit = true; + break; // case 'r': // #if defined(__x86_64__) || defined(_M_X64) // ebpf_set_register_offset(atoi(optarg)); @@ -161,22 +162,21 @@ int main(int argc, char **argv) { uint64_t ret; - // if (jit) { - // ebpf_jit_fn fn = ebpf_compile(vm, &errmsg); - // if (fn == NULL) { - // fprintf(stderr, "Failed to compile: %s\n", errmsg); - // free(errmsg); - // free(mem); - // return 1; - // } - // ret = fn(mem, mem_len); - // } else { - - // } - if (ebpf_vm_run(vm, mem, mem_len, &ret) < 
0) { - ret = UINT64_MAX; - fprintf(stderr, "%s", ebpf_error_string()); + if (jit) { + ebpf_jit_fn fn = ebpf_vm_compile(vm); + if (fn == NULL) { + fprintf(stderr, "Failed to compile: %s\n", errmsg); + free(mem); + return 1; + } + ret = fn(mem, mem_len); + } else { + if (ebpf_vm_run(vm, mem, mem_len, &ret) < 0) { + ret = UINT64_MAX; + fprintf(stderr, "%s", ebpf_error_string()); + } } + printf("0x%" PRIx64 "\n", ret); ebpf_vm_destroy(vm);
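
Addendum (illustrative, not part of the patch): the sketch below mirrors the new "Test execution with JIT" case from `execution-test/test_with_ebpf.cpp`, but as a plain-C caller of the API this patch introduces (`ebpf_vm_compile()` returning the new `ebpf_jit_fn` signature `uint64_t (*)(void *mem, size_t mem_len)`). It assumes a Unix-like x86_64 host so the default mmap-based executable-memory allocator added in `src/libebpf.c` is in effect; the two-instruction program and the (100, 5000) arguments are taken directly from the test case.

```c
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include "libebpf.h"
#include "libebpf_insn.h"
#include "libebpf_vm.h"

int main(void) {
    ebpf_vm_t *vm = ebpf_vm_create();
    if (!vm)
        return 1;

    /* Same program as the new JIT test case:
     *   r1 += r2
     *   r0 = r1
     * The jitted entry point receives (mem, mem_len) in r1/r2 and
     * returns r0, so the expected result is mem + mem_len. */
    struct libebpf_insn insns[] = {
        BPF_RAW_INSN(BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_ADD, 1, 2, 0, 0),
        BPF_RAW_INSN(BPF_CLASS_ALU64 | BPF_SOURCE_REG | BPF_ALU_MOV_MOVSX, 0, 1, 0, 0),
    };
    if (ebpf_vm_load_instructions(vm, insns, sizeof(insns) / sizeof(insns[0])) != 0) {
        fprintf(stderr, "load failed: %s\n", ebpf_error_string());
        ebpf_vm_destroy(vm);
        return 1;
    }

    /* ebpf_vm_compile() translates the byte code, copies it onto an
     * executable page, and returns the entry point (NULL on failure). */
    ebpf_jit_fn fn = ebpf_vm_compile(vm);
    if (!fn) {
        fprintf(stderr, "compile failed: %s\n", ebpf_error_string());
        ebpf_vm_destroy(vm);
        return 1;
    }

    uint64_t ret = fn((void *)100, (size_t)5000);
    printf("jit result: %llu\n", (unsigned long long)ret); /* expect 5100 */

    ebpf_vm_destroy(vm);
    return 0;
}
```

For embedders that want to manage code memory themselves, the patch also exposes `ebpf_translate()` (which hands back the raw machine-code buffer, to be freed with the global free hook) and `ebpf_set_executable_memory_allocator()` for swapping out the mmap/mprotect defaults on non-Unix targets.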