add CONFIG_BKL_SWAP_MANUAL
Signed-off-by: Axel Heider <axelheider@gmx.de>
axel-h committed Nov 10, 2024
1 parent 3a79a19 commit 4ea3b9f
Showing 5 changed files with 53 additions and 60 deletions.
65 changes: 35 additions & 30 deletions include/arch/arm/arch/model/smp.h
@@ -16,47 +16,52 @@ static inline cpu_id_t cpuIndexToID(word_t index)
return BIT(index);
}

static inline bool_t try_arch_atomic_exchange_rlx(void *ptr, void *new_val, void **prev)
#ifdef CONFIG_BKL_SWAP_MANUAL
static inline void *arch_atomic_exchange(void *head, void *node)
{
/* The ARM architecture had an atomic swap instruction in ARMv6, which got
* deprecated and was replaced by exclusive load and store instructions in
* ARMv7. The compiler builtin __atomic_exchange_n() executes them in a loop
* for as long as the exclusive store fails. Unfortunately, this approach can
* result in an undefined delay of up to seconds in the worst case if other
* cores touch memory in the same exclusivity reservation granule. The
* actual granule size is implementation specific. The explicit load and
* store instructions below, which report back success or failure, are the
* only way to work around this: the caller can loop over calling this
* function and perform other maintenance operations or wait on failure.
/* ARMv6 had an atomic swap instruction. ARMv7 deprecated it and recommended
* using exclusive load and store instructions instead, which the compiler
* builtin __atomic_exchange_n() executes in a loop for as long as the
* exclusive store fails. Unfortunately, this approach can result in an
* undefined delay of up to seconds in the worst case if other cores touch
* memory in the same exclusivity reservation granule. The actual granule
* size is implementation specific; for modern SMP-optimized cores it is
* expected to be small.
* ARMv8.1 eventually brought the atomic swap instruction back as part of
* LSE (Large System Extensions), and GCC support was added in v9.4. Thus,
* using __atomic_exchange_n() becomes an option when "-march=armv8.1-a" is
* passed for platforms whose cores implement ARMv8.1 or higher. Details
* remain to be analyzed.
*/
uint32_t atomic_status;
void *temp;

asm volatile(
LD_EX "%[prev_output], [%[ptr_val]] \n\t" /* ret = *ptr */
ST_EX "%" OP_WIDTH "[atomic_var], %[new_val] , [%[ptr_val]] \n\t" /* *ptr = new */
: [atomic_var] "=&r"(atomic_status), [prev_output]"=&r"(temp) /* output */
: [ptr_val] "r"(ptr), [new_val] "r"(new_val) /* input */
:
);

if (0 != atomic_status) {
/* Specs say 0 indicates success and 1 an exclusivity failure; any other
* value is not defined. If the atomic operation has failed, prev is
* left untouched. There seems to be no gain in updating it with the value
* obtained, as it might no longer be valid anyway.
__atomic_thread_fence(__ATOMIC_RELEASE); /* all writes must finish */

for (;;) {
void *prev;
uint32_t atomic_status;
asm volatile(
LD_EX "%[prev_output], [%[ptr_val]] \n\t" /* prev = *head */
ST_EX "%" OP_WIDTH "[atomic_var], %[new_val] , [%[ptr_val]] \n\t" /* *head = node */
: /* output */ [atomic_var] "=&r"(atomic_status), [prev_output]"=&r"(prev)
: /* input */ [ptr_val] "r"(head), [new_val] "r"(node)
: /* no clobber */
);

/* Specs say 0 indicates success and 1 an exclusivity failure; any other
* value is not defined.
*/
return false;
if (0 == atomic_status) {
/* The write updating the queue head has finished anyway; prevent the
* pipeline from starting any reads before passing this point.
*/
__atomic_thread_fence(__ATOMIC_ACQUIRE);
return prev;
}

/* keep spinning */
}

*prev = temp;
return true;
}
#endif /* CONFIG_BKL_SWAP_MANUAL */
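
For comparison with the ARMv8.1 LSE option mentioned in the comment above, here is a minimal sketch, not part of this commit, of what a builtin-based exchange could look like; the guard macro __ARM_FEATURE_ATOMICS (the ACLE feature macro compilers define for LSE-capable AArch64 targets) and the function name arch_atomic_exchange_lse are assumptions for illustration only.

/* Hypothetical sketch: with LSE the builtin can lower to a single swap
 * instruction, so no exclusive-monitor loop is involved. ACQ_REL mirrors
 * the release fence before and the acquire fence after the manual variant.
 */
#if defined(__ARM_FEATURE_ATOMICS)
static inline void *arch_atomic_exchange_lse(void *head, void *node)
{
    return __atomic_exchange_n((void **)head, node, __ATOMIC_ACQ_REL);
}
#endif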

#endif /* ENABLE_SMP_SUPPORT */

6 changes: 0 additions & 6 deletions include/arch/riscv/arch/model/smp.h
@@ -43,12 +43,6 @@ static inline void add_hart_to_core_map(word_t hart_id, word_t core_id)
coreMap.map[core_id] = hart_id;
}

static inline bool_t try_arch_atomic_exchange_rlx(void *ptr, void *new_val, void **prev)
{
*prev = __atomic_exchange_n((void **)ptr, new_val, __ATOMIC_RELAXED);
return true;
}

static inline CONST cpu_id_t getCurrentCPUIndex(void)
{
word_t sp;
6 changes: 0 additions & 6 deletions include/arch/x86/arch/model/smp.h
@@ -36,10 +36,4 @@ static inline PURE word_t getCurrentCPUID(void)
return cpu_mapping.index_to_cpu_id[getCurrentCPUIndex()];
}

static inline bool_t try_arch_atomic_exchange_rlx(void *ptr, void *new_val, void **prev)
{
*prev = __atomic_exchange_n((void **) ptr, new_val, __ATOMIC_RELAXED);
return true;
}

#endif /* ENABLE_SMP_SUPPORT */
9 changes: 9 additions & 0 deletions include/config.h
@@ -18,3 +18,12 @@
#define AARCH64_VSPACE_S2_START_L1
#endif
#endif

#ifdef CONFIG_ARCH_ARM
/* ToDo: Check if we can disable this by default and only enable it for
 * platforms where __atomic_exchange_n() cannot be used. Document the
 * reason in that case, as it might just be a compiler limitation that
 * gets lifted in a future release.
 */
#define CONFIG_BKL_SWAP_MANUAL
#endif
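
One possible direction for the ToDo above, sketched purely as an assumption and not part of this commit: gate the manual swap on the absence of native atomic swap support, e.g. via the ACLE feature macro __ARM_FEATURE_ATOMICS that compilers define when an ARMv8.1-A (or newer) target with LSE is selected.

#ifdef CONFIG_ARCH_ARM
/* Hypothetical refinement: fall back to the manual exclusive load/store
 * swap only when the compiler cannot emit a native atomic swap.
 */
#ifndef __ARM_FEATURE_ATOMICS
#define CONFIG_BKL_SWAP_MANUAL
#endif
#endif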
27 changes: 9 additions & 18 deletions include/smp/lock.h
@@ -61,24 +61,15 @@ static inline void FORCE_INLINE clh_lock_acquire(word_t cpu, bool_t irqPath)
{
clh_qnode_p_t volatile *node_owner = &big_kernel_lock.node_owners[cpu];
node_owner->node->value = CLHState_Pending;

__atomic_thread_fence(__ATOMIC_RELEASE); /* writes must finish */
/* Unfortunately, the compiler builtin __atomic_exchange_n() cannot be used
* here, because some architectures lack an actual atomic swap instruction
* and spin in a tight loop instead. The spinning duration is undefined;
* tests have shown it could be seconds in the worst case.
* Performance evaluation has also shown that it is best to have fences just
* before and after a loop over one relaxed atomic exchange attempt.
* The content of big_kernel_lock.node_owners[cpu].next will only be
* modified if the atomic exchange was successful; otherwise it is left
* untouched.
*/
while (!try_arch_atomic_exchange_rlx(&big_kernel_lock.head,
node_owner->node,
&node_owner->next)) {
/* busy waiting */
}
__atomic_thread_fence(__ATOMIC_ACQUIRE); /* prevent reads before passing here */

#ifdef CONFIG_BKL_SWAP_MANUAL
node_owner->next = arch_atomic_exchange(&big_kernel_lock.head,
node_owner->node);
#else
node_owner->next = __atomic_exchange_n(&big_kernel_lock.head,
node_owner->node,
__ATOMIC_ACQ_REL);
#endif

/* We do not have an __atomic_thread_fence here as this is already handled by the
* atomic_exchange just above */
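
For readers unfamiliar with the CLH queue lock used here, a generic, textbook-style sketch of how the value returned by the exchange is used: after swapping its own node in as the new queue tail, a CPU spins on its predecessor's node until that CPU releases the lock. Names and types below are illustrative and are not seL4's actual data structures or code.

/* Generic CLH acquire sketch, independent of seL4's big_kernel_lock. */
typedef struct clh_node {
    int locked; /* 1 while the owning CPU holds or is waiting for the lock */
} clh_node_t;

static clh_node_t *clh_tail; /* queue tail, updated with an atomic swap */

static inline clh_node_t *clh_acquire(clh_node_t *my_node)
{
    __atomic_store_n(&my_node->locked, 1, __ATOMIC_RELAXED);
    /* Enqueue ourselves; the previous tail becomes our predecessor. */
    clh_node_t *prev = __atomic_exchange_n(&clh_tail, my_node, __ATOMIC_ACQ_REL);
    /* Spin until the predecessor releases the lock. */
    while (__atomic_load_n(&prev->locked, __ATOMIC_ACQUIRE)) {
        /* busy waiting */
    }
    return prev; /* in CLH, the predecessor's node is recycled for the next acquire */
}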
