diff --git a/include/arch/arm/arch/model/smp.h b/include/arch/arm/arch/model/smp.h
index 258617d28ff..6d4168839a7 100644
--- a/include/arch/arm/arch/model/smp.h
+++ b/include/arch/arm/arch/model/smp.h
@@ -16,47 +16,52 @@ static inline cpu_id_t cpuIndexToID(word_t index)
     return BIT(index);
 }
 
-static inline bool_t try_arch_atomic_exchange_rlx(void *ptr, void *new_val, void **prev)
+#ifdef CONFIG_BKL_SWAP_MANUAL
+static inline void *arch_atomic_exchange(void *head, void *node)
 {
-    /* The ARM architecture had an atomic swap instruction in ARMv6, which got
-     * deprecated and war replaced by exclusive load and store instructions in
-     * ARMv7. The compiler builtin __atomic_exchange_n() executed them in a loop
-     * as long as the exclusivity fail fails. Unfortunately, this approach can
-     * results in an undefined delay up to seconds in the worst case if other
-     * cores touch memory in the same exclusivity reservation granule. The
-     * actual granule size is implementation specific. The explicit load and
-     * store instruction below and reporting back success or failure is the only
-     * way to work around this, the caller can loop over calling this functions
-     * this and perform other maintenance operation or wait on failure.
+    /* ARMv6 had an atomic swap instruction. ARMv7 deprecated it and recommended
+     * using exclusive load and store instructions, which the compiler builtin
+     * __atomic_exchange_n() executes in a loop as long as the exclusive store
+     * fails. Unfortunately, this approach can result in an undefined delay of
+     * up to seconds in the worst case if other cores touch memory in the same
+     * exclusivity reservation granule. The actual granule size is
+     * implementation specific; for modern SMP-optimized cores it is
+     * expected to be small.
      * ARM v8.1 eventually brought the atomic swap instruction back as part of
      * LSE (Large System Extensions) and GCC support was added in v9.4. Thus,
      * using __atomic_exchange_n() becomes an option when "-march=armv8.1-a" is
      * passed on platform that use core implementing ARMv8.1 or higher. Details
      * remain to be analyzed.
      */
-    uint32_t atomic_status;
-    void *temp;
-
-    asm volatile(
-        LD_EX "%[prev_output], [%[ptr_val]] \n\t" /* ret = *ptr */
-        ST_EX "%" OP_WIDTH "[atomic_var], %[new_val] , [%[ptr_val]] \n\t" /* *ptr = new */
-        : [atomic_var] "=&r"(atomic_status), [prev_output]"=&r"(temp) /* output */
-        : [ptr_val] "r"(ptr), [new_val] "r"(new_val) /* input */
-        :
-    );
-
-    if (0 != atomic_status) {
-        /* Specs say 0 indicates success and 1 a exclusivity failure, any other
-         * value is not defined. If the atomic operation has failed, prev is
-         * left untouched. There seems no gain updating it with the value
-         * obtained, as this might no longer be valid anyway.
+    __atomic_thread_fence(__ATOMIC_RELEASE); /* all writes must finish */
+
+    for (;;) {
+        void *prev;
+        uint32_t atomic_status;
+        asm volatile(
+            LD_EX "%[prev_output], [%[ptr_val]] \n\t" /* prev = *head */
+            ST_EX "%" OP_WIDTH "[atomic_var], %[new_val] , [%[ptr_val]] \n\t" /* *head = node */
+            : /* output */ [atomic_var] "=&r"(atomic_status), [prev_output]"=&r"(prev)
+            : /* input */ [ptr_val] "r"(head), [new_val] "r"(node)
+            : /* no clobber */
+        );
+
+        /* Specs say 0 indicates success and 1 an exclusivity failure, any other
+         * value is not defined.
          */
-        return false;
+        if (0 == atomic_status) {
+            /* The write updating the queue head has finished at this point;
+             * prevent the pipeline from starting any reads before passing here.
+             */
+            __atomic_thread_fence(__ATOMIC_ACQUIRE);
+            return prev;
+        }
+
+        /* keep spinning */
     }
-    *prev = temp;
-    return true;
 }
+#endif /* CONFIG_BKL_SWAP_MANUAL */
 
 #endif /* ENABLE_SMP_SUPPORT */
diff --git a/include/arch/riscv/arch/model/smp.h b/include/arch/riscv/arch/model/smp.h
index 99699505536..9241d91fa62 100644
--- a/include/arch/riscv/arch/model/smp.h
+++ b/include/arch/riscv/arch/model/smp.h
@@ -43,12 +43,6 @@ static inline void add_hart_to_core_map(word_t hart_id, word_t core_id)
     coreMap.map[core_id] = hart_id;
 }
 
-static inline bool_t try_arch_atomic_exchange_rlx(void *ptr, void *new_val, void **prev)
-{
-    *prev = __atomic_exchange_n((void **)ptr, new_val, __ATOMIC_RELAXED);
-    return true;
-}
-
 static inline CONST cpu_id_t getCurrentCPUIndex(void)
 {
     word_t sp;
diff --git a/include/arch/x86/arch/model/smp.h b/include/arch/x86/arch/model/smp.h
index f73bcb8a9ee..aac276418ad 100644
--- a/include/arch/x86/arch/model/smp.h
+++ b/include/arch/x86/arch/model/smp.h
@@ -36,10 +36,4 @@ static inline PURE word_t getCurrentCPUID(void)
     return cpu_mapping.index_to_cpu_id[getCurrentCPUIndex()];
 }
 
-static inline bool_t try_arch_atomic_exchange_rlx(void *ptr, void *new_val, void **prev)
-{
-    *prev = __atomic_exchange_n((void **) ptr, new_val, __ATOMIC_RELAXED);
-    return true;
-}
-
 #endif /* ENABLE_SMP_SUPPORT */
diff --git a/include/config.h b/include/config.h
index d3418b8ad0a..93dcec4fa76 100644
--- a/include/config.h
+++ b/include/config.h
@@ -18,3 +18,12 @@
 #define AARCH64_VSPACE_S2_START_L1
 #endif
 #endif
+
+#ifdef CONFIG_ARCH_ARM
+/* ToDo: Check if we can disable this by default and only enable it for
+ * platforms where __atomic_exchange_n() can't be used. Document the
+ * reason then, as it might just be a compiler limitation that gets
+ * lifted in a future release.
+ */
+#define CONFIG_BKL_SWAP_MANUAL
+#endif
diff --git a/include/smp/lock.h b/include/smp/lock.h
index f7bddb404c9..86fe3360e9f 100644
--- a/include/smp/lock.h
+++ b/include/smp/lock.h
@@ -61,24 +61,15 @@ static inline void FORCE_INLINE clh_lock_acquire(word_t cpu, bool_t irqPath)
 {
     clh_qnode_p_t volatile *node_owner = &big_kernel_lock.node_owners[cpu];
     node_owner->node->value = CLHState_Pending;
-
-    __atomic_thread_fence(__ATOMIC_RELEASE); /* writes must finish */
-    /* Unfortunately, the compiler builtin __atomic_exchange_n() cannot be used
-     * here, because some architectures lack an actual atomic swap instruction
-     * and spin in a tight loop instead. The spinning duration is undefined,
-     * tests have shown it could be seconds in the worst case.
-     * Performance evaluation has also shown, that it's best to have fences just
-     * before and after a loop over one relaxed atomic exchange attempt.
-     * The content of big_kernel_lock.node_owners[cpu].next will only be
-     * modified if the atomic exchange way successful, otherwise it is left
-     * untouched.
-     */
-    while (!try_arch_atomic_exchange_rlx(&big_kernel_lock.head,
-                                         node_owner->node,
-                                         &node_owner->next)) {
-        /* busy waiting */
-    }
-    __atomic_thread_fence(__ATOMIC_ACQUIRE); /* prevent reads before passing here */
+
+#ifdef CONFIG_BKL_SWAP_MANUAL
+    node_owner->next = arch_atomic_exchange(&big_kernel_lock.head,
+                                            node_owner->node);
+#else
+    node_owner->next = __atomic_exchange_n(&big_kernel_lock.head,
+                                           node_owner->node,
+                                           __ATOMIC_ACQ_REL);
+#endif
 
     /* We do not have an __atomic_thread_fence here as this is already handled by the
      * atomic_exchange just above */
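
For reference, the lock.h hunk above only shows the enqueue step of the CLH acquire. The following is a minimal, standalone sketch (not kernel code) of a CLH-style acquire built on the portable __atomic_exchange_n() path, i.e. the !CONFIG_BKL_SWAP_MANUAL branch. The clh_node_t/clh_lock_t types and the function name clh_acquire_sketch are illustrative stand-ins, not the kernel's clh_qnode_t/big_kernel_lock definitions.

#include <stdbool.h>

/* Illustrative stand-ins for the kernel's CLH queue node and lock. */
typedef struct clh_node {
    bool locked;        /* true while the owner holds or still waits for the lock */
} clh_node_t;

typedef struct clh_lock {
    clh_node_t *head;   /* most recently enqueued node; the lock starts out
                         * pointing at a dummy node whose locked flag is false */
} clh_lock_t;

static void clh_acquire_sketch(clh_lock_t *lock, clh_node_t *mine)
{
    mine->locked = true;

    /* Atomically swap in our node and obtain the predecessor. __ATOMIC_ACQ_REL
     * provides release ordering for the initialisation above and acquire
     * ordering for the reads below, which is what the manual ARM path
     * reproduces with explicit fences around its exclusive load/store loop. */
    clh_node_t *pred = __atomic_exchange_n(&lock->head, mine, __ATOMIC_ACQ_REL);

    /* Spin until the predecessor releases the lock. */
    while (__atomic_load_n(&pred->locked, __ATOMIC_ACQUIRE)) {
        /* busy waiting */
    }
}

A matching release would clear the flag of the caller's own node with a release store; the kernel's recycling of queue nodes is omitted from this sketch.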
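On the config.h ToDo about when CONFIG_BKL_SWAP_MANUAL is actually needed: one possible refinement, sketched here without any claim that it is the right answer, is to let the compiler's feature macro decide instead of defining the option for every ARM build. __ARM_FEATURE_ATOMICS is the ACLE macro that GCC and Clang define when the ARMv8.1-A LSE atomics are available, e.g. when building with -march=armv8.1-a on AArch64; whether that check is sufficient for all supported platforms and toolchains is exactly what the ToDo asks to be analyzed.

#ifdef CONFIG_ARCH_ARM
#ifndef __ARM_FEATURE_ATOMICS
/* No single-instruction atomic swap available: keep using the manual
 * exclusive load/store loop from include/arch/arm/arch/model/smp.h. */
#define CONFIG_BKL_SWAP_MANUAL
#endif
#endif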