From abe5a5ab4cd04d7d088db9af5710cd2961272598 Mon Sep 17 00:00:00 2001
From: lzxddz
Date: Wed, 14 Jun 2023 14:08:47 +0800
Subject: [PATCH 01/20] add resume_rq for remote tasks and update wait_task(),
 sched(), ending_sched() and ready_to_run_remote() of TaskGroup

---
 src/bthread/moodycamelqueue.h    | 5255 ++++++++++++++++++++++++++++++
 src/bthread/parking_lot.cpp     |    7 +
 src/bthread/parking_lot.h       |    5 +
 src/bthread/task_group.cpp      |   86 +-
 src/bthread/task_group.h        |    9 +
 src/bthread/task_group_inl.h    |   21 +
 src/thirdparty/moodycamelqueue.h | 5255 ++++++++++++++++++++++++++++++
 7 files changed, 10613 insertions(+), 25 deletions(-)
 create mode 100644 src/bthread/moodycamelqueue.h
 create mode 100644 src/bthread/parking_lot.cpp
 create mode 100644 src/thirdparty/moodycamelqueue.h

diff --git a/src/bthread/moodycamelqueue.h b/src/bthread/moodycamelqueue.h
new file mode 100644
index 0000000000..d0d042f6b3
--- /dev/null
+++ b/src/bthread/moodycamelqueue.h
@@ -0,0 +1,5255 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free
+// queue. An overview, including benchmark results, is provided here:
+// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue

+// Simplified BSD license:
+// Copyright (c) 2013-2020, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+ +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless +// /std=c++17 or higher does not support `if constexpr`, so we have no choice +// but to simply disable the warning +#pragma warning(push) +#pragma warning(disable : 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. We'll override the default trait malloc ourselves +// without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. +#include +#endif +#include +#include +#include // for CHAR_BIT +#include // for max_align_t +#include +#include +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include +#include + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel +{ +namespace details +{ +template +struct thread_id_converter +{ + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const &x) + { + return x; + } +}; +} // namespace details +} // namespace moodycamel +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel +{ +namespace details +{ +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() +{ + return rl::thread_index(); +} +} // namespace details +} // namespace moodycamel +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the +// function we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( + void); +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. 
+static inline thread_id_t thread_id() +{ + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined +// anyway, which it won't be. +static inline thread_id_t thread_id() +{ + return std::this_thread::get_id(); +} + +template +struct thread_id_size +{ +}; +template <> +struct thread_id_size<4> +{ + typedef std::uint32_t numeric_t; +}; +template <> +struct thread_id_size<8> +{ + typedef std::uint64_t numeric_t; +}; + +template <> +struct thread_id_converter +{ + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const &x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } +}; +} +} +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a +// thread-local static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel +{ +namespace details +{ +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus + // it's not aligned. +inline thread_id_t thread_id() +{ + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ + __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ + (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw(expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) 
else MOODYCAMEL_CONSTEXPR_IF(false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when +// it shouldn't :-( We have to assume *all* non-trivial constructors may throw +// on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value || \ + std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value || \ + std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a +// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't +// support thread_local either. Finally, iOS/ARM doesn't have support for it +// either, and g++/ARM allows it to compile but it's unconfirmed to actually +// work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 +// compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now +// since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link +// error will be generated if the function is called. 
+#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel +{ +namespace details +{ +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant +// literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + typename details::Vs2013Aligned::value, T>::type +template +struct Vs2013Aligned +{ +}; // default, unsupported alignment +template +struct Vs2013Aligned<1, T> +{ + typedef __declspec(align(1)) T type; +}; +template +struct Vs2013Aligned<2, T> +{ + typedef __declspec(align(2)) T type; +}; +template +struct Vs2013Aligned<4, T> +{ + typedef __declspec(align(4)) T type; +}; +template +struct Vs2013Aligned<8, T> +{ + typedef __declspec(align(8)) T type; +}; +template +struct Vs2013Aligned<16, T> +{ + typedef __declspec(align(16)) T type; +}; +template +struct Vs2013Aligned<32, T> +{ + typedef __declspec(align(32)) T type; +}; +template +struct Vs2013Aligned<64, T> +{ + typedef __declspec(align(64)) T type; +}; +template +struct Vs2013Aligned<128, T> +{ + typedef __declspec(align(128)) T type; +}; +template +struct Vs2013Aligned<256, T> +{ + typedef __declspec(align(256)) T type; +}; +#else +template +struct identity +{ + typedef T type; +}; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} // namespace details +} // namespace moodycamel + +// TSAN can false report races in lock-free code. To enable TSAN to be used +// from projects that use this one, we can apply per-function compile-time +// suppression. See +// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel +{ +namespace details +{ +#if defined(__GNUC__) +static inline bool(likely)(bool x) +{ + return __builtin_expect((x), true); +} +static inline bool(unlikely)(bool x) +{ + return __builtin_expect((x), false); +} +#else +static inline bool(likely)(bool x) +{ + return x; +} +static inline bool(unlikely)(bool x) +{ + return x; +} +#endif +} // namespace details +} // namespace moodycamel + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel +{ +namespace details +{ +template +struct const_numeric_max +{ + static_assert(std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - + static_cast(1) + : static_cast(-1); +}; + +#if defined(__GLIBCXX__) +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else +typedef std::max_align_t std_max_align_t; // Others (e.g. 
MSVC) insist it can + // *only* be accessed via std:: +#endif + +// Some platforms have incorrectly set max_align_t to a type with <8 bytes +// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit +// iOS). Work around this with our own union. See issue #64. +typedef union +{ + std_max_align_t x; + long long y; + void *z; +} max_align_t; +} // namespace details + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of + // elements you expect to hold at once, especially if you have a high + // turnover rate; for example, on 32-bit x86, if you expect to have over a + // hundred million elements or pump several million elements through your + // queue in a very short space of time, using a 32-bit type *may* trigger a + // race condition. A 64-bit int type is recommended in that case, and in + // practice will prevent a race condition no matter the usage of the queue. + // Note that whether the queue is lock-free with a 64-int type depends on + // the whether std::atomic is lock-free, which is + // platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few + // elements but many producers, a smaller block size should be favoured. For + // few producers and/or many elements, a larger block size is preferred. A + // sane default is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per + // element. For large block sizes, this is too inefficient, and switching to + // an atomic counter-based approach is faster. The switch is made for block + // sizes strictly larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit + // producers. Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit + // production (using the enqueue methods without an explicit producer token) + // is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a + // token) must consume before it causes all consumers to rotate and move on + // to the next internal queue. 
+ static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a + // sub-queue. Enqueue operations that would cause this limit to be surpassed + // will fail. Note that this limit is enforced at the block level (for + // performance reasons), i.e. it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try + // 0-100). Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like + // std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc(size_t size) + { + return malloc(size); + } + static inline void WORKAROUND_free(void *ptr) + { + return free(ptr); + } + static inline void *(malloc) (size_t size) + { + return WORKAROUND_malloc(size); + } + static inline void(free)(void *ptr) + { + return WORKAROUND_free(ptr); + } +#else + static inline void *malloc(size_t size) + { + return std::malloc(size); + } + static inline void free(void *ptr) + { + return std::free(ptr); + } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc(size_t size) + { + return rl::rl_malloc(size, $); + } + static inline void free(void *ptr) + { + return rl::rl_free(ptr, $); + } +#endif +}; + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
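+//
+// Illustrative usage sketch (not from the upstream moodycamel sources; the
+// names MyTraits, q, ptok, ctok and batch are assumed for the example). It
+// combines the token-plus-bulk pattern recommended above with a custom traits
+// struct that shadows BLOCK_SIZE:
+//
+//     #include "moodycamelqueue.h"
+//
+//     struct MyTraits : public moodycamel::ConcurrentQueueDefaultTraits
+//     {
+//         static const size_t BLOCK_SIZE = 64; // must remain a power of 2
+//     };
+//
+//     moodycamel::ConcurrentQueue<int, MyTraits> q;
+//
+//     void producer_thread()
+//     {
+//         moodycamel::ProducerToken ptok(q); // at most one per thread
+//         int batch[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+//         q.enqueue_bulk(ptok, batch, 8);    // preferred: bulk + token
+//     }
+//
+//     void consumer_thread()
+//     {
+//         moodycamel::ConsumerToken ctok(q);
+//         int batch[8];
+//         size_t n = q.try_dequeue_bulk(ctok, batch, 8); // n items dequeued
+//         (void)n;
+//     }
+//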
+struct ProducerToken; +struct ConsumerToken; + +template +class ConcurrentQueue; +template +class BlockingConcurrentQueue; +class ConcurrentQueueTests; + +namespace details +{ +struct ConcurrentQueueProducerTypelessBase +{ + ConcurrentQueueProducerTypelessBase *next; + std::atomic inactive; + ProducerToken *token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } +}; + +template +struct _hash_32_or_64 +{ + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see + // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is + // propagate that uniqueness evenly across all the bits, so that we can + // use a subset of the bits while reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> +struct _hash_32_or_64<1> +{ + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> +{ +}; + +static inline size_t hash_thread_id(thread_id_t id) +{ + static_assert( + sizeof(thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast( + hash_32_or_64::thread_id_hash_t)>:: + hash(thread_id_converter::prehash(id))); +} + +template +static inline bool circular_less_than(T a, T b) +{ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4554) +#endif + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned integer " + "types"); + return static_cast(a - b) > + static_cast(static_cast(1) + << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} + +template +static inline char *align_for(char *ptr) +{ + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast(ptr) % alignment)) % + alignment; +} + +template +static inline T ceil_to_pow_2(T x) +{ + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer " + "types"); + + // Adapted from + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) + { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic &left, std::atomic &right) +{ + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), + std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); +} + +template +static inline T const &nomove(T const &x) +{ + return x; +} + +template +struct nomove_if +{ + template + static inline T const &eval(T const &x) + { + return x; + } +}; + +template <> +struct nomove_if +{ + template + static inline auto eval(U &&x) -> decltype(std::forward(x)) + { + return std::forward(x); + } +}; + +template +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) +{ + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : 
std::is_trivially_destructible +{ +}; +#else +template +struct is_trivially_destructible : std::has_trivial_destructor +{ +}; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else +struct ThreadExitListener +{ + typedef void (*callback_t)(void *); + callback_t callback; + void *userData; + + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier +}; + +class ThreadExitNotifier +{ +public: + static void subscribe(ThreadExitListener *listener) + { + auto &tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener *listener) + { + auto &tlsInst = instance(); + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) + { + if (ptr == listener) + { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + +private: + ThreadExitNotifier() : tail(nullptr) + { + } + ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier &operator=(ThreadExitNotifier const &) + MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && + "If this assert fails, you likely have a buggy compiler! Change " + "the preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) + { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + +private: + ThreadExitListener *tail; +}; +#endif +#endif + +template +struct static_is_lock_free_num +{ + enum + { + value = 0 + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_CHAR_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_SHORT_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_INT_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_LONG_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_LLONG_LOCK_FREE + }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> +{ +}; +template <> +struct static_is_lock_free +{ + enum + { + value = ATOMIC_BOOL_LOCK_FREE + }; +}; +template +struct static_is_lock_free +{ + enum + { + value = ATOMIC_POINTER_LOCK_FREE + }; +}; +} // namespace details + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue &queue); + + template + explicit ProducerToken(BlockingConcurrentQueue &queue); + + ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) + { + producer->token = this; + } + } + + inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) + { + producer->token = this; + } + if (other.producer != nullptr) + { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // 
(Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const + { + return producer != nullptr; + } + + ~ProducerToken() + { + if (producer != nullptr) + { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase *producer; +}; + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue &q); + + template + explicit ConsumerToken(BlockingConcurrentQueue &q); + + ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. 
+// See +// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4307) // + integral constant overflow (that's what + // the ternary expression is for!) +#pragma warning(disable : 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value - + static_cast(Traits::MAX_SUBQUEUE_SIZE) < + BLOCK_SIZE) + ? details::const_numeric_max::value + : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + + (BLOCK_SIZE - 1)) / + BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && + !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & + (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && + !(EXPLICIT_INITIAL_INDEX_SIZE & + (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " + "(and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && + !(IMPLICIT_INITIAL_INDEX_SIZE & + (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " + "(and greater than 1)"); + static_assert( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || + !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || + INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at " + "least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be 
inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated up-front, which means only a single producer will be able to + // enqueue elements without an extra allocation -- blocks aren't shared + // between producers). This method is not thread safe -- it is up to the + // user to ensure that the queue is fully constructed before it starts being + // used by other threads (this includes making the memory effects of + // construction visible, possibly with a memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list( + capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, + size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) + { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) + { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) + { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) + { + auto prev = hash->prev; + if (prev != nullptr) + { // The last hash is part of this object and was not allocated + // dynamically + for (size_t i = 0; i != hash->capacity; ++i) + { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) + { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) + { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue &operator=(ConcurrentQueue const &) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty + // queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store( + other.explicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store( + other.implicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue &operator=(ConcurrentQueue 
&&other) + MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue &swap_internal(ConcurrentQueue &other) + { + if (this == &other) + { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, + other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, + other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const &item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T &&item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const &token, T const &item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Allocates memory if required. Only fails if memory + // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would + // be surpassed). Thread-safe. + inline bool enqueue(producer_token_t const &token, T &&item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: + // Use std::make_move_iterator if the elements should be moved instead of + // copied. 
Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). Thread-safe. + inline bool try_enqueue(T const &item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T &&item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T const &item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Does not allocate memory. Fails if not enough room to + // enqueue. Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T &&item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const &token, + It itemFirst, + size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U &item) + { + // Instead of simply trying each producer in turn (which could cause + // needless contention on the first producer), we score them + // heuristically. 
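+        // (Concretely: scan the producer list, stop after finding up to three
+        // apparently non-empty producers, dequeue from the one with the
+        // largest approximate size first, and fall back to trying every
+        // producer below if that first attempt comes up empty.)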
+ size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; + ptr = ptr->next_prod()) + { + auto size = ptr->size_approx(); + if (size > 0) + { + if (size > bestSize) + { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's + // been tried + if (nonEmptyCount > 0) + { + if ((details::likely)(best->dequeue(item))) + { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr != best && ptr->dequeue(item)) + { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall + // throughput under contention, but will give more predictable results in + // single-threaded consumer scenarios. This is mostly only useful for + // internal unit tests. Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U &item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr->dequeue(item)) + { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
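+    // A minimal sketch of typical use (not from the upstream sources),
+    // assuming a shared queue `q` of ints and a caller-provided process():
+    //
+    //     moodycamel::ConsumerToken ctok(q); // one token per consumer thread
+    //     int item;
+    //     while (q.try_dequeue(ctok, item))
+    //     {
+    //         process(item);
+    //     }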
+ template + bool try_dequeue(consumer_token_t &token, U &item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the + // global offset) -> this means the highest efficiency consumer dictates + // the rotation speed of everyone else, more or less If you see that the + // global offset has changed, you must reset your consumption counter + // and move to your designated place If there's no items where you're + // supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to + // consume, move over from your current position until you find an + // producer with something in it + + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + if (!update_current_producer_after_rotation(token)) + { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's + // been tried + if (static_cast(token.currentProducer)->dequeue(item)) + { + if (++token.itemsConsumedFromCurrent == + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) + { + globalExplicitConsumerOffset.fetch_add( + 1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = + static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) + { + if (ptr->dequeue(item)) + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) + { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. Returns the number of items actually dequeued. Returns 0 + // if all producer streams appeared empty at the time they were checked (so, + // the queue is likely but not guaranteed to be empty). Never allocates. + // Thread-safe. 
+ template + size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + if (!update_current_producer_after_rotation(token)) + { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) + { + if ((token.itemsConsumedFromCurrent += static_cast( + max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) + { + globalExplicitConsumerOffset.fetch_add( + 1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = + static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) + { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = + static_cast(dequeued); + } + if (dequeued == max) + { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const &producer, + U &item) + { + return static_cast(producer.producer) + ->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. Returns the number of items actually dequeued. If you happen to + // know which producer you want to dequeue from, this is significantly + // faster than using the general-case try_dequeue methods. Returns 0 if the + // producer's queue appeared empty at the time it was checked (so, the queue + // is likely but not guaranteed to be empty). Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer( + producer_token_t const &producer, It itemFirst, size_t max) + { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the + // queue. This estimate is only accurate if the queue has completely + // stabilized before it is called (i.e. all enqueue and dequeue operations + // have completed and their memory effects are visible on the calling + // thread, and no further operations start while this method is being + // called). Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + size += ptr->size_approx(); + } + return size; + } + + bool is_empty() const + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr->size_approx() > 0) + { + return false; + } + } + + return true; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
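+    // For instance (an assumed usage, not from the upstream sources), a
+    // startup sanity check could be:
+    //
+    //     assert(moodycamel::ConcurrentQueue<int>::is_lock_free());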
+ static bool is_lock_free() + { + return details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free< + typename details::thread_id_converter:: + thread_id_numeric_size_t>::value == 2; + } + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode + { + CanAlloc, + CannotAlloc + }; + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const &token, U &&element) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue( + std::forward(element)); + } + + template + inline bool inner_enqueue(U &&element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const &token, + It itemFirst, + size_t count) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk< + canAlloc>(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t &token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) + { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) + { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from + // end -- subtract from count first + std::uint32_t offset = + prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) + { + token.desiredProducer = + static_cast(token.desiredProducer) + ->next_prod(); + if (token.desiredProducer == nullptr) + { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) + { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) + { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) + { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) + { + } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the + // world under heavy contention, but simple and correct (assuming nodes are + // never freed until after the free list is destroyed), and fairly speedy + // under low contention. + template // N must inherit FreeListNode or have the same + // fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) + { + } + FreeList(FreeList &&other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) + { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList &other) + { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N *node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so + // it's safe to set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) + { + // Oh look! We were the last ones referencing this node, and we + // know we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N *try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) + { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, + refs + 1, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at + // zero), which means we can read the next and not worry about + // it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong( + head, + next, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + // Yay, got the node. 
This means it was on the list, which + // means shouldBeOnFreeList must be false no matter the + // refcount (because nobody else knows it's been taken off + // yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for + // the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to + // decrease the refcount we increased. Note that we don't need + // to release any memory effects, but we do need to ensure that + // the reference count decrement happens-after the CAS on the + // head. + refs = prevHead->freeListRefs.fetch_sub( + 1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) + { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N *head_unsafe() const + { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N *node) + { + // Since the refcount is zero, and nobody can increase it once it's + // zero (except us, and we run only one copy of this method per node + // at a time, i.e. the single thread case), then we know we can + // safely change the next pointer of the node; however, once the + // refcount is back above zero, then other threads could increase it + // (happens under heavy contention, when the refcount goes to zero + // in between a load and a refcount increment of a node in try_get, + // then back up to something non-zero, then the refcount increment + // is done by the other thread) -- so, if the CAS to add the node to + // the actual list fails, decrease the refcount and leave the add + // operation to the next thread who puts the refcount back at zero + // (which could be us, hence the loop). 
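+ // Illustrative layout of freeListRefs, using the constants declared at the
+ // bottom of this struct (REFS_MASK, SHOULD_BE_ON_FREELIST):
+ //     bit 31      SHOULD_BE_ON_FREELIST -- "re-add me once the refcount is 0"
+ //     bits 0..30  REFS_MASK             -- count of outstanding references
+ // e.g. a value of SHOULD_BE_ON_FREELIST + 1 (the case tested in try_get
+ // above) means one live reference plus a pending request to go back on the
+ // free list.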
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) + { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong( + head, + node, + std::memory_order_release, + std::memory_order_relaxed)) + { + // Hmm, the add failed, but we can only try again when the + // refcount goes back to zero + if (node->freeListRefs.fetch_add( + SHOULD_BE_ON_FREELIST - 1, + std::memory_order_release) == 1) + { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes + // are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext + { + implicit_context = 0, + explicit_context = 1 + }; + + struct Block + { + Block() + : next(nullptr), + elementsCompletelyDequeued(0), + freeListRefs(0), + freeListNext(nullptr), + shouldBeOnFreeList(false), + dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) + { + if (!emptyFlags[i].load(std::memory_order_relaxed)) + { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that + // happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else + { + // Check counter + if (elementsCompletelyDequeued.load( + std::memory_order_relaxed) == BLOCK_SIZE) + { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load( + std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & static_cast( + BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + 1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no + // wrapping and count > 0). Returns true if the block is now empty (does + // not apply in explicit context). 
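+ // Worked example (assuming a BLOCK_SIZE of 32): in explicit context,
+ // set_many_empty(i == 40, count == 8) flips the mirrored range of
+ // emptyFlags computed below; in implicit context it simply adds 8 to
+ // elementsCompletelyDequeued and reports the block empty once that counter
+ // reaches BLOCK_SIZE.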
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, + size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) + { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, + std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert(std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment " + "greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic + emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD + ? 
BLOCK_SIZE + : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert(std::alignment_of::value >= + std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; + +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) + : tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() + { + } + + template + inline bool dequeue(U &element) + { + if (isExplicit) + { + return static_cast(this)->dequeue(element); + } + else + { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It &itemFirst, size_t max) + { + if (isExplicit) + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + else + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + } + + inline ProducerBase *next_prod() const + { + return static_cast(next); + } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? static_cast(tail - head) + : 0; + } + + inline index_t getTail() const + { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) + { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index( + 0); // This creates an index with double the number of current + // entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
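+ // Hypothetical illustration (assuming a BLOCK_SIZE of 32): if headIndex is
+ // 70 and tailIndex is 100 when this destructor runs, elements 70..99 are
+ // still constructed; the block index is used below to find the
+ // half-dequeued block containing index 70, and the loop then destroys the
+ // remaining elements block by block up to the tail.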
+ if (this->tailBlock != nullptr) + { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) + { + // The head's not on a block boundary, meaning a block + // somewhere is partially dequeued (or the head block is the + // tail block and was fully dequeued, but the head/tail are + // still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) + { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop + // gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do + { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) + { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) + { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the + // tail block, we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? BLOCK_SIZE + : static_cast( + this->tailIndex.load( + std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) + { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) + { + auto block = this->tailBlock; + do + { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) + { + destroy(block); + } + else + { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) + { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U &&element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block:: + template reset_empty(); + + // We'll put the block on the block index (guaranteed to be + // room since we're conceptually removing the last block + // from it first -- except instead of removing then adding, + // we can just overwrite). 
Note that there must be a valid + // block index here, since even if allocation failed in the + // ctor, it would have been re-attempted when adding the + // first block to the queue; since there is such a block, a + // block index must have been successfully allocated. + } + else + { + // Whatever head value we see here is >= the last value we + // saw here (relatively), and <= its current value. Since we + // have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than( + currentTailIndex, head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + // We can't enqueue in another block because there's not + // enough leeway -- the tail could surpass the head by + // the time the block fills up! (Or we'll exceed the + // size limit, if the second part of the condition was + // true.) + return false; + } + // We're going to need a new block; check that the block + // index has room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) + { + // Hmm, the circular block index is already full -- + // we'll need to allocate a new index. Note + // pr_blockIndexRaw can only be nullptr if the initial + // allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) + { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + // The constructor may throw. We want the element not to + // appear in the queue in that case (without corrupting the + // queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) + { + // Revert change to the current block, but leave the new + // block available for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
this->tailBlock + : startBlock; + MOODYCAMEL_RETHROW; + } + } + else + { + (void) startBlock; + (void) originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed) + ->front.store(pr_blockIndexFront, + std::memory_order_release); + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load( + std::memory_order_relaxed) - + overcommit, + tail)) + { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the + // common case when the queue is empty and the values are + // eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, + // they are not going to change (unless we change them) and must + // be the same value at this point (inside the if) as when the + // if condition was evaluated. + + // We insert an acquire fence here to synchronize-with the + // release upon incrementing dequeueOvercommit below. This + // ensures that whatever the value we got loaded into + // overcommit, the load of dequeueOptisticCount in the fetch_add + // below will result in a value at least as recent as that (and + // therefore at least as large). Note that I believe a compiler + // (signal) fence here would be sufficient due to the nature of + // fetch_add (all read-modify-write operations are guaranteed to + // work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only + // the C++11 standard. See + // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the + // boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= + // dequeueOptimisticCount (because dequeueOvercommit is only + // ever incremented after dequeueOptimisticCount -- this is + // enforced in the `else` block below), and since we now have a + // version of dequeueOptimisticCount that is at least as recent + // as overcommit (due to the release upon incrementing + // dequeueOvercommit and the acquire above that synchronizes + // with it), overcommit <= myDequeueCount. However, we can't + // assert this since both dequeueOptimisticCount and + // dequeueOvercommit may (independently) overflow; in such a + // case, though, the logic still holds since the difference + // between the two is maintained. 
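+ // Worked example with hypothetical numbers: tail == 10,
+ // dequeueOptimisticCount == 7 and dequeueOvercommit == 2 give an effective
+ // dequeue count of 7 - 2 == 5, so circular_less_than(5, 10) holds and the
+ // dequeue proceeds; the same arithmetic remains valid after either counter
+ // wraps, because only their difference is ever used.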
+ + // Note that we reload tail here in case it changed; it will be + // the same value as before or greater, since this load is + // sequenced after (happens after) the earlier load above. This + // is supported by read-read coherency (as defined in the + // standard), explained here: + // http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) + { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be + // at least one element, this will never exceed tail. We + // need to do an acquire-release fence here since it's + // possible that whatever condition got us to this point was + // for an earlier enqueued element (that we already see the + // memory effects for), but that by the time we increment + // somebody else has incremented it, and we need to see the + // memory effects for *that* element, which is in such a + // case is necessarily visible on the thread that + // incremented it in the first place with the more current + // condition (they must have acquired a tail that is at + // least as recent). + auto index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + + auto localBlockIndex = + blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing + // because of index wrap-around. When an index wraps, we + // need to preserve the sign of the offset when dividing it + // by the block size (in order to get a correct signed block + // count offset in all cases): + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = + index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + blockBaseIndex - headBase) / + BLOCK_SIZE); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) & + (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T &&, element = std::move(el))) + { + // Make sure the element is still fully dequeued and + // destroyed even if the assignment throws + struct Guard + { + Block *block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block:: + template set_empty(index); + } + } guard = {block, index}; + + element = std::move(el); // NOLINT + } + else + { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty< + explicit_context>(index); + } + + return true; + } + else + { + // Wasn't anything to dequeue after all; make the effective + // dequeue count eventually consistent + this->dequeueOvercommit.fetch_add( + 1, + std::memory_order_release); // Release so that the + // fetch_add on + // dequeueOptimisticCount + // is guaranteed to happen + // before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of + // the elements; this means pre-allocating blocks and putting them + // in the block index (but only if all the allocations succeeded). 
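+ // Worked example (assuming a BLOCK_SIZE of 32): with startTailIndex == 30
+ // and count == 10,
+ //     ((30 + 10 - 1) & ~31) - ((30 - 1) & ~31) == 32 - 0 == 32,
+ // i.e. one block boundary is crossed, so exactly one additional block must
+ // be reused from ahead of the tail or requisitioned before any element is
+ // constructed.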
+ index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than( + currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) + { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + // Failed to allocate, undo changes (but keep + // injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + else if (full || + !new_block_index(originalBlockIndexSlotsUsed)) + { + // Failed to allocate, undo changes (but keep + // injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, + // so we need to update our fallback value too (since we + // keep the new index even if we later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
firstAllocatedBlock + : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's + // emptiness before we fill them up, and publish the new block + // index front + auto block = firstAllocatedBlock; + while (true) + { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) + { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store( + (pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + // Must use copy constructor even if move + // constructor is available because we may have to + // revert if there's an exception. Sorry about the + // horrible templated next line, but it was the only + // way to disable moving *at compile time*, which is + // important because a type may only define a + // (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if + // branch that will never be executed + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + // Oh dear, an exception's been thrown -- destroy the + // elements that were enqueued so far and revert the + // entire bulk operation (we'll keep any allocated + // blocks in our linked list for later, though). 
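+ // Hypothetical illustration: if three blocks were linked in above and the
+ // copy constructor threw while filling the second, everything constructed
+ // so far (up to constructedStopIndex) is destroyed below, the tail block
+ // and pr_blockIndex* bookkeeping are rolled back to their pre-call values,
+ // and the freshly linked blocks stay in the circular list for a later
+ // enqueue to reuse.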
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store( + (pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It &itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this will never + // exceed tail. 
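+ // Hypothetical illustration: if desiredCount was clamped to 8 above but a
+ // racing consumer won part of the race, actualCount may come back as, say,
+ // 5; the shortfall of 3 has just been added to dequeueOvercommit, and
+ // headIndex below advances by exactly 5.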
+ auto firstIndex = this->headIndex.fetch_add( + actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = + blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & + (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do + { + auto firstIndexInBlock = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, + T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + // It's too late to revert the dequeue, but we + // can make sure that all the dequeued objects + // are properly destroyed and the block index + // (and empty count) are properly updated before + // we propagate the exception + do + { + block = localBlockIndex->entries[indexIndex] + .block; + while (index != endIndex) + { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block:: + template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast( + endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & + (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? 
firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = + (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + // Wasn't anything to dequeue after all; make the effective + // dequeue count eventually consistent + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block *block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) + { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) + { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + prevBlockSizeMask; + do + { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the + // old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in + // referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer *nextExplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all + // enqueue/dequeue operations completed already; this means that all + // undequeued elements are placed contiguously across contiguous + // blocks, and that only the first and last remaining blocks can be + // only partially empty (all other remaining blocks must be + // completely full). 
+ +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) + { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block *block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) + // block will not be freed + while (index != tail) + { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) + { + if (block != nullptr) + { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on + // the free list (unless the head index reached the end of it, in + // which case the tail will be poised to create a new block). + if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) + { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) + { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) + { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do + { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U &&element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, + head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry(idxEntry, + currentTailIndex)) + { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>(); + if (newBlock == nullptr) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + // May throw, try to insert now before we publish the fact + // that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
+ { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load( + std::memory_order_relaxed) - + overcommit, + tail)) + { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) + { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T &&, element = std::move(el))) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead + // of only when a block is released is very sub-optimal, + // but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard + { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block:: + template set_empty( + index)) + { + entry->value.store( + nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } + else + { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) + { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool + // (and remove from block index) + entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } + else + { + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of + // the elements; this means pre-allocating blocks and putting them + // in the block index (but only if all the allocations succeeded). 
+ + // Note that the tailBlock we start off with may not be owned by us + // any more; this happens if it was filled up exactly to the top + // (setting tailIndex to the first index of the next block which is + // not yet allocated), then dequeued completely (putting it on the + // free list) before we enqueue again. + + index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but + // compiler can't always tell + Block *newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than( + currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent->ConcurrentQueue:: + template requisition_block()) == + nullptr) + { + // Index allocation or block allocation failed; revert + // any other allocations and index insertions done so + // far for this operation + if (indexInserted) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later + // allocations fail, and so that we can find the blocks when + // we do the actual enqueueing + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) + { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
newBlock + : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It &itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this will never + // exceed tail. + auto firstIndex = this->headIndex.fetch_add( + actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do + { + auto blockStartIndex = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = + entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, + T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + do + { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load( + std::memory_order_relaxed); + while (index != endIndex) + { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block:: + template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast( + endIndex - + blockStartIndex))) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store( + nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list( + block); + } + indexIndex = + (indexIndex + 1) & + (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block:: + template set_many_empty( + blockStartIndex, + static_cast(endIndex - + blockStartIndex))) + { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a + // release, meaning that anybody who acquires + // the block we're about to free can use it + // safely since our writes (and reads!) will + // have happened-before then. 
+ entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, + index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load( + std::memory_order_relaxed); // We're the only writer thread, + // relaxed is OK + if (localBlockIndex == nullptr) + { + return false; // this can happen if new_block_index failed in + // the constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) + { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index()) + { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry *get_block_index_entry_for_index( + index_t index) const + { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index( + index_t index, BlockIndexHeader *&localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load( + std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may + // wrap around, causing a negative offset, whose negativity we want + // to preserve + auto offset = static_cast( + static_cast::type>( + index - tailBase) / + BLOCK_SIZE); + 
size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load( + std::memory_order_relaxed) == index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = + prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) + { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) + { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do + { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) + { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, + std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store( + (prevCapacity - 1) & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer *nextImplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) + { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) + { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) + { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) + { + return nullptr; + } + + auto index = + initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) + : nullptr; + } + + inline void add_block_to_free_list(Block *block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block *block) + { + while (block != nullptr) + { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block *try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one + // (if applicable) + template + Block *requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) + { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) + { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) + { + return create(); + } + else + { + return nullptr; + } + } + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats + { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue *q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) + { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + bool implicit = + dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = + prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) + { + for (size_t i = 0; i != hash->capacity; ++i) + { + if (hash->index[i]->key.load( + std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load( + std::memory_order_relaxed) != nullptr) + { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) + { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer:: + BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer:: + BlockIndexEntry *); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) + { + // auto block = + // prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) + { + auto block = tailBlock; + do + { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block:: + template is_empty() || + wasNonEmpty) + { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = + prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) + { + stats.explicitBlockIndexBytes += + sizeof( + typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof( + typename ExplicitProducer::BlockIndexEntry); + index = static_cast< + typename ExplicitProducer::BlockIndexHeader *>( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize - q->initialBlockPoolIndex.load( + std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase *recycle_or_create_producer(bool isExplicit, bool &recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong( + expected, + /* desired */ false, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + // We caught one! 
It's been marked as activated, the caller + // can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? static_cast( + create(this)) + : create(this)); + } + + ProducerBase *add_producer(ProducerBase *producer) + { + // Handle failed memory allocation + if (producer == nullptr) + { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do + { + producer->next = prevTail; + } while ( + !producerListTail.compare_exchange_weak(prevTail, + producer, + std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) + { + auto prevTailExplicit = + explicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer) + ->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } + else + { + auto prevTailImplicit = + implicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer) + ->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; + ptr = ptr->next_prod()) + { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer + *value; // No need for atomicity since it's only read by the thread + // that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) + { + } + + ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP &operator=(ImplicitProducerKVP &&other) + MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) + { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) + { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + 
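+            // Relaxed ordering is sufficient here: this runs during construction, before the queue is visible to other threads.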
implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue &other) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, + other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) + { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash *hash; + for (hash = + implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) + { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) + { + other.implicitProducerHash.store( + &other.initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash *hash; + for (hash = other.implicitProducerHash.load( + std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; + hash = hash->prev) + { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer *get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread + // ID), there's a reduced need for fences (memory ordering is already + // consistent for any individual thread), except for the current table + // itself. + + // Start by looking for the thread ID in the current and all previous + // hash tables. If it's not found, it must not be in there yet, since + // this same thread would have added it previously to one of the tables + // that we traversed. + + // Code and algorithm adapted from + // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings + // (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) + { + // Look for the id in this hash + auto index = hashedId; + while (true) + { // Not an infinite loop because at least one slot is free in the + // hash table + index &= hash->capacity - 1; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) + { + // Found it! If we had to search several hashes deep, + // though, we should lazily add it to the current main hash + // table to avoid the extended search next time. Note + // there's guaranteed to be room in the current hash table + // since every subsequent table implicitly reserves space + // for all previous tables (there's only one + // implicitProducerHashCount). 
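+                    // Grab the producer pointer first; the key may then be re-published into the main hash just below.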
+ auto value = hash->entries[index].value; + if (hash != mainHash) + { + index = hashedId; + while (true) + { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load( + std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && + mainHash->entries[index] + .key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed)) || + (probedKey == reusable && + mainHash->entries[index] + .key.compare_exchange_strong( + reusable, + id, + std::memory_order_acquire, + std::memory_order_acquire))) + { +#else + if ((probedKey == empty && + mainHash->entries[index] + .key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed))) + { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) + { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add( + 1, std::memory_order_relaxed); + while (true) + { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set( + std::memory_order_acquire)) + { + // We've acquired the resize lock, try to allocate a bigger hash + // table. Note the acquire fence synchronizes with the release + // fence at the end of this block, and hence when we reload + // implicitProducerHash it must be the most recent version (it + // only gets changed within this locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) + { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) + { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)( + sizeof(ImplicitProducerHash) + + std::alignment_of::value - 1 + + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) + { + // Allocation failed + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for( + raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) + { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store( + details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, + std::memory_order_release); + implicitProducerHashResizeInProgress.clear( + std::memory_order_release); + mainHash = newHash; + } + else + { + implicitProducerHashResizeInProgress.clear( + std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that + // we don't have to wait for the next table to finish being + // allocated by another thread (and if we just finished allocating + // above, the condition will always be true) + if (newCount < + (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) + { + bool recycled; + auto producer = static_cast( + recycle_or_create_producer(false, recycled)); + if (producer == nullptr) + { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); + return nullptr; + } 
+ if (recycled) + { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe( + &producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) + { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load( + std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && + mainHash->entries[index].key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed)) || + (probedKey == reusable && + mainHash->entries[index].key.compare_exchange_strong( + reusable, + id, + std::memory_order_acquire, + std::memory_order_acquire))) + { +#else + if ((probedKey == empty && + mainHash->entries[index].key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed))) + { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy + // allocating a new one. We need to wait for the allocating thread + // to finish (if it succeeds, we add, if not, we try to allocate + // ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer *producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != + nullptr); // The thread exit listener is only registered if we + // were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't + // on the current one yet and are trying to add an entry thinking + // there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) + { + auto index = hashedId; + do + { + index &= hash->capacity - 1; + probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) + { + hash->entries[index].key.store(details::invalid_thread_id2, + std::memory_order_release); + break; + } + ++index; + } while (probedKey != + details::invalid_thread_id); // Can happen if the hash has + // changed but we weren't put + // back in it yet, or if we + // weren't added to this hash + // in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void *userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void *aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + 
std::alignment_of::value) + return (Traits::malloc)(size); + else + { + size_t alignment = std::alignment_of::value; + void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); + if (!raw) + return nullptr; + char *ptr = details::align_for( + reinterpret_cast(raw) + sizeof(void *)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void *ptr) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) + : nullptr); + } + + template + static inline U *create_array(size_t count) + { + assert(count > 0); + U *p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U *p, size_t count) + { + if (p != nullptr) + { + assert(count > 0); + for (size_t i = count; i != 0;) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U *create() + { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U *create(A1 &&a1) + { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U *p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + +template +ProducerToken::ProducerToken(ConcurrentQueue &queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) + { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) + : producer(reinterpret_cast *>(&queue) + ->recycle_or_create_producer(true)) +{ + if (producer != nullptr) + { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) +{ + initialOffset = + queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) +{ + initialOffset = + reinterpret_cast *>(&queue) + ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +inline void swap(ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void 
swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // namespace moodycamel + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/src/bthread/parking_lot.cpp b/src/bthread/parking_lot.cpp new file mode 100644 index 0000000000..76ab2b319a --- /dev/null +++ b/src/bthread/parking_lot.cpp @@ -0,0 +1,7 @@ +#include "parking_lot.h" + +namespace bthread { + +butil::atomic ParkingLot::_waiting_worker_count{0}; + +} // namespace bthread \ No newline at end of file diff --git a/src/bthread/parking_lot.h b/src/bthread/parking_lot.h index d42a560e4d..8c4d9c8c8d 100644 --- a/src/bthread/parking_lot.h +++ b/src/bthread/parking_lot.h @@ -57,7 +57,9 @@ class BAIDU_CACHELINE_ALIGNMENT ParkingLot { // Wait for tasks. // If the `expected_state' does not match, wait() may finish directly. void wait(const State& expected_state) { + _waiting_worker_count ++; futex_wait_private(&_pending_signal, expected_state.val, NULL); + _waiting_worker_count --; } // Wakeup suspended wait() and make them unwaitable ever. @@ -65,6 +67,9 @@ class BAIDU_CACHELINE_ALIGNMENT ParkingLot { _pending_signal.fetch_or(1); futex_wake_private(&_pending_signal, 10000); } + + static butil::atomic _waiting_worker_count; + private: // higher 31 bits for signalling, LSB for stopping. butil::atomic _pending_signal; diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index cbae7c5bfa..5a61b3739f 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -39,6 +39,9 @@ namespace bthread { +std::atomic TaskGroup::_resume_rq_cnt{0}; +moodycamel::ConcurrentQueue TaskGroup::_resume_rq(10000); + static const bthread_attr_t BTHREAD_ATTR_TASKGROUP = { BTHREAD_STACKTYPE_UNKNOWN, 0, NULL }; @@ -116,12 +119,25 @@ bool TaskGroup::is_stopped(bthread_t tid) { } bool TaskGroup::wait_task(bthread_t* tid) { + int64_t wait_begin_ms = butil::cpuwide_time_ms(); do { #ifndef BTHREAD_DONT_SAVE_PARKING_STATE if (_last_pl_state.stopped()) { return false; } + + if (pop_resume_task(tid)) { + return true; + } + if (steal_task(tid)) { + return true; + } + if(butil::cpuwide_time_ms() - wait_begin_ms <= 5000){ + continue; + } + _pl->wait(_last_pl_state); + wait_begin_ms = butil::cpuwide_time_ms(); if (steal_task(tid)) { return true; } @@ -192,6 +208,7 @@ TaskGroup::TaskGroup(TaskControl* c) #ifndef NDEBUG , _sched_recursive_guard(0) #endif + ,_resume_consumer_token(_resume_rq) { _steal_seed = butil::fast_rand(); _steal_offset = OFFSET_TABLE[_steal_seed % ARRAY_SIZE(OFFSET_TABLE)]; @@ -513,18 +530,21 @@ TaskStatistics TaskGroup::main_stat() const { void TaskGroup::ending_sched(TaskGroup** pg) { TaskGroup* g = *pg; bthread_t next_tid = 0; - // Find next task to run, if none, switch to idle thread of the group. + + if (!g->pop_resume_task(&next_tid)) { + // Find next task to run, if none, switch to idle thread of the group. 
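+        // bthreads woken from other threads now arrive via the shared _resume_rq (a moodycamel MPMC queue) and are taken first;
+        // only when it is empty do we fall back to the local run queue and work stealing below.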
#ifndef BTHREAD_FAIR_WSQ - // When BTHREAD_FAIR_WSQ is defined, profiling shows that cpu cost of - // WSQ::steal() in example/multi_threaded_echo_c++ changes from 1.9% - // to 2.9% - const bool popped = g->_rq.pop(&next_tid); + // When BTHREAD_FAIR_WSQ is defined, profiling shows that cpu cost of + // WSQ::steal() in example/multi_threaded_echo_c++ changes from 1.9% + // to 2.9% + const bool popped = g->_rq.pop(&next_tid); #else - const bool popped = g->_rq.steal(&next_tid); + const bool popped = g->_rq.steal(&next_tid); #endif - if (!popped && !g->steal_task(&next_tid)) { - // Jump to main task if there's no task to run. - next_tid = g->_main_tid; + if (!popped && !g->steal_task(&next_tid)) { + // Jump to main task if there's no task to run. + next_tid = g->_main_tid; + } } TaskMeta* const cur_meta = g->_cur_meta; @@ -554,15 +574,18 @@ void TaskGroup::ending_sched(TaskGroup** pg) { void TaskGroup::sched(TaskGroup** pg) { TaskGroup* g = *pg; bthread_t next_tid = 0; - // Find next task to run, if none, switch to idle thread of the group. + + if (!g->pop_resume_task(&next_tid)) { + // Find next task to run, if none, switch to idle thread of the group. #ifndef BTHREAD_FAIR_WSQ - const bool popped = g->_rq.pop(&next_tid); + const bool popped = g->_rq.pop(&next_tid); #else - const bool popped = g->_rq.steal(&next_tid); + const bool popped = g->_rq.steal(&next_tid); #endif - if (!popped && !g->steal_task(&next_tid)) { - // Jump to main task if there's no task to run. - next_tid = g->_main_tid; + if (!popped && !g->steal_task(&next_tid)) { + // Jump to main task if there's no task to run. + next_tid = g->_main_tid; + } } sched_to(pg, next_tid); } @@ -652,7 +675,7 @@ void TaskGroup::destroy_self() { void TaskGroup::ready_to_run(bthread_t tid, bool nosignal) { push_rq(tid); - if (nosignal) { + if (nosignal || ParkingLot::_waiting_worker_count == 0) { ++_num_nosignal; } else { const int additional_signal = _num_nosignal; @@ -672,24 +695,37 @@ void TaskGroup::flush_nosignal_tasks() { } void TaskGroup::ready_to_run_remote(bthread_t tid, bool nosignal) { - _remote_rq._mutex.lock(); - while (!_remote_rq.push_locked(tid)) { - flush_nosignal_tasks_remote_locked(_remote_rq._mutex); - LOG_EVERY_SECOND(ERROR) << "_remote_rq is full, capacity=" - << _remote_rq.capacity(); + while (!push_resume_task(tid)) { + LOG_EVERY_SECOND(ERROR) << "push_resume_rq fail"; ::usleep(1000); - _remote_rq._mutex.lock(); } - if (nosignal) { + if (nosignal || ParkingLot::_waiting_worker_count == 0) { ++_remote_num_nosignal; - _remote_rq._mutex.unlock(); } else { const int additional_signal = _remote_num_nosignal; _remote_num_nosignal = 0; _remote_nsignaled += 1 + additional_signal; - _remote_rq._mutex.unlock(); _control->signal_task(1 + additional_signal); } + + // _remote_rq._mutex.lock(); + // while (!_remote_rq.push_locked(tid)) { + // flush_nosignal_tasks_remote_locked(_remote_rq._mutex); + // LOG_EVERY_SECOND(ERROR) << "_remote_rq is full, capacity=" + // << _remote_rq.capacity(); + // ::usleep(1000); + // _remote_rq._mutex.lock(); + // } + // if (nosignal) { + // ++_remote_num_nosignal; + // _remote_rq._mutex.unlock(); + // } else { + // const int additional_signal = _remote_num_nosignal; + // _remote_num_nosignal = 0; + // _remote_nsignaled += 1 + additional_signal; + // _remote_rq._mutex.unlock(); + // _control->signal_task(1 + additional_signal); + // } } void TaskGroup::flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex) { diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index 
2a1bb2a93d..8e1193501f 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -30,6 +30,8 @@ #include "butil/resource_pool.h" // ResourceId #include "bthread/parking_lot.h" +#include "thirdparty/moodycamelqueue.h" + namespace bthread { // For exiting a bthread. @@ -182,6 +184,9 @@ class TaskGroup { // process make go on indefinitely. void push_rq(bthread_t tid); + bool pop_resume_task(bthread_t* tid); + bool push_resume_task(bthread_t tid); + private: friend class TaskControl; @@ -249,6 +254,10 @@ friend class TaskControl; int _remote_nsignaled; int _sched_recursive_guard; + + static std::atomic _resume_rq_cnt; + static moodycamel::ConcurrentQueue _resume_rq; + moodycamel::ConsumerToken _resume_consumer_token; }; } // namespace bthread diff --git a/src/bthread/task_group_inl.h b/src/bthread/task_group_inl.h index 45626ceb49..de42add385 100644 --- a/src/bthread/task_group_inl.h +++ b/src/bthread/task_group_inl.h @@ -97,6 +97,27 @@ inline void TaskGroup::push_rq(bthread_t tid) { } } +inline bool TaskGroup::pop_resume_task(bthread_t* tid) { + int tmp_cnt = _resume_rq_cnt.load(std::memory_order_relaxed); + if (tmp_cnt>0 && _resume_rq_cnt.compare_exchange_strong(tmp_cnt, tmp_cnt-1)){ + if(_resume_rq.try_dequeue(_resume_consumer_token, *tid)){ + return true; + } + else { + _resume_rq_cnt ++; + } + } + return false; +} + +inline bool TaskGroup::push_resume_task(bthread_t tid){ + if(_resume_rq.enqueue(tid)){ + _resume_rq_cnt ++; + return true; + } + return false; +} + inline void TaskGroup::flush_nosignal_tasks_remote() { if (_remote_num_nosignal) { _remote_rq._mutex.lock(); diff --git a/src/thirdparty/moodycamelqueue.h b/src/thirdparty/moodycamelqueue.h new file mode 100644 index 0000000000..d0d042f6b3 --- /dev/null +++ b/src/thirdparty/moodycamelqueue.h @@ -0,0 +1,5255 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free +// queue. An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2020, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +// Also dual-licensed under the Boost Software License (see LICENSE.md) + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +// VS2019 with /W4 warns about constant conditional expressions but unless +// /std=c++17 or higher does not support `if constexpr`, so we have no choice +// but to simply disable the warning +#pragma warning(push) +#pragma warning(disable : 4127) // conditional expression is constant +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. We'll override the default trait malloc ourselves +// without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. 
+#include +#endif +#include +#include +#include // for CHAR_BIT +#include // for max_align_t +#include +#include +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include +#include + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel +{ +namespace details +{ +template +struct thread_id_converter +{ + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const &x) + { + return x; + } +}; +} // namespace details +} // namespace moodycamel +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel +{ +namespace details +{ +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() +{ + return rl::thread_index(); +} +} // namespace details +} // namespace moodycamel +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the +// function we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( + void); +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. +static inline thread_id_t thread_id() +{ + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel +{ +namespace details +{ +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined +// anyway, which it won't be. 
+static inline thread_id_t thread_id() +{ + return std::this_thread::get_id(); +} + +template +struct thread_id_size +{ +}; +template <> +struct thread_id_size<4> +{ + typedef std::uint32_t numeric_t; +}; +template <> +struct thread_id_size<8> +{ + typedef std::uint64_t numeric_t; +}; + +template <> +struct thread_id_converter +{ + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const &x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } +}; +} +} +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a +// thread-local static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel +{ +namespace details +{ +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus + // it's not aligned. +inline thread_id_t thread_id() +{ + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ + __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ + (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw(expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when +// it shouldn't :-( We have to assume *all* non-trivial constructors may throw +// on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? 
std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value || \ + std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value || \ + std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a +// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't +// support thread_local either. Finally, iOS/ARM doesn't have support for it +// either, and g++/ARM allows it to compile but it's unconfirmed to actually +// work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 +// compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now +// since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link +// error will be generated if the function is called. 
+#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel +{ +namespace details +{ +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant +// literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + typename details::Vs2013Aligned::value, T>::type +template +struct Vs2013Aligned +{ +}; // default, unsupported alignment +template +struct Vs2013Aligned<1, T> +{ + typedef __declspec(align(1)) T type; +}; +template +struct Vs2013Aligned<2, T> +{ + typedef __declspec(align(2)) T type; +}; +template +struct Vs2013Aligned<4, T> +{ + typedef __declspec(align(4)) T type; +}; +template +struct Vs2013Aligned<8, T> +{ + typedef __declspec(align(8)) T type; +}; +template +struct Vs2013Aligned<16, T> +{ + typedef __declspec(align(16)) T type; +}; +template +struct Vs2013Aligned<32, T> +{ + typedef __declspec(align(32)) T type; +}; +template +struct Vs2013Aligned<64, T> +{ + typedef __declspec(align(64)) T type; +}; +template +struct Vs2013Aligned<128, T> +{ + typedef __declspec(align(128)) T type; +}; +template +struct Vs2013Aligned<256, T> +{ + typedef __declspec(align(256)) T type; +}; +#else +template +struct identity +{ + typedef T type; +}; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} // namespace details +} // namespace moodycamel + +// TSAN can false report races in lock-free code. To enable TSAN to be used +// from projects that use this one, we can apply per-function compile-time +// suppression. See +// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel +{ +namespace details +{ +#if defined(__GNUC__) +static inline bool(likely)(bool x) +{ + return __builtin_expect((x), true); +} +static inline bool(unlikely)(bool x) +{ + return __builtin_expect((x), false); +} +#else +static inline bool(likely)(bool x) +{ + return x; +} +static inline bool(unlikely)(bool x) +{ + return x; +} +#endif +} // namespace details +} // namespace moodycamel + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel +{ +namespace details +{ +template +struct const_numeric_max +{ + static_assert(std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - + static_cast(1) + : static_cast(-1); +}; + +#if defined(__GLIBCXX__) +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else +typedef std::max_align_t std_max_align_t; // Others (e.g. 
MSVC) insist it can + // *only* be accessed via std:: +#endif + +// Some platforms have incorrectly set max_align_t to a type with <8 bytes +// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit +// iOS). Work around this with our own union. See issue #64. +typedef union +{ + std_max_align_t x; + long long y; + void *z; +} max_align_t; +} // namespace details + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of + // elements you expect to hold at once, especially if you have a high + // turnover rate; for example, on 32-bit x86, if you expect to have over a + // hundred million elements or pump several million elements through your + // queue in a very short space of time, using a 32-bit type *may* trigger a + // race condition. A 64-bit int type is recommended in that case, and in + // practice will prevent a race condition no matter the usage of the queue. + // Note that whether the queue is lock-free with a 64-int type depends on + // the whether std::atomic is lock-free, which is + // platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few + // elements but many producers, a smaller block size should be favoured. For + // few producers and/or many elements, a larger block size is preferred. A + // sane default is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per + // element. For large block sizes, this is too inefficient, and switching to + // an atomic counter-based approach is faster. The switch is made for block + // sizes strictly larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit + // producers. Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit + // production (using the enqueue methods without an explicit producer token) + // is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a + // token) must consume before it causes all consumers to rotate and move on + // to the next internal queue. 
+ static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a + // sub-queue. Enqueue operations that would cause this limit to be surpassed + // will fail. Note that this limit is enforced at the block level (for + // performance reasons), i.e. it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try + // 0-100). Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like + // std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc(size_t size) + { + return malloc(size); + } + static inline void WORKAROUND_free(void *ptr) + { + return free(ptr); + } + static inline void *(malloc) (size_t size) + { + return WORKAROUND_malloc(size); + } + static inline void(free)(void *ptr) + { + return WORKAROUND_free(ptr); + } +#else + static inline void *malloc(size_t size) + { + return std::malloc(size); + } + static inline void free(void *ptr) + { + return std::free(ptr); + } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc(size_t size) + { + return rl::rl_malloc(size, $); + } + static inline void free(void *ptr) + { + return rl::rl_free(ptr, $); + } +#endif +}; + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). 
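+// A minimal usage sketch of the guidance above (illustrative only, assuming a
+// queue of ints; enqueue_bulk, try_dequeue_bulk and enqueue are the public API
+// of this header):
+//   moodycamel::ConcurrentQueue<int> q;
+//   moodycamel::ProducerToken ptok(q);
+//   int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+//   q.enqueue_bulk(ptok, in, 8);                  // (1) bulk enqueue with a token
+//   moodycamel::ConsumerToken ctok(q);
+//   int out[8];
+//   size_t n = q.try_dequeue_bulk(ctok, out, 8);  // (1) bulk dequeue with a token
+//   q.enqueue(42);                                // (4) single-item, tokenless fallback
+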
+struct ProducerToken; +struct ConsumerToken; + +template +class ConcurrentQueue; +template +class BlockingConcurrentQueue; +class ConcurrentQueueTests; + +namespace details +{ +struct ConcurrentQueueProducerTypelessBase +{ + ConcurrentQueueProducerTypelessBase *next; + std::atomic inactive; + ProducerToken *token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } +}; + +template +struct _hash_32_or_64 +{ + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see + // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is + // propagate that uniqueness evenly across all the bits, so that we can + // use a subset of the bits while reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> +struct _hash_32_or_64<1> +{ + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> +{ +}; + +static inline size_t hash_thread_id(thread_id_t id) +{ + static_assert( + sizeof(thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast( + hash_32_or_64::thread_id_hash_t)>:: + hash(thread_id_converter::prehash(id))); +} + +template +static inline bool circular_less_than(T a, T b) +{ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4554) +#endif + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned integer " + "types"); + return static_cast(a - b) > + static_cast(static_cast(1) + << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} + +template +static inline char *align_for(char *ptr) +{ + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast(ptr) % alignment)) % + alignment; +} + +template +static inline T ceil_to_pow_2(T x) +{ + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer " + "types"); + + // Adapted from + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) + { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic &left, std::atomic &right) +{ + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), + std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); +} + +template +static inline T const &nomove(T const &x) +{ + return x; +} + +template +struct nomove_if +{ + template + static inline T const &eval(T const &x) + { + return x; + } +}; + +template <> +struct nomove_if +{ + template + static inline auto eval(U &&x) -> decltype(std::forward(x)) + { + return std::forward(x); + } +}; + +template +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) +{ + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : 
std::is_trivially_destructible +{ +}; +#else +template +struct is_trivially_destructible : std::has_trivial_destructor +{ +}; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else +struct ThreadExitListener +{ + typedef void (*callback_t)(void *); + callback_t callback; + void *userData; + + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier +}; + +class ThreadExitNotifier +{ +public: + static void subscribe(ThreadExitListener *listener) + { + auto &tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener *listener) + { + auto &tlsInst = instance(); + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) + { + if (ptr == listener) + { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + +private: + ThreadExitNotifier() : tail(nullptr) + { + } + ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier &operator=(ThreadExitNotifier const &) + MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && + "If this assert fails, you likely have a buggy compiler! Change " + "the preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) + { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + +private: + ThreadExitListener *tail; +}; +#endif +#endif + +template +struct static_is_lock_free_num +{ + enum + { + value = 0 + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_CHAR_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_SHORT_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_INT_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_LONG_LOCK_FREE + }; +}; +template <> +struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_LLONG_LOCK_FREE + }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> +{ +}; +template <> +struct static_is_lock_free +{ + enum + { + value = ATOMIC_BOOL_LOCK_FREE + }; +}; +template +struct static_is_lock_free +{ + enum + { + value = ATOMIC_POINTER_LOCK_FREE + }; +}; +} // namespace details + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue &queue); + + template + explicit ProducerToken(BlockingConcurrentQueue &queue); + + ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) + { + producer->token = this; + } + } + + inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) + { + producer->token = this; + } + if (other.producer != nullptr) + { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // 
(Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const + { + return producer != nullptr; + } + + ~ProducerToken() + { + if (producer != nullptr) + { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase *producer; +}; + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue &q); + + template + explicit ConsumerToken(BlockingConcurrentQueue &q); + + ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + +private: + template + friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. 
+// See +// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4307) // + integral constant overflow (that's what + // the ternary expression is for!) +#pragma warning(disable : 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value - + static_cast(Traits::MAX_SUBQUEUE_SIZE) < + BLOCK_SIZE) + ? details::const_numeric_max::value + : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + + (BLOCK_SIZE - 1)) / + BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && + !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & + (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && + !(EXPLICIT_INITIAL_INDEX_SIZE & + (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " + "(and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && + !(IMPLICIT_INITIAL_INDEX_SIZE & + (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " + "(and greater than 1)"); + static_assert( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || + !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || + INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at " + "least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be 
inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated up-front, which means only a single producer will be able to + // enqueue elements without an extra allocation -- blocks aren't shared + // between producers). This method is not thread safe -- it is up to the + // user to ensure that the queue is fully constructed before it starts being + // used by other threads (this includes making the memory effects of + // construction visible, possibly with a memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list( + capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, + size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
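+    // For example (an illustrative pattern, not from the original header):
+    // if worker threads were spawned with std::thread and share this queue,
+    // join all of them before the queue object is destroyed:
+    //
+    //   for (std::thread &t : workers) t.join();
+    //   // only now may the ConcurrentQueue itself be destroyed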
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) + { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) + { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) + { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) + { + auto prev = hash->prev; + if (prev != nullptr) + { // The last hash is part of this object and was not allocated + // dynamically + for (size_t i = 0; i != hash->capacity; ++i) + { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) + { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) + { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue &operator=(ConcurrentQueue const &) + MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty + // queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store( + other.explicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store( + other.implicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue &operator=(ConcurrentQueue 
&&other) + MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue &swap_internal(ConcurrentQueue &other) + { + if (this == &other) + { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, + other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, + other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const &item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T &&item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const &token, T const &item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Allocates memory if required. Only fails if memory + // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would + // be surpassed). Thread-safe. + inline bool enqueue(producer_token_t const &token, T &&item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: + // Use std::make_move_iterator if the elements should be moved instead of + // copied. 
Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). Thread-safe. + inline bool try_enqueue(T const &item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T &&item) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T const &item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Does not allocate memory. Fails if not enough room to + // enqueue. Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T &&item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const &token, + It itemFirst, + size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U &item) + { + // Instead of simply trying each producer in turn (which could cause + // needless contention on the first producer), we score them + // heuristically. 
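+        // (Concretely: sample producers until up to three non-empty ones
+        // have been seen, remember the one with the largest approximate
+        // size, try that one first, and only fall back to scanning every
+        // producer if that first attempt comes up empty.)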
+ size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; + ptr = ptr->next_prod()) + { + auto size = ptr->size_approx(); + if (size > 0) + { + if (size > bestSize) + { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's + // been tried + if (nonEmptyCount > 0) + { + if ((details::likely)(best->dequeue(item))) + { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr != best && ptr->dequeue(item)) + { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall + // throughput under contention, but will give more predictable results in + // single-threaded consumer scenarios. This is mostly only useful for + // internal unit tests. Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U &item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr->dequeue(item)) + { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
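+    // Illustrative consumer-loop sketch (not part of the original header);
+    // the queue `q`, the atomic `running` flag and the `process()` callback
+    // are assumptions made for the example:
+    //
+    //   moodycamel::ConsumerToken ctok(q);
+    //   int item;
+    //   while (running.load(std::memory_order_relaxed))
+    //   {
+    //       if (q.try_dequeue(ctok, item))
+    //       {
+    //           process(item);
+    //       }
+    //   }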
+ template + bool try_dequeue(consumer_token_t &token, U &item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the + // global offset) -> this means the highest efficiency consumer dictates + // the rotation speed of everyone else, more or less If you see that the + // global offset has changed, you must reset your consumption counter + // and move to your designated place If there's no items where you're + // supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to + // consume, move over from your current position until you find an + // producer with something in it + + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + if (!update_current_producer_after_rotation(token)) + { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's + // been tried + if (static_cast(token.currentProducer)->dequeue(item)) + { + if (++token.itemsConsumedFromCurrent == + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) + { + globalExplicitConsumerOffset.fetch_add( + 1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = + static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) + { + if (ptr->dequeue(item)) + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) + { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. Returns the number of items actually dequeued. Returns 0 + // if all producer streams appeared empty at the time they were checked (so, + // the queue is likely but not guaranteed to be empty). Never allocates. + // Thread-safe. 
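+    // (The bulk overload below reuses the rotation bookkeeping of the token
+    // overload of try_dequeue above: items dequeued in bulk count towards
+    // the same per-producer consumption quota before the global offset is
+    // advanced.)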
+ template + size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + if (!update_current_producer_after_rotation(token)) + { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) + { + if ((token.itemsConsumedFromCurrent += static_cast( + max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) + { + globalExplicitConsumerOffset.fetch_add( + 1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = + static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) + { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = + static_cast(dequeued); + } + if (dequeued == max) + { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) + { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const &producer, + U &item) + { + return static_cast(producer.producer) + ->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. Returns the number of items actually dequeued. If you happen to + // know which producer you want to dequeue from, this is significantly + // faster than using the general-case try_dequeue methods. Returns 0 if the + // producer's queue appeared empty at the time it was checked (so, the queue + // is likely but not guaranteed to be empty). Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer( + producer_token_t const &producer, It itemFirst, size_t max) + { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the + // queue. This estimate is only accurate if the queue has completely + // stabilized before it is called (i.e. all enqueue and dequeue operations + // have completed and their memory effects are visible on the calling + // thread, and no further operations start while this method is being + // called). Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + size += ptr->size_approx(); + } + return size; + } + + bool is_empty() const + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr->size_approx() > 0) + { + return false; + } + } + + return true; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
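+    // For example (illustrative only): a start-up sanity check on platforms
+    // where lock-freedom is a hard requirement might look like
+    //
+    //   assert(moodycamel::ConcurrentQueue<int>::is_lock_free());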
+ static bool is_lock_free() + { + return details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free< + typename details::thread_id_converter:: + thread_id_numeric_size_t>::value == 2; + } + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode + { + CanAlloc, + CannotAlloc + }; + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const &token, U &&element) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue( + std::forward(element)); + } + + template + inline bool inner_enqueue(U &&element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const &token, + It itemFirst, + size_t count) + { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk< + canAlloc>(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t &token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) + { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) + { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from + // end -- subtract from count first + std::uint32_t offset = + prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) + { + token.desiredProducer = + static_cast(token.desiredProducer) + ->next_prod(); + if (token.desiredProducer == nullptr) + { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) + { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) + { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) + { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) + { + } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the + // world under heavy contention, but simple and correct (assuming nodes are + // never freed until after the free list is destroyed), and fairly speedy + // under low contention. + template // N must inherit FreeListNode or have the same + // fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) + { + } + FreeList(FreeList &&other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) + { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList &other) + { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N *node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so + // it's safe to set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) + { + // Oh look! We were the last ones referencing this node, and we + // know we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N *try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) + { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, + refs + 1, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at + // zero), which means we can read the next and not worry about + // it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong( + head, + next, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + // Yay, got the node. 
This means it was on the list, which + // means shouldBeOnFreeList must be false no matter the + // refcount (because nobody else knows it's been taken off + // yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for + // the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to + // decrease the refcount we increased. Note that we don't need + // to release any memory effects, but we do need to ensure that + // the reference count decrement happens-after the CAS on the + // head. + refs = prevHead->freeListRefs.fetch_sub( + 1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) + { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N *head_unsafe() const + { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N *node) + { + // Since the refcount is zero, and nobody can increase it once it's + // zero (except us, and we run only one copy of this method per node + // at a time, i.e. the single thread case), then we know we can + // safely change the next pointer of the node; however, once the + // refcount is back above zero, then other threads could increase it + // (happens under heavy contention, when the refcount goes to zero + // in between a load and a refcount increment of a node in try_get, + // then back up to something non-zero, then the refcount increment + // is done by the other thread) -- so, if the CAS to add the node to + // the actual list fails, decrease the refcount and leave the add + // operation to the next thread who puts the refcount back at zero + // (which could be us, hence the loop). 
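+            // (In short: publish the node as the new head with a refcount
+            // of 1 -- the list's own reference -- and if the CAS loses,
+            // retry only if this thread also sees the refcount return to
+            // zero; otherwise whichever thread does will perform the add.)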
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) + { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong( + head, + node, + std::memory_order_release, + std::memory_order_relaxed)) + { + // Hmm, the add failed, but we can only try again when the + // refcount goes back to zero + if (node->freeListRefs.fetch_add( + SHOULD_BE_ON_FREELIST - 1, + std::memory_order_release) == 1) + { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes + // are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext + { + implicit_context = 0, + explicit_context = 1 + }; + + struct Block + { + Block() + : next(nullptr), + elementsCompletelyDequeued(0), + freeListRefs(0), + freeListNext(nullptr), + shouldBeOnFreeList(false), + dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) + { + if (!emptyFlags[i].load(std::memory_order_relaxed)) + { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that + // happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else + { + // Check counter + if (elementsCompletelyDequeued.load( + std::memory_order_relaxed) == BLOCK_SIZE) + { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load( + std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & static_cast( + BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + 1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no + // wrapping and count > 0). Returns true if the block is now empty (does + // not apply in explicit context). 
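+        // (Note: in the flag-based explicit path the flags are stored in
+        // reverse order -- element index i maps to
+        // emptyFlags[BLOCK_SIZE - 1 - i] -- which is why the index
+        // arithmetic below runs backwards.)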
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, + size_t count) + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) + { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else + { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, + std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) + { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) + { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else + { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast(static_cast(elements)) + + static_cast(idx & + static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert(std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment " + "greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic + emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD + ? 
BLOCK_SIZE + : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert(std::alignment_of::value >= + std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; + +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) + : tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() + { + } + + template + inline bool dequeue(U &element) + { + if (isExplicit) + { + return static_cast(this)->dequeue(element); + } + else + { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It &itemFirst, size_t max) + { + if (isExplicit) + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + else + { + return static_cast(this)->dequeue_bulk( + itemFirst, max); + } + } + + inline ProducerBase *next_prod() const + { + return static_cast(next); + } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? static_cast(tail - head) + : 0; + } + + inline index_t getTail() const + { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) + { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index( + 0); // This creates an index with double the number of current + // entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
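+            // (The teardown below runs in three passes: locate the block
+            // that is only partially dequeued, if any, and destruct the
+            // remaining elements; then return or destroy every block this
+            // producer owns; finally free the chain of block-index headers.)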
+ if (this->tailBlock != nullptr) + { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) + { + // The head's not on a block boundary, meaning a block + // somewhere is partially dequeued (or the head block is the + // tail block and was fully dequeued, but the head/tail are + // still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) + { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop + // gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do + { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) + { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) + { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the + // tail block, we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? BLOCK_SIZE + : static_cast( + this->tailIndex.load( + std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) + { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) + { + auto block = this->tailBlock; + do + { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) + { + destroy(block); + } + else + { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) + { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U &&element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block:: + template reset_empty(); + + // We'll put the block on the block index (guaranteed to be + // room since we're conceptually removing the last block + // from it first -- except instead of removing then adding, + // we can just overwrite). 
Note that there must be a valid + // block index here, since even if allocation failed in the + // ctor, it would have been re-attempted when adding the + // first block to the queue; since there is such a block, a + // block index must have been successfully allocated. + } + else + { + // Whatever head value we see here is >= the last value we + // saw here (relatively), and <= its current value. Since we + // have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than( + currentTailIndex, head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + // We can't enqueue in another block because there's not + // enough leeway -- the tail could surpass the head by + // the time the block fills up! (Or we'll exceed the + // size limit, if the second part of the condition was + // true.) + return false; + } + // We're going to need a new block; check that the block + // index has room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) + { + // Hmm, the circular block index is already full -- + // we'll need to allocate a new index. Note + // pr_blockIndexRaw can only be nullptr if the initial + // allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) + { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + // The constructor may throw. We want the element not to + // appear in the queue in that case (without corrupting the + // queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) + { + // Revert change to the current block, but leave the new + // block available for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
this->tailBlock + : startBlock; + MOODYCAMEL_RETHROW; + } + } + else + { + (void) startBlock; + (void) originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed) + ->front.store(pr_blockIndexFront, + std::memory_order_release); + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load( + std::memory_order_relaxed) - + overcommit, + tail)) + { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the + // common case when the queue is empty and the values are + // eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, + // they are not going to change (unless we change them) and must + // be the same value at this point (inside the if) as when the + // if condition was evaluated. + + // We insert an acquire fence here to synchronize-with the + // release upon incrementing dequeueOvercommit below. This + // ensures that whatever the value we got loaded into + // overcommit, the load of dequeueOptisticCount in the fetch_add + // below will result in a value at least as recent as that (and + // therefore at least as large). Note that I believe a compiler + // (signal) fence here would be sufficient due to the nature of + // fetch_add (all read-modify-write operations are guaranteed to + // work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only + // the C++11 standard. See + // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the + // boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= + // dequeueOptimisticCount (because dequeueOvercommit is only + // ever incremented after dequeueOptimisticCount -- this is + // enforced in the `else` block below), and since we now have a + // version of dequeueOptimisticCount that is at least as recent + // as overcommit (due to the release upon incrementing + // dequeueOvercommit and the acquire above that synchronizes + // with it), overcommit <= myDequeueCount. However, we can't + // assert this since both dequeueOptimisticCount and + // dequeueOvercommit may (independently) overflow; in such a + // case, though, the logic still holds since the difference + // between the two is maintained. 
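+                // In effect, the number of elements consumed so far is
+                // (dequeueOptimisticCount - dequeueOvercommit), and the check
+                // against tail is only a cheap "probably non-empty" test.
+                // Illustrative numbers: with tail == 8,
+                // dequeueOptimisticCount == 10 and dequeueOvercommit == 3,
+                // the effective count is 10 - 3 == 7, so at most one more
+                // element can be claimed; if the reservation above overshoots
+                // tail, the else branch below bumps dequeueOvercommit to hand
+                // the slot back.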
+ + // Note that we reload tail here in case it changed; it will be + // the same value as before or greater, since this load is + // sequenced after (happens after) the earlier load above. This + // is supported by read-read coherency (as defined in the + // standard), explained here: + // http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) + { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be + // at least one element, this will never exceed tail. We + // need to do an acquire-release fence here since it's + // possible that whatever condition got us to this point was + // for an earlier enqueued element (that we already see the + // memory effects for), but that by the time we increment + // somebody else has incremented it, and we need to see the + // memory effects for *that* element, which is in such a + // case is necessarily visible on the thread that + // incremented it in the first place with the more current + // condition (they must have acquired a tail that is at + // least as recent). + auto index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + + auto localBlockIndex = + blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing + // because of index wrap-around. When an index wraps, we + // need to preserve the sign of the offset when dividing it + // by the block size (in order to get a correct signed block + // count offset in all cases): + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = + index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + blockBaseIndex - headBase) / + BLOCK_SIZE); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) & + (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T &&, element = std::move(el))) + { + // Make sure the element is still fully dequeued and + // destroyed even if the assignment throws + struct Guard + { + Block *block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block:: + template set_empty(index); + } + } guard = {block, index}; + + element = std::move(el); // NOLINT + } + else + { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty< + explicit_context>(index); + } + + return true; + } + else + { + // Wasn't anything to dequeue after all; make the effective + // dequeue count eventually consistent + this->dequeueOvercommit.fetch_add( + 1, + std::memory_order_release); // Release so that the + // fetch_add on + // dequeueOptimisticCount + // is guaranteed to happen + // before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of + // the elements; this means pre-allocating blocks and putting them + // in the block index (but only if all the allocations succeeded). 
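+            // Roughly, the three phases below are: (1) reuse any empty blocks
+            // already linked ahead of tailBlock, (2) requisition the remaining
+            // blocks from the parent's pools and register them in the circular
+            // block index (growing the index if it is full), and (3) construct
+            // the elements block by block, publishing the new index front once
+            // all allocations have succeeded and tailIndex (release store)
+            // once the elements are in place.
+            // Illustrative caller-side sketch (public API only, not part of
+            // this producer):
+            //   moodycamel::ConcurrentQueue<int> q;
+            //   moodycamel::ProducerToken tok(q);
+            //   int vals[3] = {1, 2, 3};
+            //   q.enqueue_bulk(tok, vals, 3);  // routed to an ExplicitProducer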
+ index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty()) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than( + currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) + { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + // Failed to allocate, undo changes (but keep + // injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + else if (full || + !new_block_index(originalBlockIndexSlotsUsed)) + { + // Failed to allocate, undo changes (but keep + // injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, + // so we need to update our fallback value too (since we + // keep the new index even if we later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue:: + template requisition_block(); + if (newBlock == nullptr) + { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
firstAllocatedBlock + : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) + { + newBlock->next = newBlock; + } + else + { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's + // emptiness before we fill them up, and publish the new block + // index front + auto block = firstAllocatedBlock; + while (true) + { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) + { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + blockIndex.load(std::memory_order_relaxed) + ->front.store( + (pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + // Must use copy constructor even if move + // constructor is available because we may have to + // revert if there's an exception. Sorry about the + // horrible templated next line, but it was the only + // way to disable moving *at compile time*, which is + // important because a type may only define a + // (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if + // branch that will never be executed + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + // Oh dear, an exception's been thrown -- destroy the + // elements that were enqueued so far and revert the + // entire bulk operation (we'll keep any allocated + // blocks in our linked list for later, though). 
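+                        // Rollback outline: restore the producer-local index
+                        // cursors and tailBlock to their pre-call values,
+                        // destroy everything constructed in
+                        // [startTailIndex, constructedStopIndex), then
+                        // rethrow. tailIndex is only advanced at the very end
+                        // of this function, so consumers cannot have observed
+                        // any of the partially enqueued items.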
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store( + (pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It &itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this will never + // exceed tail. 
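+                // The fetch_add below claims the contiguous range
+                // [firstIndex, firstIndex + actualCount) in one step; the
+                // loop that follows walks that range block by block via the
+                // block index, moving each element out through itemFirst and
+                // marking its slot empty so that fully drained blocks can be
+                // recycled.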
+ auto firstIndex = this->headIndex.fetch_add( + actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = + blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & + (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do + { + auto firstIndexInBlock = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, + T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + // It's too late to revert the dequeue, but we + // can make sure that all the dequeued objects + // are properly destroyed and the block index + // (and empty count) are properly updated before + // we propagate the exception + do + { + block = localBlockIndex->entries[indexIndex] + .block; + while (index != endIndex) + { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block:: + template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast( + endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & + (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? 
firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = + (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + // Wasn't anything to dequeue after all; make the effective + // dequeue count eventually consistent + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block *block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast( + (Traits::malloc)(sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) + { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) + { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + prevBlockSizeMask; + do + { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the + // old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in + // referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer *nextExplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all + // enqueue/dequeue operations completed already; this means that all + // undequeued elements are placed contiguously across contiguous + // blocks, and that only the first and last remaining blocks can be + // only partially empty (all other remaining blocks must be + // completely full). 
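+            // Consequently the destructor only has to walk [headIndex,
+            // tailIndex) once, destroying each remaining element, returning
+            // every finished block to the parent's free list, and finally
+            // freeing the chain of block index headers.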
+ +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) + { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block *block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) + // block will not be freed + while (index != tail) + { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) + { + if (block != nullptr) + { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on + // the free list (unless the head index reached the end of it, in + // which case the tail will be poised to create a new block). + if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) + { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) + { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) + { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do + { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U &&element) + { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) + { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, + head)); + if (!details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head))) + { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry(idxEntry, + currentTailIndex)) + { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>(); + if (newBlock == nullptr) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + // May throw, try to insert now before we publish the fact + // that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
+ { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF( + !MOODYCAMEL_NOEXCEPT_CTOR(T, + U, + new (static_cast(nullptr)) + T(std::forward(element)))) + { + this->tailIndex.store(newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U &element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load( + std::memory_order_relaxed) - + overcommit, + tail)) + { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) + { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN( + T, T &&, element = std::move(el))) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead + // of only when a block is released is very sub-optimal, + // but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard + { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block:: + template set_empty( + index)) + { + entry->value.store( + nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } + else + { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) + { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool + // (and remove from block index) + entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } + else + { + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); + } + } + + return false; + } + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of + // the elements; this means pre-allocating blocks and putting them + // in the block index (but only if all the allocations succeeded). 
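+            // Compared with the explicit version above, each block is
+            // registered individually in the implicit block index (keyed by
+            // block base index) via insert_block_index_entry, and on failure
+            // the freshly requisitioned blocks are handed back to the
+            // parent's free list instead of being kept on a producer-owned
+            // circular list.
+            // Illustrative caller-side sketch (public, tokenless API is what
+            // ends up in an ImplicitProducer):
+            //   moodycamel::ConcurrentQueue<int> q;
+            //   int vals[3] = {1, 2, 3};
+            //   q.enqueue_bulk(vals, 3);  // uses this thread's ImplicitProducer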
+ + // Note that the tailBlock we start off with may not be owned by us + // any more; this happens if it was filled up exactly to the top + // (setting tailIndex to the first index of the next block which is + // not yet allocated), then dequeued completely (putting it on the + // free list) before we enqueue again. + + index_t startTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do + { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block + // index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but + // compiler can't always tell + Block *newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than( + currentTailIndex, head)); + bool full = + !details::circular_less_than( + head, currentTailIndex + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != + details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < + currentTailIndex - head)); + + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent->ConcurrentQueue:: + template requisition_block()) == + nullptr) + { + // Index allocation or block allocation failed; revert + // any other allocations and index insertions done so + // far for this operation + if (indexInserted) + { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later + // allocations fail, and so that we can find the blocks when + // we do the actual enqueueing + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) + { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
newBlock + : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != + 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) + { + this->tailBlock = firstAllocatedBlock; + } + while (true) + { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, + stopIndex)) + { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, + decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex++]) + T(*itemFirst++); + } + } + else + { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) + { + new ((*this->tailBlock)[currentTailIndex]) + T(details::nomove_if(nullptr)) + T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) + { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) + { + auto block = startBlock; + if ((startTailIndex & + static_cast(BLOCK_SIZE - 1)) == 0) + { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) + { + stopIndex = + (currentTailIndex & + ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than( + constructedStopIndex, stopIndex)) + { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) + { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) + { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & + ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) + { + currentTailIndex += + static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index( + currentTailIndex); + idxEntry->value.store(nullptr, + std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list( + firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) + { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + template + size_t dequeue_bulk(It &itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - + (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) + { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) + { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) + { + this->dequeueOvercommit.fetch_add( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed + // to be at least actualCount elements, this will never + // exceed tail. + auto firstIndex = this->headIndex.fetch_add( + actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do + { + auto blockStartIndex = index; + index_t endIndex = + (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = + entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN( + T, + T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else + { + MOODYCAMEL_TRY + { + while (index != endIndex) + { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) + { + do + { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load( + std::memory_order_relaxed); + while (index != endIndex) + { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block:: + template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast( + endIndex - + blockStartIndex))) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store( + nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list( + block); + } + indexIndex = + (indexIndex + 1) & + (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast( + BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast( + actualCount), + endIndex) + ? firstIndex + static_cast( + actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block:: + template set_many_empty( + blockStartIndex, + static_cast(endIndex - + blockStartIndex))) + { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a + // release, meaning that anybody who acquires + // the block we're about to free can use it + // safely since our writes (and reads!) will + // have happened-before then. 
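+                                // Clearing the entry's value detaches the
+                                // now-empty block from the implicit block
+                                // index; add_block_to_free_list below then
+                                // publishes it for reuse by any producer.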
+ entry->value.store(nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else + { + this->dequeueOvercommit.fetch_add( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, + index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load( + std::memory_order_relaxed); // We're the only writer thread, + // relaxed is OK + if (localBlockIndex == nullptr) + { + return false; // this can happen if new_block_index failed in + // the constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) + { + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) + { + return false; + } + else if (!new_block_index()) + { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry *get_block_index_entry_for_index( + index_t index) const + { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index( + index_t index, BlockIndexHeader *&localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load( + std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may + // wrap around, causing a negative offset, whose negativity we want + // to preserve + auto offset = static_cast( + static_cast::type>( + index - tailBase) / + BLOCK_SIZE); + 
size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load( + std::memory_order_relaxed) == index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = + prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) + { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) + { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do + { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) + { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, + std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store( + (prevCapacity - 1) & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer *nextImplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) + { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) + { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) + { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) + { + return nullptr; + } + + auto index = + initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) + : nullptr; + } + + inline void add_block_to_free_list(Block *block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block *block) + { + while (block != nullptr) + { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block *try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one + // (if applicable) + template + Block *requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) + { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) + { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) + { + return create(); + } + else + { + return nullptr; + } + } + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats + { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue *q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) + { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + bool implicit = + dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = + prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) + { + for (size_t i = 0; i != hash->capacity; ++i) + { + if (hash->index[i]->key.load( + std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load( + std::memory_order_relaxed) != nullptr) + { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) + { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer:: + BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer:: + BlockIndexEntry *); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) + { + // auto block = + // prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else + { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) + { + auto block = tailBlock; + do + { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block:: + template is_empty() || + wasNonEmpty) + { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = + prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) + { + stats.explicitBlockIndexBytes += + sizeof( + typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof( + typename ExplicitProducer::BlockIndexEntry); + index = static_cast< + typename ExplicitProducer::BlockIndexHeader *>( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize - q->initialBlockPoolIndex.load( + std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase *recycle_or_create_producer(bool isExplicit, bool &recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; + ptr = ptr->next_prod()) + { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong( + expected, + /* desired */ false, + std::memory_order_acquire, + std::memory_order_relaxed)) + { + // We caught one! 
It's been marked as activated, the caller + // can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? static_cast( + create(this)) + : create(this)); + } + + ProducerBase *add_producer(ProducerBase *producer) + { + // Handle failed memory allocation + if (producer == nullptr) + { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do + { + producer->next = prevTail; + } while ( + !producerListTail.compare_exchange_weak(prevTail, + producer, + std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) + { + auto prevTailExplicit = + explicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer) + ->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } + else + { + auto prevTailImplicit = + implicitProducers.load(std::memory_order_relaxed); + do + { + static_cast(producer) + ->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, + static_cast(producer), + std::memory_order_release, + std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; + ptr = ptr->next_prod()) + { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer + *value; // No need for atomicity since it's only read by the thread + // that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) + { + } + + ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP &operator=(ImplicitProducerKVP &&other) + MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) + { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap( + typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) + { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + 
implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue &other) + { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + { + return; + } + else + { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, + other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) + { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash *hash; + for (hash = + implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) + { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) + { + other.implicitProducerHash.store( + &other.initialImplicitProducerHash, + std::memory_order_relaxed); + } + else + { + ImplicitProducerHash *hash; + for (hash = other.implicitProducerHash.load( + std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; + hash = hash->prev) + { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer *get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread + // ID), there's a reduced need for fences (memory ordering is already + // consistent for any individual thread), except for the current table + // itself. + + // Start by looking for the thread ID in the current and all previous + // hash tables. If it's not found, it must not be in there yet, since + // this same thread would have added it previously to one of the tables + // that we traversed. + + // Code and algorithm adapted from + // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings + // (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) + { + // Look for the id in this hash + auto index = hashedId; + while (true) + { // Not an infinite loop because at least one slot is free in the + // hash table + index &= hash->capacity - 1; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) + { + // Found it! If we had to search several hashes deep, + // though, we should lazily add it to the current main hash + // table to avoid the extended search next time. Note + // there's guaranteed to be room in the current hash table + // since every subsequent table implicitly reserves space + // for all previous tables (there's only one + // implicitProducerHashCount). 
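+                    // The promotion below copies this thread's entry into the
+                    // newest (main) table: a CAS on the key claims either an
+                    // empty slot or, when thread-local support is enabled, a
+                    // slot left behind by an exited thread, after which the
+                    // value is stored so the next lookup by this thread hits
+                    // the main table directly.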
+ auto value = hash->entries[index].value; + if (hash != mainHash) + { + index = hashedId; + while (true) + { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load( + std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && + mainHash->entries[index] + .key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed)) || + (probedKey == reusable && + mainHash->entries[index] + .key.compare_exchange_strong( + reusable, + id, + std::memory_order_acquire, + std::memory_order_acquire))) + { +#else + if ((probedKey == empty && + mainHash->entries[index] + .key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed))) + { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) + { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add( + 1, std::memory_order_relaxed); + while (true) + { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set( + std::memory_order_acquire)) + { + // We've acquired the resize lock, try to allocate a bigger hash + // table. Note the acquire fence synchronizes with the release + // fence at the end of this block, and hence when we reload + // implicitProducerHash it must be the most recent version (it + // only gets changed within this locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) + { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) + { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)( + sizeof(ImplicitProducerHash) + + std::alignment_of::value - 1 + + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) + { + // Allocation failed + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for( + raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) + { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store( + details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, + std::memory_order_release); + implicitProducerHashResizeInProgress.clear( + std::memory_order_release); + mainHash = newHash; + } + else + { + implicitProducerHashResizeInProgress.clear( + std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that + // we don't have to wait for the next table to finish being + // allocated by another thread (and if we just finished allocating + // above, the condition will always be true) + if (newCount < + (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) + { + bool recycled; + auto producer = static_cast( + recycle_or_create_producer(false, recycled)); + if (producer == nullptr) + { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); + return nullptr; + } 
+ if (recycled) + { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe( + &producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) + { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load( + std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && + mainHash->entries[index].key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed)) || + (probedKey == reusable && + mainHash->entries[index].key.compare_exchange_strong( + reusable, + id, + std::memory_order_acquire, + std::memory_order_acquire))) + { +#else + if ((probedKey == empty && + mainHash->entries[index].key.compare_exchange_strong( + empty, + id, + std::memory_order_relaxed, + std::memory_order_relaxed))) + { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy + // allocating a new one. We need to wait for the allocating thread + // to finish (if it succeeds, we add, if not, we try to allocate + // ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer *producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != + nullptr); // The thread exit listener is only registered if we + // were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't + // on the current one yet and are trying to add an entry thinking + // there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) + { + auto index = hashedId; + do + { + index &= hash->capacity - 1; + probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) + { + hash->entries[index].key.store(details::invalid_thread_id2, + std::memory_order_release); + break; + } + ++index; + } while (probedKey != + details::invalid_thread_id); // Can happen if the hash has + // changed but we weren't put + // back in it yet, or if we + // weren't added to this hash + // in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void *userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void *aligned_malloc(size_t size) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + 
std::alignment_of::value) + return (Traits::malloc)(size); + else + { + size_t alignment = std::alignment_of::value; + void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); + if (!raw) + return nullptr; + char *ptr = details::align_for( + reinterpret_cast(raw) + sizeof(void *)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template + static inline void aligned_free(void *ptr) + { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) + : nullptr); + } + + template + static inline U *create_array(size_t count) + { + assert(count > 0); + U *p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U *p, size_t count) + { + if (p != nullptr) + { + assert(count > 0); + for (size_t i = count; i != 0;) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U *create() + { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U *create(A1 &&a1) + { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U *p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + +template +ProducerToken::ProducerToken(ConcurrentQueue &queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) + { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) + : producer(reinterpret_cast *>(&queue) + ->recycle_or_create_producer(true)) +{ + if (producer != nullptr) + { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) +{ + initialOffset = + queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), + currentProducer(nullptr), + desiredProducer(nullptr) +{ + initialOffset = + reinterpret_cast *>(&queue) + ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +inline void swap(ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void 
swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // namespace moodycamel + +#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) +#pragma warning(pop) +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif From 0145eed037aee6425a52ed52260ad7ed19d64221 Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Thu, 10 Aug 2023 19:19:23 +0800 Subject: [PATCH 02/20] add remote queue size bvar --- src/bthread/task_control.cpp | 25 +++++++++++++++++++++++++ src/bthread/task_control.h | 3 +++ 2 files changed, 28 insertions(+) diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index 15f1d7b693..ceb476ca04 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -30,6 +30,7 @@ #include "bthread/task_group.h" // TaskGroup #include "bthread/task_control.h" #include "bthread/timer_thread.h" // global_timer_thread +#include #include #include "bthread/log.h" @@ -113,6 +114,11 @@ static void print_rq_sizes_in_the_tc(std::ostream &os, void *arg) { tc->print_rq_sizes(os); } +static void print_resume_q_sizes_in_the_tc(std::ostream &os, void *arg) { + TaskControl *tc = (TaskControl *)arg; + tc->print_resume_q_sizes(os); +} + static double get_cumulated_worker_time_from_this(void *arg) { return static_cast(arg)->get_cumulated_worker_time(); } @@ -143,6 +149,7 @@ TaskControl::TaskControl() , _cumulated_signal_count(get_cumulated_signal_count_from_this, this) , _signal_per_second(&_cumulated_signal_count) , _status(print_rq_sizes_in_the_tc, this) + , _resume_q_status(print_resume_q_sizes_in_the_tc, this) , _nbthreads("bthread_count") { // calloc shall set memory to zero @@ -178,6 +185,7 @@ int TaskControl::init(int concurrency) { _switch_per_second.expose("bthread_switch_second"); _signal_per_second.expose("bthread_signal_second"); _status.expose("bthread_group_status"); + _resume_q_status.expose("bthread_group_resume_q_status_"); // Wait for at least one group is added so that choose_one_group() // never returns NULL. @@ -259,6 +267,7 @@ TaskControl::~TaskControl() { _switch_per_second.hide(); _signal_per_second.hide(); _status.hide(); + _resume_q_status.hide(); stop_and_join(); @@ -415,6 +424,22 @@ void TaskControl::print_rq_sizes(std::ostream& os) { } } +void TaskControl::print_resume_q_sizes(std::ostream &os) { + const size_t ngroup = _ngroup.load(butil::memory_order_relaxed); + DEFINE_SMALL_ARRAY(int, nums, ngroup, 128); + { + BAIDU_SCOPED_LOCK(_modify_group_mutex); + // ngroup > _ngroup: nums[_ngroup ... ngroup-1] = 0 + // ngroup < _ngroup: just ignore _groups[_ngroup ... ngroup-1] + for (size_t i = 0; i < ngroup; ++i) { + nums[i] = (_groups[i] ? 
_groups[i]->_resume_rq_cnt.load(std::memory_order_relaxed) : 0); + } + } + for (size_t i = 0; i < ngroup; ++i) { + os << nums[i] << ' '; + } +} + double TaskControl::get_cumulated_worker_time() { int64_t cputime_ns = 0; BAIDU_SCOPED_LOCK(_modify_group_mutex); diff --git a/src/bthread/task_control.h b/src/bthread/task_control.h index e318c26501..c9ca3675ab 100644 --- a/src/bthread/task_control.h +++ b/src/bthread/task_control.h @@ -66,6 +66,8 @@ class TaskControl { void print_rq_sizes(std::ostream& os); + void print_resume_q_sizes(std::ostream& os); + double get_cumulated_worker_time(); int64_t get_cumulated_switch_count(); int64_t get_cumulated_signal_count(); @@ -110,6 +112,7 @@ class TaskControl { bvar::PassiveStatus _cumulated_signal_count; bvar::PerSecond > _signal_per_second; bvar::PassiveStatus _status; + bvar::PassiveStatus _resume_q_status; bvar::Adder _nbthreads; static const int PARKING_LOT_NUM = 4; From b0e2b9ef3aa057ad1b60edb284a0e0c9336a9a42 Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Fri, 11 Aug 2023 13:08:41 +0800 Subject: [PATCH 03/20] add bvar consume command and socket write latency; remove busy loop in wait_task --- src/brpc/policy/redis_protocol.cpp | 10 ++++++++++ src/bthread/task_group.cpp | 8 -------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/brpc/policy/redis_protocol.cpp b/src/brpc/policy/redis_protocol.cpp index 94524e8b75..5e92453ede 100644 --- a/src/brpc/policy/redis_protocol.cpp +++ b/src/brpc/policy/redis_protocol.cpp @@ -33,6 +33,7 @@ #include "brpc/redis.h" #include "brpc/redis_command.h" #include "brpc/policy/redis_protocol.h" +#include "bvar/latency_recorder.h" namespace brpc { @@ -144,6 +145,9 @@ void RedisConnContext::Destroy() { // ========== impl of RedisConnContext ========== +inline bvar::LatencyRecorder socket_write_latency("socket", "write"); +inline bvar::LatencyRecorder consume_cmd_latency("socket", "consume_cmd"); + ParseResult ParseRedisMessage(butil::IOBuf* source, Socket* socket, bool read_eof, const void* arg) { if (read_eof || source->empty()) { @@ -174,22 +178,28 @@ ParseResult ParseRedisMessage(butil::IOBuf* source, Socket* socket, if (err != PARSE_OK) { break; } + int64_t start_time_us = butil::cpuwide_time_us(); if (ConsumeCommand(ctx, current_args, false, &appender) != 0) { return MakeParseError(PARSE_ERROR_ABSOLUTELY_WRONG); } + consume_cmd_latency << (butil::cpuwide_time_us() - start_time_us); current_args.swap(next_args); } + int64_t start_time_us = butil::cpuwide_time_us(); if (ConsumeCommand(ctx, current_args, true /*must be the last message*/, &appender) != 0) { return MakeParseError(PARSE_ERROR_ABSOLUTELY_WRONG); } + consume_cmd_latency << (butil::cpuwide_time_us() - start_time_us); butil::IOBuf sendbuf; appender.move_to(sendbuf); CHECK(!sendbuf.empty()); Socket::WriteOptions wopt; wopt.ignore_eovercrowded = true; + start_time_us = butil::cpuwide_time_us(); LOG_IF(WARNING, socket->Write(&sendbuf, &wopt) != 0) << "Fail to send redis reply"; + socket_write_latency << (butil::cpuwide_time_us() - start_time_us); if(ctx->parser.ParsedArgsSize() == 0) { ctx->arena.clear(); } diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 5a61b3739f..104bd6f5c8 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -119,7 +119,6 @@ bool TaskGroup::is_stopped(bthread_t tid) { } bool TaskGroup::wait_task(bthread_t* tid) { - int64_t wait_begin_ms = butil::cpuwide_time_ms(); do { #ifndef BTHREAD_DONT_SAVE_PARKING_STATE if (_last_pl_state.stopped()) { @@ -129,15 +128,8 @@ bool 
TaskGroup::wait_task(bthread_t* tid) { if (pop_resume_task(tid)) { return true; } - if (steal_task(tid)) { - return true; - } - if(butil::cpuwide_time_ms() - wait_begin_ms <= 5000){ - continue; - } _pl->wait(_last_pl_state); - wait_begin_ms = butil::cpuwide_time_ms(); if (steal_task(tid)) { return true; } From bd5427091227ff51fbf471a2f03d04f293e41a90 Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Fri, 25 Aug 2023 17:15:20 +0800 Subject: [PATCH 04/20] include fix --- src/bthread/task_group.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index 8e1193501f..f29014047c 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -30,7 +30,7 @@ #include "butil/resource_pool.h" // ResourceId #include "bthread/parking_lot.h" -#include "thirdparty/moodycamelqueue.h" +#include "moodycamelqueue.h" namespace bthread { From 8158abdde1f9f7257d6b061fb3a28594eecc27f5 Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Fri, 25 Aug 2023 17:29:22 +0800 Subject: [PATCH 05/20] remove duplicate header --- src/thirdparty/moodycamelqueue.h | 5255 ------------------------------ 1 file changed, 5255 deletions(-) delete mode 100644 src/thirdparty/moodycamelqueue.h diff --git a/src/thirdparty/moodycamelqueue.h b/src/thirdparty/moodycamelqueue.h deleted file mode 100644 index d0d042f6b3..0000000000 --- a/src/thirdparty/moodycamelqueue.h +++ /dev/null @@ -1,5255 +0,0 @@ -// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free -// queue. An overview, including benchmark results, is provided here: -// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ -// The full design is also described in excruciating detail at: -// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue - -// Simplified BSD license: -// Copyright (c) 2013-2020, Cameron Desrochers. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
- -// Also dual-licensed under the Boost Software License (see LICENSE.md) - -#pragma once - -#if defined(__GNUC__) -// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and -// Traits::index_t are set to < 32 bits, causing integer promotion, causing -// warnings upon assigning any computed values) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -#ifdef MCDBGQ_USE_RELACY -#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" -#endif -#endif - -#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) -// VS2019 with /W4 warns about constant conditional expressions but unless -// /std=c++17 or higher does not support `if constexpr`, so we have no choice -// but to simply disable the warning -#pragma warning(push) -#pragma warning(disable : 4127) // conditional expression is constant -#endif - -#if defined(__APPLE__) -#include "TargetConditionals.h" -#endif - -#ifdef MCDBGQ_USE_RELACY -#include "relacy/relacy_std.hpp" -#include "relacy_shims.h" -// We only use malloc/free anyway, and the delete macro messes up `= delete` -// method declarations. We'll override the default trait malloc ourselves -// without a macro. -#undef new -#undef delete -#undef malloc -#undef free -#else -#include // Requires C++11. Sorry VS2010. -#include -#endif -#include -#include -#include // for CHAR_BIT -#include // for max_align_t -#include -#include -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading -#include -#include - -// Platform-specific definitions of a numeric thread ID type and an invalid -// value -namespace moodycamel -{ -namespace details -{ -template -struct thread_id_converter -{ - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const &x) - { - return x; - } -}; -} // namespace details -} // namespace moodycamel -#if defined(MCDBGQ_USE_RELACY) -namespace moodycamel -{ -namespace details -{ -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; -static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; -static inline thread_id_t thread_id() -{ - return rl::thread_index(); -} -} // namespace details -} // namespace moodycamel -#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) -// No sense pulling in windows.h in a header, we'll manually declare the -// function we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId( - void); -namespace moodycamel -{ -namespace details -{ -static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), - "Expected size of unsigned long to be 32 bits on Windows"); -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = - 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx -static const thread_id_t invalid_thread_id2 = - 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used - // in practice. Note that all Win32 thread IDs are presently - // multiples of 4. 
-static inline thread_id_t thread_id() -{ - return static_cast(::GetCurrentThreadId()); -} -} // namespace details -} // namespace moodycamel -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ - (defined(__APPLE__) && TARGET_OS_IPHONE) -namespace moodycamel -{ -namespace details -{ -static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, - "std::thread::id is expected to be either 4 or 8 bytes"); - -typedef std::thread::id thread_id_t; -static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - -// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have -// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined -// anyway, which it won't be. -static inline thread_id_t thread_id() -{ - return std::this_thread::get_id(); -} - -template -struct thread_id_size -{ -}; -template <> -struct thread_id_size<4> -{ - typedef std::uint32_t numeric_t; -}; -template <> -struct thread_id_size<8> -{ - typedef std::uint64_t numeric_t; -}; - -template <> -struct thread_id_converter -{ - typedef thread_id_size::numeric_t - thread_id_numeric_size_t; -#ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; -#else - typedef thread_id_numeric_size_t thread_id_hash_t; -#endif - - static thread_id_hash_t prehash(thread_id_t const &x) - { -#ifndef __APPLE__ - return std::hash()(x); -#else - return *reinterpret_cast(&x); -#endif - } -}; -} -} -#else -// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 -// In order to get a numeric thread ID in a platform-independent way, we use a -// thread-local static variable's address as a thread identifier :-) -#if defined(__GNUC__) || defined(__INTEL_COMPILER) -#define MOODYCAMEL_THREADLOCAL __thread -#elif defined(_MSC_VER) -#define MOODYCAMEL_THREADLOCAL __declspec(thread) -#else -// Assume C++11 compliant compiler -#define MOODYCAMEL_THREADLOCAL thread_local -#endif -namespace moodycamel -{ -namespace details -{ -typedef std::uintptr_t thread_id_t; -static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr -static const thread_id_t invalid_thread_id2 = - 1; // Member accesses off a null pointer are also generally invalid. Plus - // it's not aligned. -inline thread_id_t thread_id() -{ - static MOODYCAMEL_THREADLOCAL int x; - return reinterpret_cast(&x); -} -} -} -#endif - -// Constexpr if -#ifndef MOODYCAMEL_CONSTEXPR_IF -#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ - __cplusplus > 201402L -#define MOODYCAMEL_CONSTEXPR_IF if constexpr -#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] -#else -#define MOODYCAMEL_CONSTEXPR_IF if -#define MOODYCAMEL_MAYBE_UNUSED -#endif -#endif - -// Exceptions -#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ - (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ - (!defined(_MSC_VER) && !defined(__GNUC__)) -#define MOODYCAMEL_EXCEPTIONS_ENABLED -#endif -#endif -#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED -#define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) -#define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw(expr) -#else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) -#define MOODYCAMEL_CATCH(...) 
else MOODYCAMEL_CONSTEXPR_IF(false) -#define MOODYCAMEL_RETHROW -#define MOODYCAMEL_THROW(expr) -#endif - -#ifndef MOODYCAMEL_NOEXCEPT -#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) -#define MOODYCAMEL_NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true -#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 -// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when -// it shouldn't :-( We have to assume *all* non-trivial constructors may throw -// on VS2012! -#define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value && \ - std::is_move_constructible::value \ - ? std::is_trivially_move_constructible::value \ - : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value && \ - std::is_move_assignable::value \ - ? std::is_trivially_move_assignable::value || \ - std::is_nothrow_move_assignable::value \ - : std::is_trivially_copy_assignable::value || \ - std::is_nothrow_copy_assignable::value) && \ - MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) -#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 -#define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value && \ - std::is_move_constructible::value \ - ? std::is_trivially_move_constructible::value || \ - std::is_nothrow_move_constructible::value \ - : std::is_trivially_copy_constructible::value || \ - std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value && \ - std::is_move_assignable::value \ - ? std::is_trivially_move_assignable::value || \ - std::is_nothrow_move_assignable::value \ - : std::is_trivially_copy_assignable::value || \ - std::is_nothrow_copy_assignable::value) && \ - MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) -#else -#define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) -#endif -#endif - -#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED -#ifdef MCDBGQ_USE_RELACY -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED -#else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a -// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't -// support thread_local either. Finally, iOS/ARM doesn't have support for it -// either, and g++/ARM allows it to compile but it's unconfirmed to actually -// work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ - (!defined(__MINGW32__) && !defined(__MINGW64__) || \ - !defined(__WINPTHREADS_VERSION)) && \ - (!defined(__GNUC__) || __GNUC__ > 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ - (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ - !defined(_M_ARM) && !defined(__aarch64__) -// Assume `thread_local` is fully supported in all other C++11 -// compilers/platforms -//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now -// since several users report having problems with it on -#endif -#endif -#endif - -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link -// error will be generated if the function is called. 
-#ifndef MOODYCAMEL_DELETE_FUNCTION -#if defined(_MSC_VER) && _MSC_VER < 1800 -#define MOODYCAMEL_DELETE_FUNCTION -#else -#define MOODYCAMEL_DELETE_FUNCTION = delete -#endif -#endif - -namespace moodycamel -{ -namespace details -{ -#ifndef MOODYCAMEL_ALIGNAS -// VS2013 doesn't support alignas or alignof, and align() requires a constant -// literal -#if defined(_MSC_VER) && _MSC_VER <= 1800 -#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) -#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ - typename details::Vs2013Aligned::value, T>::type -template -struct Vs2013Aligned -{ -}; // default, unsupported alignment -template -struct Vs2013Aligned<1, T> -{ - typedef __declspec(align(1)) T type; -}; -template -struct Vs2013Aligned<2, T> -{ - typedef __declspec(align(2)) T type; -}; -template -struct Vs2013Aligned<4, T> -{ - typedef __declspec(align(4)) T type; -}; -template -struct Vs2013Aligned<8, T> -{ - typedef __declspec(align(8)) T type; -}; -template -struct Vs2013Aligned<16, T> -{ - typedef __declspec(align(16)) T type; -}; -template -struct Vs2013Aligned<32, T> -{ - typedef __declspec(align(32)) T type; -}; -template -struct Vs2013Aligned<64, T> -{ - typedef __declspec(align(64)) T type; -}; -template -struct Vs2013Aligned<128, T> -{ - typedef __declspec(align(128)) T type; -}; -template -struct Vs2013Aligned<256, T> -{ - typedef __declspec(align(256)) T type; -}; -#else -template -struct identity -{ - typedef T type; -}; -#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) -#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ - alignas(alignof(obj)) typename details::identity::type -#endif -#endif -} // namespace details -} // namespace moodycamel - -// TSAN can false report races in lock-free code. To enable TSAN to be used -// from projects that use this one, we can apply per-function compile-time -// suppression. See -// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer -#define MOODYCAMEL_NO_TSAN -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#undef MOODYCAMEL_NO_TSAN -#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) -#endif // TSAN -#endif // TSAN - -// Compiler-specific likely/unlikely hints -namespace moodycamel -{ -namespace details -{ -#if defined(__GNUC__) -static inline bool(likely)(bool x) -{ - return __builtin_expect((x), true); -} -static inline bool(unlikely)(bool x) -{ - return __builtin_expect((x), false); -} -#else -static inline bool(likely)(bool x) -{ - return x; -} -static inline bool(unlikely)(bool x) -{ - return x; -} -#endif -} // namespace details -} // namespace moodycamel - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG -#include "internal/concurrentqueue_internal_debug.h" -#endif - -namespace moodycamel -{ -namespace details -{ -template -struct const_numeric_max -{ - static_assert(std::is_integral::value, - "const_numeric_max can only be used with integers"); - static const T value = - std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - - static_cast(1) - : static_cast(-1); -}; - -#if defined(__GLIBCXX__) -typedef ::max_align_t - std_max_align_t; // libstdc++ forgot to add it to std:: for a while -#else -typedef std::max_align_t std_max_align_t; // Others (e.g. 
MSVC) insist it can - // *only* be accessed via std:: -#endif - -// Some platforms have incorrectly set max_align_t to a type with <8 bytes -// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit -// iOS). Work around this with our own union. See issue #64. -typedef union -{ - std_max_align_t x; - long long y; - void *z; -} max_align_t; -} // namespace details - -// Default traits for the ConcurrentQueue. To change some of the -// traits without re-implementing all of them, inherit from this -// struct and shadow the declarations you wish to be different; -// since the traits are used as a template type parameter, the -// shadowed declarations will be used where defined, and the defaults -// otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of - // elements you expect to hold at once, especially if you have a high - // turnover rate; for example, on 32-bit x86, if you expect to have over a - // hundred million elements or pump several million elements through your - // queue in a very short space of time, using a 32-bit type *may* trigger a - // race condition. A 64-bit int type is recommended in that case, and in - // practice will prevent a race condition no matter the usage of the queue. - // Note that whether the queue is lock-free with a 64-int type depends on - // the whether std::atomic is lock-free, which is - // platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few - // elements but many producers, a smaller block size should be favoured. For - // few producers and/or many elements, a larger block size is preferred. A - // sane default is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per - // element. For large block sizes, this is too inefficient, and switching to - // an atomic counter-based approach is faster. The switch is made for block - // sizes strictly larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This - // should reflect that number's maximum for optimal performance. Must be a - // power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This - // should reflect that number's maximum for optimal performance. Must be a - // power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit - // producers. Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit - // production (using the enqueue methods without an explicit producer token) - // is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a - // token) must consume before it causes all consumers to rotate and move on - // to the next internal queue. 
- static const std::uint32_t - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a - // sub-queue. Enqueue operations that would cause this limit to be surpassed - // will fail. Note that this limit is enforced at the block level (for - // performance reasons), i.e. it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = - details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try - // 0-100). Only affects instances of the BlockingConcurrentQueue. - static const int MAX_SEMA_SPINS = 10000; - -#ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like - // std::malloc. -#if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void *WORKAROUND_malloc(size_t size) - { - return malloc(size); - } - static inline void WORKAROUND_free(void *ptr) - { - return free(ptr); - } - static inline void *(malloc) (size_t size) - { - return WORKAROUND_malloc(size); - } - static inline void(free)(void *ptr) - { - return WORKAROUND_free(ptr); - } -#else - static inline void *malloc(size_t size) - { - return std::malloc(size); - } - static inline void free(void *ptr) - { - return std::free(ptr); - } -#endif -#else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void *malloc(size_t size) - { - return rl::rl_malloc(size, $); - } - static inline void free(void *ptr) - { - return rl::rl_free(ptr, $); - } -#endif -}; - -// When producing or consuming many elements, the most efficient way is to: -// 1) Use one of the bulk-operation methods of the queue with a token -// 2) Failing that, use the bulk-operation methods without a token -// 3) Failing that, create a token and use that with the single-item methods -// 4) Failing that, use the single-parameter methods of the queue -// Having said that, don't create tokens willy-nilly -- ideally there should be -// a maximum of one token per thread (of each kind). 
-struct ProducerToken; -struct ConsumerToken; - -template -class ConcurrentQueue; -template -class BlockingConcurrentQueue; -class ConcurrentQueueTests; - -namespace details -{ -struct ConcurrentQueueProducerTypelessBase -{ - ConcurrentQueueProducerTypelessBase *next; - std::atomic inactive; - ProducerToken *token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } -}; - -template -struct _hash_32_or_64 -{ - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see - // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is - // propagate that uniqueness evenly across all the bits, so that we can - // use a subset of the bits while reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } -}; -template <> -struct _hash_32_or_64<1> -{ - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } -}; -template -struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> -{ -}; - -static inline size_t hash_thread_id(thread_id_t id) -{ - static_assert( - sizeof(thread_id_t) <= 8, - "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast( - hash_32_or_64::thread_id_hash_t)>:: - hash(thread_id_converter::prehash(id))); -} - -template -static inline bool circular_less_than(T a, T b) -{ -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4554) -#endif - static_assert( - std::is_integral::value && !std::numeric_limits::is_signed, - "circular_less_than is intended to be used only with unsigned integer " - "types"); - return static_cast(a - b) > - static_cast(static_cast(1) - << static_cast(sizeof(T) * CHAR_BIT - 1)); -#ifdef _MSC_VER -#pragma warning(pop) -#endif -} - -template -static inline char *align_for(char *ptr) -{ - const std::size_t alignment = std::alignment_of::value; - return ptr + - (alignment - (reinterpret_cast(ptr) % alignment)) % - alignment; -} - -template -static inline T ceil_to_pow_2(T x) -{ - static_assert( - std::is_integral::value && !std::numeric_limits::is_signed, - "ceil_to_pow_2 is intended to be used only with unsigned integer " - "types"); - - // Adapted from - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) - { - x |= x >> (i << 3); - } - ++x; - return x; -} - -template -static inline void swap_relaxed(std::atomic &left, std::atomic &right) -{ - T temp = std::move(left.load(std::memory_order_relaxed)); - left.store(std::move(right.load(std::memory_order_relaxed)), - std::memory_order_relaxed); - right.store(std::move(temp), std::memory_order_relaxed); -} - -template -static inline T const &nomove(T const &x) -{ - return x; -} - -template -struct nomove_if -{ - template - static inline T const &eval(T const &x) - { - return x; - } -}; - -template <> -struct nomove_if -{ - template - static inline auto eval(U &&x) -> decltype(std::forward(x)) - { - return std::forward(x); - } -}; - -template -static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT->decltype(*it) -{ - return *it; -} - -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) -template -struct is_trivially_destructible : 
std::is_trivially_destructible -{ -}; -#else -template -struct is_trivially_destructible : std::has_trivial_destructor -{ -}; -#endif - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED -#ifdef MCDBGQ_USE_RELACY -typedef RelacyThreadExitListener ThreadExitListener; -typedef RelacyThreadExitNotifier ThreadExitNotifier; -#else -struct ThreadExitListener -{ - typedef void (*callback_t)(void *); - callback_t callback; - void *userData; - - ThreadExitListener *next; // reserved for use by the ThreadExitNotifier -}; - -class ThreadExitNotifier -{ -public: - static void subscribe(ThreadExitListener *listener) - { - auto &tlsInst = instance(); - listener->next = tlsInst.tail; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener *listener) - { - auto &tlsInst = instance(); - ThreadExitListener **prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) - { - if (ptr == listener) - { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - -private: - ThreadExitNotifier() : tail(nullptr) - { - } - ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier &operator=(ThreadExitNotifier const &) - MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && - "If this assert fails, you likely have a buggy compiler! Change " - "the preprocessor conditions such that " - "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) - { - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier &instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - -private: - ThreadExitListener *tail; -}; -#endif -#endif - -template -struct static_is_lock_free_num -{ - enum - { - value = 0 - }; -}; -template <> -struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_CHAR_LOCK_FREE - }; -}; -template <> -struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_SHORT_LOCK_FREE - }; -}; -template <> -struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_INT_LOCK_FREE - }; -}; -template <> -struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_LONG_LOCK_FREE - }; -}; -template <> -struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_LLONG_LOCK_FREE - }; -}; -template -struct static_is_lock_free - : static_is_lock_free_num::type> -{ -}; -template <> -struct static_is_lock_free -{ - enum - { - value = ATOMIC_BOOL_LOCK_FREE - }; -}; -template -struct static_is_lock_free -{ - enum - { - value = ATOMIC_POINTER_LOCK_FREE - }; -}; -} // namespace details - -struct ProducerToken -{ - template - explicit ProducerToken(ConcurrentQueue &queue); - - template - explicit ProducerToken(BlockingConcurrentQueue &queue); - - ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) - { - producer->token = this; - } - } - - inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) - { - producer->token = this; - } - if (other.producer != nullptr) - { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // 
(Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. - inline bool valid() const - { - return producer != nullptr; - } - - ~ProducerToken() - { - if (producer != nullptr) - { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; - -private: - template - friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -protected: - details::ConcurrentQueueProducerTypelessBase *producer; -}; - -struct ConsumerToken -{ - template - explicit ConsumerToken(ConcurrentQueue &q); - - template - explicit ConsumerToken(BlockingConcurrentQueue &q); - - ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT - : initialOffset(other.initialOffset), - lastKnownGlobalOffset(other.lastKnownGlobalOffset), - itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), - currentProducer(other.currentProducer), - desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; - -private: - template - friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase *currentProducer; - details::ConcurrentQueueProducerTypelessBase *desiredProducer; -}; - -// Need to forward-declare this swap because it's in a namespace. 
-// See -// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, - typename ConcurrentQueue::ImplicitProducerKVP &b) - MOODYCAMEL_NOEXCEPT; - -template -class ConcurrentQueue -{ -public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = - static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = - static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = - static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = - static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = - static_cast( - Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4307) // + integral constant overflow (that's what - // the ternary expression is for!) -#pragma warning(disable : 4309) // static_cast: Truncation of constant value -#endif - static const size_t MAX_SUBQUEUE_SIZE = - (details::const_numeric_max::value - - static_cast(Traits::MAX_SUBQUEUE_SIZE) < - BLOCK_SIZE) - ? details::const_numeric_max::value - : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + - (BLOCK_SIZE - 1)) / - BLOCK_SIZE * BLOCK_SIZE); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - - static_assert(!std::numeric_limits::is_signed && - std::is_integral::value, - "Traits::size_t must be an unsigned integral type"); - static_assert(!std::numeric_limits::is_signed && - std::is_integral::value, - "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), - "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), - "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && - !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & - (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), - "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " - "power of 2 (and greater than 1)"); - static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && - !(EXPLICIT_INITIAL_INDEX_SIZE & - (EXPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " - "(and greater than 1)"); - static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && - !(IMPLICIT_INITIAL_INDEX_SIZE & - (IMPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " - "(and greater than 1)"); - static_assert( - (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || - !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & - (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || - INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at " - "least 1 (or 0 to disable implicit enqueueing)"); - -public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be 
inserted without additional memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be - // allocated up-front, which means only a single producer will be able to - // enqueue elements without an extra allocation -- blocks aren't shared - // between producers). This method is not thread safe -- it is up to the - // user to ensure that the queue is fully constructed before it starts being - // used by other threads (this includes making the memory effects of - // construction visible, possibly with a memory barrier). - explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list( - capacity / BLOCK_SIZE + - ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. - ConcurrentQueue(size_t minCapacity, - size_t maxExplicitProducers, - size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * - (maxExplicitProducers + 1) + - 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. 
- ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) - { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) - { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) - { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) - { - auto prev = hash->prev; - if (prev != nullptr) - { // The last hash is part of this object and was not allocated - // dynamically - for (size_t i = 0; i != hash->capacity; ++i) - { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) - { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) - { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue &operator=(ConcurrentQueue const &) - MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. - // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT - : producerListTail( - other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex( - other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId( - other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( - std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty - // queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store( - other.explicitProducers.load(std::memory_order_relaxed), - std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store( - other.implicitProducers.load(std::memory_order_relaxed), - std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue &operator=(ConcurrentQueue 
&&other) - MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - -private: - ConcurrentQueue &swap_internal(ConcurrentQueue &other) - { - if (this == &other) - { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, - other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, - other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, - other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); -#endif - - return *this; - } - -public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because - // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const &item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because - // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T &&item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const &token, T const &item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit - // producer token. Allocates memory if required. Only fails if memory - // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would - // be surpassed). Thread-safe. - inline bool enqueue(producer_token_t const &token, T &&item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because - // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: - // Use std::make_move_iterator if the elements should be moved instead of - // copied. 
Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or - // implicit production is disabled because - // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). Thread-safe. - inline bool try_enqueue(T const &item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T &&item) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const &token, T const &item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit - // producer token. Does not allocate memory. Fails if not enough room to - // enqueue. Thread-safe. - inline bool try_enqueue(producer_token_t const &token, T &&item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(producer_token_t const &token, - It itemFirst, - size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U &item) - { - // Instead of simply trying each producer in turn (which could cause - // needless contention on the first producer), we score them - // heuristically. 
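+        // For example (illustrative numbers only): if the first non-empty
+        // producers inspected report approximate sizes 2, 9 and 4, the scan
+        // below stops after those three, attempts the producer reporting 9
+        // first, and only if that dequeue comes up empty falls back to
+        // trying every producer in the list.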
- size_t nonEmptyCount = 0; - ProducerBase *best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); - nonEmptyCount < 3 && ptr != nullptr; - ptr = ptr->next_prod()) - { - auto size = ptr->size_approx(); - if (size > 0) - { - if (size > bestSize) - { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the - // time we try to dequeue from it, we need to make sure every queue's - // been tried - if (nonEmptyCount > 0) - { - if ((details::likely)(best->dequeue(item))) - { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - if (ptr != best && ptr->dequeue(item)) - { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall - // throughput under contention, but will give more predictable results in - // single-threaded consumer scenarios. This is mostly only useful for - // internal unit tests. Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U &item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - if (ptr->dequeue(item)) - { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
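+    // Usage sketch (illustrative; `q`, `ctok` and `item` are example names,
+    // not part of this header). A consumer thread typically creates one
+    // ConsumerToken up front and reuses it for every dequeue attempt:
+    //
+    //     moodycamel::ConcurrentQueue<int> q;
+    //     moodycamel::ConsumerToken ctok(q);
+    //     int item;
+    //     while (q.try_dequeue(ctok, item)) {
+    //         // process `item`
+    //     }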
- template - bool try_dequeue(consumer_token_t &token, U &item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the - // global offset) -> this means the highest efficiency consumer dictates - // the rotation speed of everyone else, more or less If you see that the - // global offset has changed, you must reset your consumption counter - // and move to your designated place If there's no items where you're - // supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to - // consume, move over from your current position until you find an - // producer with something in it - - if (token.desiredProducer == nullptr || - token.lastKnownGlobalOffset != - globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - if (!update_current_producer_after_rotation(token)) - { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the - // time we try to dequeue from it, we need to make sure every queue's - // been tried - if (static_cast(token.currentProducer)->dequeue(item)) - { - if (++token.itemsConsumedFromCurrent == - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) - { - globalExplicitConsumerOffset.fetch_add( - 1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = - static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) - { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) - { - if (ptr->dequeue(item)) - { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) - { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) - { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit - // consumer token. Returns the number of items actually dequeued. Returns 0 - // if all producer streams appeared empty at the time they were checked (so, - // the queue is likely but not guaranteed to be empty). Never allocates. - // Thread-safe. 
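+    // Usage sketch (illustrative; the buffer size and names are arbitrary):
+    //
+    //     moodycamel::ConcurrentQueue<int> q;
+    //     moodycamel::ConsumerToken ctok(q);
+    //     int buf[32];
+    //     size_t n = q.try_dequeue_bulk(ctok, buf, 32);
+    //     for (size_t i = 0; i != n; ++i) {
+    //         // process buf[i]
+    //     }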
- template - size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || - token.lastKnownGlobalOffset != - globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - if (!update_current_producer_after_rotation(token)) - { - return 0; - } - } - - size_t count = static_cast(token.currentProducer) - ->dequeue_bulk(itemFirst, max); - if (count == max) - { - if ((token.itemsConsumedFromCurrent += static_cast( - max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) - { - globalExplicitConsumerOffset.fetch_add( - 1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = - static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) - { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) - { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) - { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = - static_cast(dequeued); - } - if (dequeued == max) - { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) - { - ptr = tail; - } - } - return count; - } - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const &producer, - U &item) - { - return static_cast(producer.producer) - ->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner - // queue. Returns the number of items actually dequeued. If you happen to - // know which producer you want to dequeue from, this is significantly - // faster than using the general-case try_dequeue methods. Returns 0 if the - // producer's queue appeared empty at the time it was checked (so, the queue - // is likely but not guaranteed to be empty). Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer( - producer_token_t const &producer, It itemFirst, size_t max) - { - return static_cast(producer.producer) - ->dequeue_bulk(itemFirst, max); - } - - // Returns an estimate of the total number of elements currently in the - // queue. This estimate is only accurate if the queue has completely - // stabilized before it is called (i.e. all enqueue and dequeue operations - // have completed and their memory effects are visible on the calling - // thread, and no further operations start while this method is being - // called). Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - size += ptr->size_approx(); - } - return size; - } - - bool is_empty() const - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - if (ptr->size_approx() > 0) - { - return false; - } - } - - return true; - } - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
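+    // For example (illustrative): since this is a static member it can be
+    // checked once at startup, without constructing a queue:
+    //
+    //     if (!moodycamel::ConcurrentQueue<int>::is_lock_free()) {
+    //         // fall back to a mutex-based queue, log a warning, etc.
+    //     }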
- static bool is_lock_free() - { - return details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free< - typename details::thread_id_converter:: - thread_id_numeric_size_t>::value == 2; - } - -private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode - { - CanAlloc, - CannotAlloc - }; - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const &token, U &&element) - { - return static_cast(token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue( - std::forward(element)); - } - - template - inline bool inner_enqueue(U &&element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr - ? false - : producer->ConcurrentQueue::ImplicitProducer:: - template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const &token, - It itemFirst, - size_t count) - { - return static_cast(token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk< - canAlloc>(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr - ? false - : producer->ConcurrentQueue::ImplicitProducer:: - template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t &token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) - { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = - globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) - { - // Aha, first time we're dequeueing anything. 
- // Figure out our local position - // Note: offset is from start, not end, but we're traversing from - // end -- subtract from count first - std::uint32_t offset = - prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) - { - token.desiredProducer = - static_cast(token.desiredProducer) - ->next_prod(); - if (token.desiredProducer == nullptr) - { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) - { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) - { - token.desiredProducer = - static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) - { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode - { - FreeListNode() : freeListRefs(0), freeListNext(nullptr) - { - } - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. Not the fastest thing in the - // world under heavy contention, but simple and correct (assuming nodes are - // never freed until after the free list is destroyed), and fairly speedy - // under low contention. - template // N must inherit FreeListNode or have the same - // fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) - { - } - FreeList(FreeList &&other) - : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) - { - other.freeListHead.store(nullptr, std::memory_order_relaxed); - } - void swap(FreeList &other) - { - details::swap_relaxed(freeListHead, other.freeListHead); - } - - FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; - FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N *node) - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so - // it's safe to set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, - std::memory_order_acq_rel) == 0) - { - // Oh look! We were the last ones referencing this node, and we - // know we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N *try_get() - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) - { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || - !head->freeListRefs.compare_exchange_strong( - refs, - refs + 1, - std::memory_order_acquire, - std::memory_order_relaxed)) - { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at - // zero), which means we can read the next and not worry about - // it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong( - head, - next, - std::memory_order_acquire, - std::memory_order_relaxed)) - { - // Yay, got the node. 
This means it was on the list, which - // means shouldBeOnFreeList must be false no matter the - // refcount (because nobody else knows it's been taken off - // yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & - SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for - // the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to - // decrease the refcount we increased. Note that we don't need - // to release any memory effects, but we do need to ensure that - // the reference count decrement happens-after the CAS on the - // head. - refs = prevHead->freeListRefs.fetch_sub( - 1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) - { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to - // destroy remaining nodes) - N *head_unsafe() const - { - return freeListHead.load(std::memory_order_relaxed); - } - - private: - inline void add_knowing_refcount_is_zero(N *node) - { - // Since the refcount is zero, and nobody can increase it once it's - // zero (except us, and we run only one copy of this method per node - // at a time, i.e. the single thread case), then we know we can - // safely change the next pointer of the node; however, once the - // refcount is back above zero, then other threads could increase it - // (happens under heavy contention, when the refcount goes to zero - // in between a load and a refcount increment of a node in try_get, - // then back up to something non-zero, then the refcount increment - // is done by the other thread) -- so, if the CAS to add the node to - // the actual list fails, decrease the refcount and leave the add - // operation to the next thread who puts the refcount back at zero - // (which could be us, hence the loop). 
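+            // One possible interleaving, with illustrative numbers: two
+            // try_get() calls each take a reference (freeListRefs == 2), then
+            // add() sets the flag (freeListRefs == 0x80000002, i.e.
+            // SHOULD_BE_ON_FREELIST plus two outstanding references). Each
+            // failed try_get() releases with fetch_sub(1); the caller whose
+            // fetch_sub returns 0x80000001 (== SHOULD_BE_ON_FREELIST + 1) is
+            // the last holder and performs the deferred add via this method.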
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) - { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong( - head, - node, - std::memory_order_release, - std::memory_order_relaxed)) - { - // Hmm, the add failed, but we can only try again when the - // refcount goes back to zero - if (node->freeListRefs.fetch_add( - SHOULD_BE_ON_FREELIST - 1, - std::memory_order_release) == 1) - { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes - // are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext - { - implicit_context = 0, - explicit_context = 1 - }; - - struct Block - { - Block() - : next(nullptr), - elementsCompletelyDequeued(0), - freeListRefs(0), - freeListNext(nullptr), - shouldBeOnFreeList(false), - dynamicallyAllocated(true) - { -#ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && - BLOCK_SIZE <= - EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) - { - if (!emptyFlags[i].load(std::memory_order_relaxed)) - { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that - // happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else - { - // Check counter - if (elementsCompletelyDequeued.load( - std::memory_order_relaxed) == BLOCK_SIZE) - { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load( - std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit - // context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && - BLOCK_SIZE <= - EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - - static_cast(i & static_cast( - BLOCK_SIZE - 1))] - .load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - - static_cast( - i & static_cast(BLOCK_SIZE - 1))] - .store(true, std::memory_order_release); - return false; - } - else - { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add( - 1, std::memory_order_release); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no - // wrapping and count > 0). Returns true if the block is now empty (does - // not apply in explicit context). 
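+        // Illustrative index math (assuming BLOCK_SIZE == 32 purely for the
+        // example): an item at global index 70 has offset 70 & 31 == 6 within
+        // its block, so set_empty() above flips emptyFlags[31 - 6], i.e.
+        // emptyFlags[25]. For set_many_empty(i == 64, count == 3) below, the
+        // items at offsets 0, 1 and 2 map to flags 31, 30 and 29, and the loop
+        // writes them in ascending slot order as emptyFlags[29..31].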
- template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, - size_t count) - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && - BLOCK_SIZE <= - EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - - static_cast(i & - static_cast(BLOCK_SIZE - 1)) - - count + 1; - for (size_t j = 0; j != count; ++j) - { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else - { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add( - count, std::memory_order_release); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && - BLOCK_SIZE <= - EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) - { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else - { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, - std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && - BLOCK_SIZE <= - EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) - { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) - { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else - { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT - { - return static_cast(static_cast(elements)) + - static_cast(idx & - static_cast(BLOCK_SIZE - 1)); - } - inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT - { - return static_cast(static_cast(elements)) + - static_cast(idx & - static_cast(BLOCK_SIZE - 1)); - } - - private: - static_assert(std::alignment_of::value <= sizeof(T), - "The queue does not support types with an alignment " - "greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - - public: - Block *next; - std::atomic elementsCompletelyDequeued; - std::atomic - emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - ? 
BLOCK_SIZE - : 1]; - - public: - std::atomic freeListRefs; - std::atomic freeListNext; - std::atomic shouldBeOnFreeList; - bool dynamicallyAllocated; // Perhaps a better name for this would be - // 'isNotPartOfInitialBlockPool' - -#ifdef MCDBGQ_TRACKMEM - void *owner; -#endif - }; - static_assert(std::alignment_of::value >= - std::alignment_of::value, - "Internal error: Blocks must be at least as aligned as the " - "type they are wrapping"); - -#ifdef MCDBGQ_TRACKMEM -public: - struct MemStats; - -private: -#endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) - : tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() - { - } - - template - inline bool dequeue(U &element) - { - if (isExplicit) - { - return static_cast(this)->dequeue(element); - } - else - { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It &itemFirst, size_t max) - { - if (isExplicit) - { - return static_cast(this)->dequeue_bulk( - itemFirst, max); - } - else - { - return static_cast(this)->dequeue_bulk( - itemFirst, max); - } - } - - inline ProducerBase *next_prod() const - { - return static_cast(next); - } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) - ? static_cast(tail - head) - : 0; - } - - inline index_t getTail() const - { - return tailIndex.load(std::memory_order_relaxed); - } - - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block *tailBlock; - - public: - bool isExplicit; - ConcurrentQueue *parent; - - protected: -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue *parent_) - : ProducerBase(parent_, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = - details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) - { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index( - 0); // This creates an index with double the number of current - // entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). 
- if (this->tailBlock != nullptr) - { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block *halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & - static_cast(BLOCK_SIZE - 1)) != 0) - { - // The head's not on a block boundary, meaning a block - // somewhere is partially dequeued (or the head block is the - // tail block and was fully dequeued, but the head/tail are - // still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & - (pr_blockIndexSize - 1); - while (details::circular_less_than( - pr_blockIndexEntries[i].base + BLOCK_SIZE, - this->headIndex.load(std::memory_order_relaxed))) - { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than( - pr_blockIndexEntries[i].base, - this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop - // gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do - { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty< - explicit_context>()) - { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) - { - i = static_cast( - this->headIndex.load(std::memory_order_relaxed) & - static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the - // tail block, we need to stop when we reach the tail index - auto lastValidIndex = - (this->tailIndex.load(std::memory_order_relaxed) & - static_cast(BLOCK_SIZE - 1)) == 0 - ? BLOCK_SIZE - : static_cast( - this->tailIndex.load( - std::memory_order_relaxed) & - static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && - (block != this->tailBlock || i != lastValidIndex)) - { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) - { - auto block = this->tailBlock; - do - { - auto nextBlock = block->next; - if (block->dynamicallyAllocated) - { - destroy(block); - } - else - { - this->parent->add_block_to_free_list(block); - } - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) - { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U &&element) - { - index_t currentTailIndex = - this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) - { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && - this->tailBlock->next->ConcurrentQueue::Block:: - template is_empty()) - { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block:: - template reset_empty(); - - // We'll put the block on the block index (guaranteed to be - // room since we're conceptually removing the last block - // from it first -- except instead of removing then adding, - // we can just overwrite). 
Note that there must be a valid - // block index here, since even if allocation failed in the - // ctor, it would have been re-attempted when adding the - // first block to the queue; since there is such a block, a - // block index must have been successfully allocated. - } - else - { - // Whatever head value we see here is >= the last value we - // saw here (relatively), and <= its current value. Since we - // have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than( - currentTailIndex, head)); - if (!details::circular_less_than( - head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != - details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || - MAX_SUBQUEUE_SIZE - BLOCK_SIZE < - currentTailIndex - head))) - { - // We can't enqueue in another block because there's not - // enough leeway -- the tail could surpass the head by - // the time the block fills up! (Or we'll exceed the - // size limit, if the second part of the condition was - // true.) - return false; - } - // We're going to need a new block; check that the block - // index has room - if (pr_blockIndexRaw == nullptr || - pr_blockIndexSlotsUsed == pr_blockIndexSize) - { - // Hmm, the circular block index is already full -- - // we'll need to allocate a new index. Note - // pr_blockIndexRaw can only be nullptr if the initial - // allocation failed in the constructor. - - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) - { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue:: - template requisition_block(); - if (newBlock == nullptr) - { - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty< - explicit_context>(); - if (this->tailBlock == nullptr) - { - newBlock->next = newBlock; - } - else - { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, - U, - new (static_cast(nullptr)) - T(std::forward(element)))) - { - // The constructor may throw. We want the element not to - // appear in the queue in that case (without corrupting the - // queue): - MOODYCAMEL_TRY - { - new ((*this->tailBlock)[currentTailIndex]) - T(std::forward(element)); - } - MOODYCAMEL_CATCH(...) - { - // Revert change to the current block, but leave the new - // block available for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? 
this->tailBlock - : startBlock; - MOODYCAMEL_RETHROW; - } - } - else - { - (void) startBlock; - (void) originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto &entry = blockIndex.load(std::memory_order_relaxed) - ->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed) - ->front.store(pr_blockIndexFront, - std::memory_order_release); - pr_blockIndexFront = - (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, - U, - new (static_cast(nullptr)) - T(std::forward(element)))) - { - this->tailIndex.store(newTailIndex, - std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) - T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U &element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = - this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than( - this->dequeueOptimisticCount.load( - std::memory_order_relaxed) - - overcommit, - tail)) - { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the - // common case when the queue is empty and the values are - // eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, - // they are not going to change (unless we change them) and must - // be the same value at this point (inside the if) as when the - // if condition was evaluated. - - // We insert an acquire fence here to synchronize-with the - // release upon incrementing dequeueOvercommit below. This - // ensures that whatever the value we got loaded into - // overcommit, the load of dequeueOptisticCount in the fetch_add - // below will result in a value at least as recent as that (and - // therefore at least as large). Note that I believe a compiler - // (signal) fence here would be sufficient due to the nature of - // fetch_add (all read-modify-write operations are guaranteed to - // work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only - // the C++11 standard. See - // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the - // boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( - 1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= - // dequeueOptimisticCount (because dequeueOvercommit is only - // ever incremented after dequeueOptimisticCount -- this is - // enforced in the `else` block below), and since we now have a - // version of dequeueOptimisticCount that is at least as recent - // as overcommit (due to the release upon incrementing - // dequeueOvercommit and the acquire above that synchronizes - // with it), overcommit <= myDequeueCount. However, we can't - // assert this since both dequeueOptimisticCount and - // dequeueOvercommit may (independently) overflow; in such a - // case, though, the logic still holds since the difference - // between the two is maintained. 
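+            // A sketch of that wrap-around argument with an artificially
+            // narrow 8-bit counter (illustrative only): after 259 optimistic
+            // increments the stored dequeueOptimisticCount is 3, and with
+            // dequeueOvercommit == 250 the unsigned subtraction 3 - 250 still
+            // yields 9 == 259 - 250, so the comparison against tail behaves
+            // exactly as if no overflow had occurred.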
- - // Note that we reload tail here in case it changed; it will be - // the same value as before or greater, since this load is - // sequenced after (happens after) the earlier load above. This - // is supported by read-read coherency (as defined in the - // standard), explained here: - // http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than( - myDequeueCount - overcommit, tail))) - { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be - // at least one element, this will never exceed tail. We - // need to do an acquire-release fence here since it's - // possible that whatever condition got us to this point was - // for an earlier enqueued element (that we already see the - // memory effects for), but that by the time we increment - // somebody else has incremented it, and we need to see the - // memory effects for *that* element, which is in such a - // case is necessarily visible on the thread that - // incremented it in the first place with the more current - // condition (they must have acquired a tail that is at - // least as recent). - auto index = - this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - - auto localBlockIndex = - blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = - localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing - // because of index wrap-around. When an index wraps, we - // need to preserve the sign of the offset when dividing it - // by the block size (in order to get a correct signed block - // count offset in all cases): - auto headBase = - localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = - index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast( - static_cast::type>( - blockBaseIndex - headBase) / - BLOCK_SIZE); - auto block = localBlockIndex - ->entries[(localBlockIndexHead + offset) & - (localBlockIndex->size - 1)] - .block; - - // Dequeue - auto &el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN( - T, T &&, element = std::move(el))) - { - // Make sure the element is still fully dequeued and - // destroyed even if the assignment throws - struct Guard - { - Block *block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block:: - template set_empty(index); - } - } guard = {block, index}; - - element = std::move(el); // NOLINT - } - else - { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty< - explicit_context>(index); - } - - return true; - } - else - { - // Wasn't anything to dequeue after all; make the effective - // dequeue count eventually consistent - this->dequeueOvercommit.fetch_add( - 1, - std::memory_order_release); // Release so that the - // fetch_add on - // dequeueOptimisticCount - // is guaranteed to happen - // before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of - // the elements; this means pre-allocating blocks and putting them - // in the block index (but only if all the allocations succeeded). 
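+            // Worked example of the sizing below (BLOCK_SIZE == 32 assumed
+            // purely for illustration): with startTailIndex == 40 and
+            // count == 100, blockBaseDiff == (139 & ~31) - (39 & ~31)
+            // == 128 - 32 == 96, i.e. three new blocks (bases 64, 96 and 128)
+            // are needed on top of the tail block that already covers
+            // indices 32..63.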
- index_t startTailIndex = - this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block *firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = - ((startTailIndex + count - 1) & - ~static_cast(BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = - (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) - { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && - this->tailBlock->next != firstAllocatedBlock && - this->tailBlock->next->ConcurrentQueue::Block:: - template is_empty()) - { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr - ? this->tailBlock - : firstAllocatedBlock; - - auto &entry = blockIndex.load(std::memory_order_relaxed) - ->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = - (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) - { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than( - currentTailIndex, head)); - bool full = - !details::circular_less_than( - head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != - details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || - MAX_SUBQUEUE_SIZE - BLOCK_SIZE < - currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || - pr_blockIndexSlotsUsed == pr_blockIndexSize || full) - { - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - // Failed to allocate, undo changes (but keep - // injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = - originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? firstAllocatedBlock - : startBlock; - return false; - } - else if (full || - !new_block_index(originalBlockIndexSlotsUsed)) - { - // Failed to allocate, undo changes (but keep - // injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = - originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? firstAllocatedBlock - : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, - // so we need to update our fallback value too (since we - // keep the new index even if we later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue:: - template requisition_block(); - if (newBlock == nullptr) - { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? 
firstAllocatedBlock - : startBlock; - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty< - explicit_context>(); - if (this->tailBlock == nullptr) - { - newBlock->next = newBlock; - } - else - { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr - ? this->tailBlock - : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto &entry = blockIndex.load(std::memory_order_relaxed) - ->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = - (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's - // emptiness before we fill them up, and publish the new block - // index front - auto block = firstAllocatedBlock; - while (true) - { - block->ConcurrentQueue::Block::template reset_empty< - explicit_context>(); - if (block == this->tailBlock) - { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, - decltype(*itemFirst), - new (static_cast(nullptr)) - T(details::deref_noexcept(itemFirst)))) - { - blockIndex.load(std::memory_order_relaxed) - ->front.store( - (pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), - std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != - 0 || - firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && - firstAllocatedBlock != nullptr) - { - this->tailBlock = firstAllocatedBlock; - } - while (true) - { - index_t stopIndex = - (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, - stopIndex)) - { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, - decltype(*itemFirst), - new (static_cast(nullptr)) - T(details::deref_noexcept(itemFirst)))) - { - while (currentTailIndex != stopIndex) - { - new ((*this->tailBlock)[currentTailIndex++]) - T(*itemFirst++); - } - } - else - { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) - { - // Must use copy constructor even if move - // constructor is available because we may have to - // revert if there's an exception. Sorry about the - // horrible templated next line, but it was the only - // way to disable moving *at compile time*, which is - // important because a type may only define a - // (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if - // branch that will never be executed - new ((*this->tailBlock)[currentTailIndex]) - T(details::nomove_if(nullptr)) - T(details::deref_noexcept( - itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH(...) - { - // Oh dear, an exception's been thrown -- destroy the - // elements that were enqueued so far and revert the - // entire bulk operation (we'll keep any allocated - // blocks in our linked list for later, though). 
- auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? firstAllocatedBlock - : startBlock; - - if (!details::is_trivially_destructible::value) - { - auto block = startBlock; - if ((startTailIndex & - static_cast(BLOCK_SIZE - 1)) == 0) - { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) - { - stopIndex = - (currentTailIndex & - ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than( - constructedStopIndex, stopIndex)) - { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) - { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) - { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) - { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( - T, - decltype(*itemFirst), - new (static_cast(nullptr)) - T(details::deref_noexcept(itemFirst)))) - { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed) - ->front.store( - (pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), - std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It &itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = - this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast( - tail - - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - - overcommit)); - if (details::circular_less_than(0, desiredCount)) - { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( - desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = - static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) - { - actualCount = - desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) - { - this->dequeueOvercommit.fetch_add( - desiredCount - actualCount, - std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed - // to be at least actualCount elements, this will never - // exceed tail. 
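+                // Illustrative note on the element-copying branch further
+                // below: if assigning a moved T into *itemFirst cannot throw
+                // (e.g. T is nothrow-move-assignable and itemFirst is a plain
+                // T*), the noexcept check selects the simple loop with no
+                // try/catch bookkeeping; a caller can confirm its element type
+                // qualifies with
+                //     static_assert(std::is_nothrow_move_assignable<T>::value,
+                //                   "element type should be nothrow-move-assignable "
+                //                   "for the fast bulk dequeue path");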
- auto firstIndex = this->headIndex.fetch_add( - actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = - blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = - localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = - localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = - firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast( - static_cast::type>( - firstBlockBaseIndex - headBase) / - BLOCK_SIZE); - auto indexIndex = (localBlockIndexHead + offset) & - (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do - { - auto firstIndexInBlock = index; - index_t endIndex = - (index & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = - details::circular_less_than( - firstIndex + static_cast(actualCount), - endIndex) - ? firstIndex + static_cast(actualCount) - : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN( - T, - T &&, - details::deref_noexcept(itemFirst) = - std::move((*(*block)[index])))) - { - while (index != endIndex) - { - auto &el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else - { - MOODYCAMEL_TRY - { - while (index != endIndex) - { - auto &el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH(...) - { - // It's too late to revert the dequeue, but we - // can make sure that all the dequeued objects - // are properly destroyed and the block index - // (and empty count) are properly updated before - // we propagate the exception - do - { - block = localBlockIndex->entries[indexIndex] - .block; - while (index != endIndex) - { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block:: - template set_many_empty< - explicit_context>( - firstIndexInBlock, - static_cast( - endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & - (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast( - BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = - details::circular_less_than( - firstIndex + static_cast( - actualCount), - endIndex) - ? 
firstIndex + static_cast( - actualCount) - : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty< - explicit_context>( - firstIndexInBlock, - static_cast(endIndex - firstIndexInBlock)); - indexIndex = - (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else - { - // Wasn't anything to dequeue after all; make the effective - // dequeue count eventually consistent - this->dequeueOvercommit.fetch_add( - desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block *block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic - front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry *entries; - void *prev; - }; - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast( - (Traits::malloc)(sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) - { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast( - details::align_for(newRawPtr + - sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) - { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & - prevBlockSizeMask; - do - { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, - std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the - // old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer only -- consumer must use the ones in - // referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry *pr_blockIndexEntries; - void *pr_blockIndexRaw; - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer *nextExplicitProducer; - - private: -#endif - -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer : public ProducerBase - { - ImplicitProducer(ConcurrentQueue *parent_) - : ProducerBase(parent_, false), - nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), - blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all - // enqueue/dequeue operations completed already; this means that all - // undequeued elements are placed contiguously across contiguous - // blocks, and that only the first and last remaining blocks can be - // only partially empty (all other remaining blocks must be - // completely full). 
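+            // For instance (BLOCK_SIZE == 32 assumed for illustration): with
+            // headIndex == 40 and tailIndex == 100 the live elements occupy
+            // indices 40..99, so the block covering 32..63 is partially full
+            // (from offset 8), the block covering 64..95 is completely full,
+            // and the block covering 96..127 is partially full (offsets 0..3);
+            // the loop below destroys exactly that contiguous range.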
- -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) - { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block *block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = - index != tail; // If we enter the loop, then the last (tail) - // block will not be freed - while (index != tail) - { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || - block == nullptr) - { - if (block != nullptr) - { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load( - std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on - // the free list (unless the head index reached the end of it, in - // which case the tail will be poised to create a new block). - if (this->tailBlock != nullptr && - (forceFreeLastBlock || - (tail & static_cast(BLOCK_SIZE - 1)) != 0)) - { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) - { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) - { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do - { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U &&element) - { - index_t currentTailIndex = - this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) - { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, - head)); - if (!details::circular_less_than( - head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != - details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < - currentTailIndex - head))) - { - return false; - } -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block - // index - BlockIndexEntry *idxEntry; - if (!insert_block_index_entry(idxEntry, - currentTailIndex)) - { - return false; - } - - // Get ahold of a new block - auto newBlock = - this->parent->ConcurrentQueue::template requisition_block< - allocMode>(); - if (newBlock == nullptr) - { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty< - implicit_context>(); - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, - U, - new (static_cast(nullptr)) - T(std::forward(element)))) - { - // May throw, try to insert now before we publish the fact - // that we have this new block - MOODYCAMEL_TRY - { - new ((*newBlock)[currentTailIndex]) - T(std::forward(element)); - } - MOODYCAMEL_CATCH(...) 
- { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, - std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF( - !MOODYCAMEL_NOEXCEPT_CTOR(T, - U, - new (static_cast(nullptr)) - T(std::forward(element)))) - { - this->tailIndex.store(newTailIndex, - std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) - T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U &element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = - this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than( - this->dequeueOptimisticCount.load( - std::memory_order_relaxed) - - overcommit, - tail)) - { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( - 1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than( - myDequeueCount - overcommit, tail))) - { - index_t index = - this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto &el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN( - T, T &&, element = std::move(el))) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead - // of only when a block is released is very sub-optimal, - // but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard - { - Block *block; - index_t index; - BlockIndexEntry *entry; - ConcurrentQueue *parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block:: - template set_empty( - index)) - { - entry->value.store( - nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = {block, index, entry, this->parent}; - - element = std::move(el); // NOLINT - } - else - { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty< - implicit_context>(index)) - { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool - // (and remove from block index) - entry->value.store(nullptr, - std::memory_order_relaxed); - } - this->parent->add_block_to_free_list( - block); // releases the above store - } - } - - return true; - } - else - { - this->dequeueOvercommit.fetch_add( - 1, std::memory_order_release); - } - } - - return false; - } - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of - // the elements; this means pre-allocating blocks and putting them - // in the block index (but only if all the allocations succeeded). 
- - // Note that the tailBlock we start off with may not be owned by us - // any more; this happens if it was filled up exactly to the top - // (setting tailIndex to the first index of the next block which is - // not yet allocated), then dequeued completely (putting it on the - // free list) before we enqueue again. - - index_t startTailIndex = - this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block *firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = - ((startTailIndex + count - 1) & - ~static_cast(BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = - (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do - { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block - // index - BlockIndexEntry *idxEntry = - nullptr; // initialization here unnecessary but - // compiler can't always tell - Block *newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than( - currentTailIndex, head)); - bool full = - !details::circular_less_than( - head, currentTailIndex + BLOCK_SIZE) || - (MAX_SUBQUEUE_SIZE != - details::const_numeric_max::value && - (MAX_SUBQUEUE_SIZE == 0 || - MAX_SUBQUEUE_SIZE - BLOCK_SIZE < - currentTailIndex - head)); - - if (full || - !(indexInserted = insert_block_index_entry( - idxEntry, currentTailIndex)) || - (newBlock = - this->parent->ConcurrentQueue:: - template requisition_block()) == - nullptr) - { - // Index allocation or block allocation failed; revert - // any other allocations and index insertions done so - // far for this operation - if (indexInserted) - { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, - std::memory_order_relaxed); - } - currentTailIndex = - (startTailIndex - 1) & - ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; - block = block->next) - { - currentTailIndex += - static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index( - currentTailIndex); - idxEntry->value.store(nullptr, - std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list( - firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty< - implicit_context>(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later - // allocations fail, and so that we can find the blocks when - // we do the actual enqueueing - if ((startTailIndex & - static_cast(BLOCK_SIZE - 1)) != 0 || - firstAllocatedBlock != nullptr) - { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr - ? 
newBlock - : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != - 0 || - firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && - firstAllocatedBlock != nullptr) - { - this->tailBlock = firstAllocatedBlock; - } - while (true) - { - index_t stopIndex = - (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, - stopIndex)) - { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( - T, - decltype(*itemFirst), - new (static_cast(nullptr)) - T(details::deref_noexcept(itemFirst)))) - { - while (currentTailIndex != stopIndex) - { - new ((*this->tailBlock)[currentTailIndex++]) - T(*itemFirst++); - } - } - else - { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) - { - new ((*this->tailBlock)[currentTailIndex]) - T(details::nomove_if(nullptr)) - T(details::deref_noexcept( - itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH(...) - { - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) - { - auto block = startBlock; - if ((startTailIndex & - static_cast(BLOCK_SIZE - 1)) == 0) - { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) - { - stopIndex = - (currentTailIndex & - ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - if (details::circular_less_than( - constructedStopIndex, stopIndex)) - { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) - { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) - { - break; - } - block = block->next; - } - } - - currentTailIndex = - (startTailIndex - 1) & - ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; - block = block->next) - { - currentTailIndex += - static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index( - currentTailIndex); - idxEntry->value.store(nullptr, - std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list( - firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) - { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } -#ifdef _MSC_VER -#pragma warning(pop) -#endif - - template - size_t dequeue_bulk(It &itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = - this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast( - tail - - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - - overcommit)); - if (details::circular_less_than(0, desiredCount)) - { - desiredCount = desiredCount < max ? 
desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( - desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = - static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) - { - actualCount = - desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) - { - this->dequeueOvercommit.fetch_add( - desiredCount - actualCount, - std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed - // to be at least actualCount elements, this will never - // exceed tail. - auto firstIndex = this->headIndex.fetch_add( - actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader *localBlockIndex; - auto indexIndex = - get_block_index_index_for_index(index, localBlockIndex); - do - { - auto blockStartIndex = index; - index_t endIndex = - (index & ~static_cast(BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = - details::circular_less_than( - firstIndex + static_cast(actualCount), - endIndex) - ? firstIndex + static_cast(actualCount) - : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = - entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN( - T, - T &&, - details::deref_noexcept(itemFirst) = - std::move((*(*block)[index])))) - { - while (index != endIndex) - { - auto &el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else - { - MOODYCAMEL_TRY - { - while (index != endIndex) - { - auto &el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH(...) - { - do - { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load( - std::memory_order_relaxed); - while (index != endIndex) - { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block:: - template set_many_empty< - implicit_context>( - blockStartIndex, - static_cast( - endIndex - - blockStartIndex))) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store( - nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list( - block); - } - indexIndex = - (indexIndex + 1) & - (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast( - BLOCK_SIZE - 1)) + - static_cast(BLOCK_SIZE); - endIndex = - details::circular_less_than( - firstIndex + static_cast( - actualCount), - endIndex) - ? firstIndex + static_cast( - actualCount) - : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block:: - template set_many_empty( - blockStartIndex, - static_cast(endIndex - - blockStartIndex))) - { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a - // release, meaning that anybody who acquires - // the block we're about to free can use it - // safely since our writes (and reads!) will - // have happened-before then. 
- entry->value.store(nullptr, - std::memory_order_relaxed); - } - this->parent->add_block_to_free_list( - block); // releases the above store - } - indexIndex = - (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else - { - this->dequeueOvercommit.fetch_add( - desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an - // invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry *entries; - BlockIndexEntry **index; - BlockIndexHeader *prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, - index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load( - std::memory_order_relaxed); // We're the only writer thread, - // relaxed is OK - if (localBlockIndex == nullptr) - { - return false; // this can happen if new_block_index failed in - // the constructor - } - size_t newTail = - (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & - (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == - INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) - { - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) - { - return false; - } - else if (!new_block_index()) - { - return false; - } - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = - (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & - (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == - INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store( - (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & - (localBlockIndex->capacity - 1), - std::memory_order_relaxed); - } - - inline BlockIndexEntry *get_block_index_entry_for_index( - index_t index) const - { - BlockIndexHeader *localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index( - index_t index, BlockIndexHeader *&localBlockIndex) const - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load( - std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may - // wrap around, causing a negative offset, whose negativity we want - // to preserve - auto offset = static_cast( - static_cast::type>( - index - tailBase) / - BLOCK_SIZE); - 
size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); - assert(localBlockIndex->index[idx]->key.load( - std::memory_order_relaxed) == index && - localBlockIndex->index[idx]->value.load( - std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = - prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + - sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); - if (raw == nullptr) - { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast( - details::align_for(raw + - sizeof(BlockIndexHeader))); - auto index = reinterpret_cast( - details::align_for( - reinterpret_cast(entries) + - sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) - { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do - { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) - { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, - std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store( - (prevCapacity - 1) & (nextBlockIndexCapacity - 1), - std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - - private: -#endif - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer *nextImplicitProducer; - - private: -#endif - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; -#endif -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) - { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) - { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) - { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block *try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= - initialBlockPoolSize) - { - return nullptr; - } - - auto index = - initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) - : nullptr; - } - - inline void add_block_to_free_list(Block *block) - { -#ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - freeList.add(block); - } - - inline void add_blocks_to_free_list(Block *block) - { - while (block != nullptr) - { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block *try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one - // (if applicable) - template - Block *requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) - { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) - { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) - { - return create(); - } - else - { - return nullptr; - } - } - -#ifdef MCDBGQ_TRACKMEM -public: - struct MemStats - { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue *q) - { - MemStats stats = {0}; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) - { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = q->producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - bool implicit = - dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 
0 : 1; - - if (implicit) - { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = - prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) - { - for (size_t i = 0; i != hash->capacity; ++i) - { - if (hash->index[i]->key.load( - std::memory_order_relaxed) != - ImplicitProducer::INVALID_BLOCK_BASE && - hash->index[i]->value.load( - std::memory_order_relaxed) != nullptr) - { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += - hash->capacity * - sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) - { - stats.implicitBlockIndexBytes += - sizeof(typename ImplicitProducer:: - BlockIndexHeader) + - hash->capacity * - sizeof(typename ImplicitProducer:: - BlockIndexEntry *); - } - } - for (; details::circular_less_than(head, tail); - head += BLOCK_SIZE) - { - // auto block = - // prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else - { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) - { - auto block = tailBlock; - do - { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block:: - template is_empty() || - wasNonEmpty) - { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = - prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) - { - stats.explicitBlockIndexBytes += - sizeof( - typename ExplicitProducer::BlockIndexHeader) + - index->size * - sizeof( - typename ExplicitProducer::BlockIndexEntry); - index = static_cast< - typename ExplicitProducer::BlockIndexHeader *>( - index->prev); - } - } - } - - auto freeOnInitialPool = - q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= - q->initialBlockPoolSize - ? 0 - : q->initialBlockPoolSize - q->initialBlockPoolIndex.load( - std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. - MemStats getMemStats() - { - return MemStats::getFor(this); - } - -private: - friend struct MemStats; -#endif - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase *recycle_or_create_producer(bool isExplicit) - { - bool recycled; - return recycle_or_create_producer(isExplicit, recycled); - } - - ProducerBase *recycle_or_create_producer(bool isExplicit, bool &recycled) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); - ptr != nullptr; - ptr = ptr->next_prod()) - { - if (ptr->inactive.load(std::memory_order_relaxed) && - ptr->isExplicit == isExplicit) - { - bool expected = true; - if (ptr->inactive.compare_exchange_strong( - expected, - /* desired */ false, - std::memory_order_acquire, - std::memory_order_relaxed)) - { - // We caught one! 
It's been marked as activated, the caller - // can have it - recycled = true; - return ptr; - } - } - } - - recycled = false; - return add_producer(isExplicit ? static_cast( - create(this)) - : create(this)); - } - - ProducerBase *add_producer(ProducerBase *producer) - { - // Handle failed memory allocation - if (producer == nullptr) - { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do - { - producer->next = prevTail; - } while ( - !producerListTail.compare_exchange_weak(prevTail, - producer, - std::memory_order_release, - std::memory_order_relaxed)); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) - { - auto prevTailExplicit = - explicitProducers.load(std::memory_order_relaxed); - do - { - static_cast(producer) - ->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak( - prevTailExplicit, - static_cast(producer), - std::memory_order_release, - std::memory_order_relaxed)); - } - else - { - auto prevTailImplicit = - implicitProducers.load(std::memory_order_relaxed); - do - { - static_cast(producer) - ->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak( - prevTailImplicit, - static_cast(producer), - std::memory_order_release, - std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); - ptr != nullptr; - ptr = ptr->next_prod()) - { - ptr->parent = this; - } - } - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer - *value; // No need for atomicity since it's only read by the thread - // that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) - { - } - - ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), - std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP &operator=(ImplicitProducerKVP &&other) - MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) - { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap( - typename ConcurrentQueue::ImplicitProducerKVP &, - typename ConcurrentQueue::ImplicitProducerKVP &) - MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP *entries; - ImplicitProducerHash *prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - { - return; - } - else - { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) - { - initialImplicitProducerHashEntries[i].key.store( - details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - 
implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue &other) - { - MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - { - return; - } - else - { - // Swap (assumes our implicit producer hash is initialized) - initialImplicitProducerHashEntries.swap( - other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = - &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = - &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, - other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, - other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == - &other.initialImplicitProducerHash) - { - implicitProducerHash.store(&initialImplicitProducerHash, - std::memory_order_relaxed); - } - else - { - ImplicitProducerHash *hash; - for (hash = - implicitProducerHash.load(std::memory_order_relaxed); - hash->prev != &other.initialImplicitProducerHash; - hash = hash->prev) - { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == - &initialImplicitProducerHash) - { - other.implicitProducerHash.store( - &other.initialImplicitProducerHash, - std::memory_order_relaxed); - } - else - { - ImplicitProducerHash *hash; - for (hash = other.implicitProducerHash.load( - std::memory_order_relaxed); - hash->prev != &initialImplicitProducerHash; - hash = hash->prev) - { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer *get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread - // ID), there's a reduced need for fences (memory ordering is already - // consistent for any individual thread), except for the current table - // itself. - - // Start by looking for the thread ID in the current and all previous - // hash tables. If it's not found, it must not be in there yet, since - // this same thread would have added it previously to one of the tables - // that we traversed. - - // Code and algorithm adapted from - // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings - // (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) - { - // Look for the id in this hash - auto index = hashedId; - while (true) - { // Not an infinite loop because at least one slot is free in the - // hash table - index &= hash->capacity - 1; - - auto probedKey = - hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) - { - // Found it! If we had to search several hashes deep, - // though, we should lazily add it to the current main hash - // table to avoid the extended search next time. Note - // there's guaranteed to be room in the current hash table - // since every subsequent table implicitly reserves space - // for all previous tables (there's only one - // implicitProducerHashCount). 
- auto value = hash->entries[index].value; - if (hash != mainHash) - { - index = hashedId; - while (true) - { - index &= mainHash->capacity - 1; - probedKey = mainHash->entries[index].key.load( - std::memory_order_relaxed); - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if ((probedKey == empty && - mainHash->entries[index] - .key.compare_exchange_strong( - empty, - id, - std::memory_order_relaxed, - std::memory_order_relaxed)) || - (probedKey == reusable && - mainHash->entries[index] - .key.compare_exchange_strong( - reusable, - id, - std::memory_order_acquire, - std::memory_order_acquire))) - { -#else - if ((probedKey == empty && - mainHash->entries[index] - .key.compare_exchange_strong( - empty, - id, - std::memory_order_relaxed, - std::memory_order_relaxed))) - { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) - { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add( - 1, std::memory_order_relaxed); - while (true) - { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && - !implicitProducerHashResizeInProgress.test_and_set( - std::memory_order_acquire)) - { - // We've acquired the resize lock, try to allocate a bigger hash - // table. Note the acquire fence synchronizes with the release - // fence at the end of this block, and hence when we reload - // implicitProducerHash it must be the most recent version (it - // only gets changed within this locked block). - mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) - { - auto newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) - { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)( - sizeof(ImplicitProducerHash) + - std::alignment_of::value - 1 + - sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) - { - // Allocation failed - implicitProducerHashCount.fetch_sub( - 1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear( - std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast( - details::align_for( - raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) - { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store( - details::invalid_thread_id, - std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, - std::memory_order_release); - implicitProducerHashResizeInProgress.clear( - std::memory_order_release); - mainHash = newHash; - } - else - { - implicitProducerHashResizeInProgress.clear( - std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that - // we don't have to wait for the next table to finish being - // allocated by another thread (and if we just finished allocating - // above, the condition will always be true) - if (newCount < - (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) - { - bool recycled; - auto producer = static_cast( - recycle_or_create_producer(false, recycled)); - if (producer == nullptr) - { - implicitProducerHashCount.fetch_sub( - 1, std::memory_order_relaxed); - return nullptr; - } 
- if (recycled) - { - implicitProducerHashCount.fetch_sub( - 1, std::memory_order_relaxed); - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = - &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe( - &producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) - { - index &= mainHash->capacity - 1; - auto probedKey = mainHash->entries[index].key.load( - std::memory_order_relaxed); - - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if ((probedKey == empty && - mainHash->entries[index].key.compare_exchange_strong( - empty, - id, - std::memory_order_relaxed, - std::memory_order_relaxed)) || - (probedKey == reusable && - mainHash->entries[index].key.compare_exchange_strong( - reusable, - id, - std::memory_order_acquire, - std::memory_order_acquire))) - { -#else - if ((probedKey == empty && - mainHash->entries[index].key.compare_exchange_strong( - empty, - id, - std::memory_order_relaxed, - std::memory_order_relaxed))) - { -#endif - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy - // allocating a new one. We need to wait for the allocating thread - // to finish (if it succeeds, we add, if not, we try to allocate - // ourselves). - mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer *producer) - { - // Remove from thread exit listeners - details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); - - // Remove from hash -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != - nullptr); // The thread exit listener is only registered if we - // were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't - // on the current one yet and are trying to add an entry thinking - // there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) - { - auto index = hashedId; - do - { - index &= hash->capacity - 1; - probedKey = - hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) - { - hash->entries[index].key.store(details::invalid_thread_id2, - std::memory_order_release); - break; - } - ++index; - } while (probedKey != - details::invalid_thread_id); // Can happen if the hash has - // changed but we weren't put - // back in it yet, or if we - // weren't added to this hash - // in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void *userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void *aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= - 
std::alignment_of::value) - return (Traits::malloc)(size); - else - { - size_t alignment = std::alignment_of::value; - void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); - if (!raw) - return nullptr; - char *ptr = details::align_for( - reinterpret_cast(raw) + sizeof(void *)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void *ptr) - { - MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= - std::alignment_of::value) - return (Traits::free)(ptr); - else(Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) - : nullptr); - } - - template - static inline U *create_array(size_t count) - { - assert(count > 0); - U *p = static_cast(aligned_malloc(sizeof(U) * count)); - if (p == nullptr) - return nullptr; - - for (size_t i = 0; i != count; ++i) - new (p + i) U(); - return p; - } - - template - static inline void destroy_array(U *p, size_t count) - { - if (p != nullptr) - { - assert(count > 0); - for (size_t i = count; i != 0;) - (p + --i)->~U(); - } - aligned_free(p); - } - - template - static inline U *create() - { - void *p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U *create(A1 &&a1) - { - void *p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U *p) - { - if (p != nullptr) - p->~U(); - aligned_free(p); - } - -private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block *initialBlockPool; - size_t initialBlockPoolSize; - -#ifndef MCDBGQ_USEDEBUGFREELIST - FreeList freeList; -#else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic - implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array - initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; -#endif - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; -#endif -}; - -template -ProducerToken::ProducerToken(ConcurrentQueue &queue) - : producer(queue.recycle_or_create_producer(true)) -{ - if (producer != nullptr) - { - producer->token = this; - } -} - -template -ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) - : producer(reinterpret_cast *>(&queue) - ->recycle_or_create_producer(true)) -{ - if (producer != nullptr) - { - producer->token = this; - } -} - -template -ConsumerToken::ConsumerToken(ConcurrentQueue &queue) - : itemsConsumedFromCurrent(0), - currentProducer(nullptr), - desiredProducer(nullptr) -{ - initialOffset = - queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); -} - -template -ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) - : itemsConsumedFromCurrent(0), - currentProducer(nullptr), - desiredProducer(nullptr) -{ - initialOffset = - reinterpret_cast *>(&queue) - ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); -} - -template -inline void swap(ConcurrentQueue &a, - ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} - -inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} - -inline void 
swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} - -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, - typename ConcurrentQueue::ImplicitProducerKVP &b) - MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} - -} // namespace moodycamel - -#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) -#pragma warning(pop) -#endif - -#if defined(__GNUC__) -#pragma GCC diagnostic pop -#endif From d48aa3cbd08e42417cb8fc73775573d8fdf63f2b Mon Sep 17 00:00:00 2001 From: KevinChou Date: Fri, 25 Aug 2023 17:30:16 +0800 Subject: [PATCH 06/20] Update src/bthread/parking_lot.cpp Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/bthread/parking_lot.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/bthread/parking_lot.cpp b/src/bthread/parking_lot.cpp index 76ab2b319a..b35a4057f2 100644 --- a/src/bthread/parking_lot.cpp +++ b/src/bthread/parking_lot.cpp @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + #include "parking_lot.h" namespace bthread { From 86048e88aff322cdeb60cd56ca559284ca7c11e1 Mon Sep 17 00:00:00 2001 From: KevinChou Date: Fri, 25 Aug 2023 17:30:25 +0800 Subject: [PATCH 07/20] Update src/bthread/moodycamelqueue.h Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/bthread/moodycamelqueue.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/bthread/moodycamelqueue.h b/src/bthread/moodycamelqueue.h index d0d042f6b3..e6b6123f63 100644 --- a/src/bthread/moodycamelqueue.h +++ b/src/bthread/moodycamelqueue.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + // Provides a C++11 implementation of a multi-producer, multi-consumer lock-free // queue. 
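For orientation, the rest of this series touches the vendored queue only through a very small surface: TaskGroup enqueues bthread_t ids with ConcurrentQueue::enqueue() and drains them with try_dequeue() through a per-consumer moodycamel::ConsumerToken (see the ResumeRunQueue changes later in the series). A minimal stand-alone sketch of that usage pattern, assuming only the vendored header; the main() harness and the local task_id_t typedef are illustrative, not part of the patch:

#include "bthread/moodycamelqueue.h"

#include <cstdint>
#include <cstdio>

typedef uint64_t task_id_t;  // illustrative stand-in for bthread_t

int main() {
    // Pre-size roughly as ResumeRunQueue does (capacity hint of 10000).
    moodycamel::ConcurrentQueue<task_id_t> q(10000);
    // One token per consumer, mirroring TaskGroup::_resume_consumer_token.
    moodycamel::ConsumerToken tok(q);

    q.enqueue(42);                      // producer side, cf. push_resume_task()
    task_id_t tid = 0;
    if (q.try_dequeue(tok, tid)) {      // consumer side, cf. pop_resume_task()
        std::printf("dequeued %llu\n", static_cast<unsigned long long>(tid));
    }
    return 0;
}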
An overview, including benchmark results, is provided here: // http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ From bc3eaab5bc8b1eebd0384a0f02a45bee13802fd3 Mon Sep 17 00:00:00 2001 From: KevinChou Date: Mon, 4 Sep 2023 17:27:47 +0800 Subject: [PATCH 08/20] Add no_signal parameter to notify_one. (#5) * add no_signal parameter to notify_one * define guard for bthread_cond_signal --- src/bthread/bthread.h | 5 ++++- src/bthread/condition_variable.cpp | 4 ++-- src/bthread/condition_variable.h | 11 +++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/bthread/bthread.h b/src/bthread/bthread.h index 3f55eb6764..87545ca537 100644 --- a/src/bthread/bthread.h +++ b/src/bthread/bthread.h @@ -196,8 +196,11 @@ extern int bthread_cond_init(bthread_cond_t* __restrict cond, // Destroy condition variable `cond'. extern int bthread_cond_destroy(bthread_cond_t* cond); +#ifndef BTHREAD_COND_SIGNAL +#define BTHREAD_COND_SIGNAL // Wake up one thread waiting for condition variable `cond'. -extern int bthread_cond_signal(bthread_cond_t* cond); +extern int bthread_cond_signal(bthread_cond_t* cond, bool no_signal = false); +#endif // Wake up all threads waiting for condition variables `cond'. extern int bthread_cond_broadcast(bthread_cond_t* cond); diff --git a/src/bthread/condition_variable.cpp b/src/bthread/condition_variable.cpp index e04187d346..cbf586e3f2 100644 --- a/src/bthread/condition_variable.cpp +++ b/src/bthread/condition_variable.cpp @@ -58,14 +58,14 @@ int bthread_cond_destroy(bthread_cond_t* c) { return 0; } -int bthread_cond_signal(bthread_cond_t* c) { +int bthread_cond_signal(bthread_cond_t* c, bool no_signal) { bthread::CondInternal* ic = reinterpret_cast(c); // ic is probably dereferenced after fetch_add, save required fields before // this point butil::atomic* const saved_seq = ic->seq; saved_seq->fetch_add(1, butil::memory_order_release); // don't touch ic any more - bthread::butex_wake(saved_seq); + bthread::butex_wake(saved_seq, no_signal); return 0; } diff --git a/src/bthread/condition_variable.h b/src/bthread/condition_variable.h index c684cf6cbd..c42a4387f5 100644 --- a/src/bthread/condition_variable.h +++ b/src/bthread/condition_variable.h @@ -29,7 +29,10 @@ __BEGIN_DECLS extern int bthread_cond_init(bthread_cond_t* __restrict cond, const bthread_condattr_t* __restrict cond_attr); extern int bthread_cond_destroy(bthread_cond_t* cond); -extern int bthread_cond_signal(bthread_cond_t* cond); +#ifndef BTHREAD_COND_SIGNAL +#define BTHREAD_COND_SIGNAL +extern int bthread_cond_signal(bthread_cond_t* cond, bool no_signal = false); +#endif extern int bthread_cond_broadcast(bthread_cond_t* cond); extern int bthread_cond_wait(bthread_cond_t* __restrict cond, bthread_mutex_t* __restrict mutex); @@ -45,7 +48,7 @@ class ConditionVariable { DISALLOW_COPY_AND_ASSIGN(ConditionVariable); public: typedef bthread_cond_t* native_handler_type; - + ConditionVariable() { CHECK_EQ(0, bthread_cond_init(&_cond, NULL)); } @@ -89,8 +92,8 @@ class ConditionVariable { return rc == ETIMEDOUT ? ETIMEDOUT : 0; } - void notify_one() { - bthread_cond_signal(&_cond); + void notify_one(bool no_signal = false) { + bthread_cond_signal(&_cond, no_signal); } void notify_all() { From 03bc4be7ec956fc5490bf15f87782c0d3b5e0f3d Mon Sep 17 00:00:00 2001 From: weidaolee <43331836+weidaolee@users.noreply.github.com> Date: Thu, 14 Sep 2023 13:32:46 +0800 Subject: [PATCH 09/20] Update minimum virsion requirements for dependancies. 
(#6) The latest code relies on: * C++11 -> C++17 * Glog minimum version >= 0.6.0 --- CMakeLists.txt | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85881a2eec..a3cd334fe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,14 +41,14 @@ SET(CPACK_DEBIAN_PACKAGE_MAINTAINER "brpc authors") INCLUDE(CPack) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # require at least gcc 4.8 - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - message(FATAL_ERROR "GCC is too old, please install a newer version supporting C++11") + # require at least gcc 8 + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8) # ref: https://gcc.gnu.org/projects/cxx-status.html + message(FATAL_ERROR "GCC is too old, please install a newer version supporting C++17") endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # require at least clang 3.3 - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) - message(FATAL_ERROR "Clang is too old, please install a newer version supporting C++11") + # require at least clang 5 + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5) # ref: https://clang.llvm.org/cxx_status.html + message(FATAL_ERROR "Clang is too old, please install a newer version supporting C++17") endif() else() message(WARNING "You are using an unsupported compiler! Compilation has only been tested with Clang and GCC.") @@ -121,21 +121,21 @@ set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEBUG_SYMBOL} ${THRIFT_CPP_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CPP_FLAGS} -O2 -pipe -Wall -W -fPIC -fstrict-aliasing -Wno-invalid-offsetof -Wno-unused-parameter -fno-omit-frame-pointer") set(CMAKE_C_FLAGS "${CMAKE_CPP_FLAGS} -O2 -pipe -Wall -W -fPIC -fstrict-aliasing -Wno-unused-parameter -fno-omit-frame-pointer") -macro(use_cxx11) +macro(use_cxx17) if(CMAKE_VERSION VERSION_LESS "3.1.3") if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") endif() else() - set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) endif() -endmacro(use_cxx11) +endmacro(use_cxx17) -use_cxx11() +use_cxx17() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") #required by butil/crc32.cc to boost performance for 10x @@ -169,8 +169,9 @@ if(WITH_SNAPPY) endif() if(WITH_GLOG) + message(NOTICE "BRPC WITH_GLOG=ON") find_path(GLOG_INCLUDE_PATH NAMES glog/logging.h) - find_library(GLOG_LIB NAMES glog) + find_library(GLOG_LIB NAMES glog VERSION ">=0.6.0" REQUIRE) if((NOT GLOG_INCLUDE_PATH) OR (NOT GLOG_LIB)) message(FATAL_ERROR "Fail to find glog") endif() From 73fb5a929a7540845cbc23367aba73873cb72f97 Mon Sep 17 00:00:00 2001 From: KevinChou Date: Thu, 14 Sep 2023 19:20:30 +0800 Subject: [PATCH 10/20] change static resume_rq to shared_ptr get from singleton object (#8) --- src/bthread/task_control.cpp | 2 +- src/bthread/task_group.cpp | 7 +++---- src/bthread/task_group.h | 27 +++++++++++++++++++++++---- src/bthread/task_group_inl.h | 12 ++++++------ 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index ceb476ca04..78001618b8 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -432,7 +432,7 @@ void TaskControl::print_resume_q_sizes(std::ostream &os) { // ngroup > _ngroup: nums[_ngroup ... 
ngroup-1] = 0 // ngroup < _ngroup: just ignore _groups[_ngroup ... ngroup-1] for (size_t i = 0; i < ngroup; ++i) { - nums[i] = (_groups[i] ? _groups[i]->_resume_rq_cnt.load(std::memory_order_relaxed) : 0); + nums[i] = (_groups[i] ? _groups[i]->_resume_rq_cnt->load(std::memory_order_relaxed) : 0); } } for (size_t i = 0; i < ngroup; ++i) { diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 104bd6f5c8..ebe45991aa 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -39,9 +39,6 @@ namespace bthread { -std::atomic TaskGroup::_resume_rq_cnt{0}; -moodycamel::ConcurrentQueue TaskGroup::_resume_rq(10000); - static const bthread_attr_t BTHREAD_ATTR_TASKGROUP = { BTHREAD_STACKTYPE_UNKNOWN, 0, NULL }; @@ -200,7 +197,9 @@ TaskGroup::TaskGroup(TaskControl* c) #ifndef NDEBUG , _sched_recursive_guard(0) #endif - ,_resume_consumer_token(_resume_rq) + , _resume_rq_cnt(ResumeRunQueue::Instance().first) + , _resume_rq(ResumeRunQueue::Instance().second) + , _resume_consumer_token(*_resume_rq) { _steal_seed = butil::fast_rand(); _steal_offset = OFFSET_TABLE[_steal_seed % ARRAY_SIZE(OFFSET_TABLE)]; diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index f29014047c..1da6e3bc32 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -49,6 +49,25 @@ class ExitException : public std::exception { void* _value; }; +// Global resumed tasks. +class ResumeRunQueue { +public: + static std::pair>, + std::shared_ptr>> Instance() { + static ResumeRunQueue instance; + return {instance.queue_size_, instance.concurrent_queue_}; + } + +private: + ResumeRunQueue() { + queue_size_ = std::make_shared>(0); + concurrent_queue_ = std::make_shared>(10000); + } + + std::shared_ptr> queue_size_; + std::shared_ptr> concurrent_queue_; +}; + // Thread-local group of tasks. // Notice that most methods involving context switching are static otherwise // pointer `this' may change after wakeup. The **pg parameters in following @@ -95,7 +114,7 @@ class TaskGroup { _last_context_remained = cb; _last_context_remained_arg = arg; } - + // Suspend caller for at least |timeout_us| microseconds. // If |timeout_us| is 0, this function does nothing. 
// If |group| is NULL or current thread is non-bthread, call usleep(3) @@ -227,7 +246,7 @@ friend class TaskControl; } TaskMeta* _cur_meta; - + // the control that this group belongs to TaskControl* _control; int _num_nosignal; @@ -255,8 +274,8 @@ friend class TaskControl; int _sched_recursive_guard; - static std::atomic _resume_rq_cnt; - static moodycamel::ConcurrentQueue _resume_rq; + std::shared_ptr> _resume_rq_cnt; + std::shared_ptr> _resume_rq; moodycamel::ConsumerToken _resume_consumer_token; }; diff --git a/src/bthread/task_group_inl.h b/src/bthread/task_group_inl.h index de42add385..f2041e147c 100644 --- a/src/bthread/task_group_inl.h +++ b/src/bthread/task_group_inl.h @@ -98,21 +98,21 @@ inline void TaskGroup::push_rq(bthread_t tid) { } inline bool TaskGroup::pop_resume_task(bthread_t* tid) { - int tmp_cnt = _resume_rq_cnt.load(std::memory_order_relaxed); - if (tmp_cnt>0 && _resume_rq_cnt.compare_exchange_strong(tmp_cnt, tmp_cnt-1)){ - if(_resume_rq.try_dequeue(_resume_consumer_token, *tid)){ + int tmp_cnt = _resume_rq_cnt->load(std::memory_order_relaxed); + if (tmp_cnt>0 && _resume_rq_cnt->compare_exchange_strong(tmp_cnt, tmp_cnt-1)){ + if(_resume_rq->try_dequeue(_resume_consumer_token, *tid)){ return true; } else { - _resume_rq_cnt ++; + (*_resume_rq_cnt) ++; } } return false; } inline bool TaskGroup::push_resume_task(bthread_t tid){ - if(_resume_rq.enqueue(tid)){ - _resume_rq_cnt ++; + if(_resume_rq->enqueue(tid)){ + (*_resume_rq_cnt) ++; return true; } return false; From 0e8e5a40841cff1f1eed61e36c22d606e1fc0af9 Mon Sep 17 00:00:00 2001 From: Hubert Zhang Date: Fri, 15 Sep 2023 11:34:05 +0800 Subject: [PATCH 11/20] Add memory header file --- src/bthread/task_group.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index 1da6e3bc32..dcdf33b75d 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -22,6 +22,8 @@ #ifndef BTHREAD_TASK_GROUP_H #define BTHREAD_TASK_GROUP_H +#include // shared_ptr + #include "butil/time.h" // cpuwide_time_ns #include "bthread/task_control.h" #include "bthread/task_meta.h" // bthread_t, TaskMeta From 62a3c882910a176422af2bd2aab99328d9cf8d3e Mon Sep 17 00:00:00 2001 From: KevinChou Date: Fri, 15 Sep 2023 13:01:00 +0800 Subject: [PATCH 12/20] include headers (#9) --- src/bthread/task_group.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index dcdf33b75d..cc8c4f1398 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -23,6 +23,7 @@ #define BTHREAD_TASK_GROUP_H #include // shared_ptr +#include #include "butil/time.h" // cpuwide_time_ns #include "bthread/task_control.h" From c9b7ad5c570858c1ba3cb44188123994fe2c84ce Mon Sep 17 00:00:00 2001 From: KevinChou Date: Mon, 25 Sep 2023 18:29:25 +0800 Subject: [PATCH 13/20] set default behaviour for bthread_cond_signal to no signal (#7) --- src/bthread/bthread.h | 5 +---- src/bthread/condition_variable.cpp | 3 ++- src/bthread/condition_variable.h | 9 +++------ 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/bthread/bthread.h b/src/bthread/bthread.h index 87545ca537..3f55eb6764 100644 --- a/src/bthread/bthread.h +++ b/src/bthread/bthread.h @@ -196,11 +196,8 @@ extern int bthread_cond_init(bthread_cond_t* __restrict cond, // Destroy condition variable `cond'. extern int bthread_cond_destroy(bthread_cond_t* cond); -#ifndef BTHREAD_COND_SIGNAL -#define BTHREAD_COND_SIGNAL // Wake up one thread waiting for condition variable `cond'. 
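The push_resume_task()/pop_resume_task() pair in the task_group_inl.h hunk above guards the lock-free queue with an approximate atomic counter, so an idle worker can skip the dequeue attempt entirely when the queue is believed to be empty. Below is a distilled, standalone sketch of that pattern, not the bthread implementation itself: the header name is an assumption (brpc vendors the queue as bthread/moodycamelqueue.h), uint64_t stands in for bthread_t, and the counter and queue are folded into one small class for readability.

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include "concurrentqueue.h"  // assumed upstream header for moodycamel::ConcurrentQueue

    // Counter-guarded MPMC queue. The atomic counter is only an optimistic gate;
    // the queue itself remains the source of truth.
    class ResumeQueue {
    public:
        explicit ResumeQueue(size_t capacity) : _cnt(0), _q(capacity) {}

        bool push(uint64_t tid) {
            if (!_q.enqueue(tid)) {
                return false;                                  // allocation failure only
            }
            _cnt.fetch_add(1, std::memory_order_relaxed);
            return true;
        }

        bool pop(uint64_t* tid) {
            int c = _cnt.load(std::memory_order_relaxed);
            // Reserve one item before touching the queue; losers back off cheaply.
            if (c > 0 && _cnt.compare_exchange_strong(c, c - 1)) {
                if (_q.try_dequeue(*tid)) {
                    return true;
                }
                _cnt.fetch_add(1, std::memory_order_relaxed);  // hand the reservation back
            }
            return false;
        }

    private:
        std::atomic<int> _cnt;
        moodycamel::ConcurrentQueue<uint64_t> _q;
    };

One deliberate simplification: the sketch uses the token-less try_dequeue(), because moodycamel consumer tokens are not thread-safe. The patches keep a per-TaskGroup ConsumerToken, which is valid only as long as a given token is never used by two threads at the same time.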
-extern int bthread_cond_signal(bthread_cond_t* cond, bool no_signal = false); -#endif +extern int bthread_cond_signal(bthread_cond_t* cond); // Wake up all threads waiting for condition variables `cond'. extern int bthread_cond_broadcast(bthread_cond_t* cond); diff --git a/src/bthread/condition_variable.cpp b/src/bthread/condition_variable.cpp index cbf586e3f2..667c499b42 100644 --- a/src/bthread/condition_variable.cpp +++ b/src/bthread/condition_variable.cpp @@ -58,13 +58,14 @@ int bthread_cond_destroy(bthread_cond_t* c) { return 0; } -int bthread_cond_signal(bthread_cond_t* c, bool no_signal) { +int bthread_cond_signal(bthread_cond_t* c) { bthread::CondInternal* ic = reinterpret_cast(c); // ic is probably dereferenced after fetch_add, save required fields before // this point butil::atomic* const saved_seq = ic->seq; saved_seq->fetch_add(1, butil::memory_order_release); // don't touch ic any more + bool no_signal = true; bthread::butex_wake(saved_seq, no_signal); return 0; } diff --git a/src/bthread/condition_variable.h b/src/bthread/condition_variable.h index c42a4387f5..868ee0ab72 100644 --- a/src/bthread/condition_variable.h +++ b/src/bthread/condition_variable.h @@ -29,10 +29,7 @@ __BEGIN_DECLS extern int bthread_cond_init(bthread_cond_t* __restrict cond, const bthread_condattr_t* __restrict cond_attr); extern int bthread_cond_destroy(bthread_cond_t* cond); -#ifndef BTHREAD_COND_SIGNAL -#define BTHREAD_COND_SIGNAL -extern int bthread_cond_signal(bthread_cond_t* cond, bool no_signal = false); -#endif +extern int bthread_cond_signal(bthread_cond_t* cond); extern int bthread_cond_broadcast(bthread_cond_t* cond); extern int bthread_cond_wait(bthread_cond_t* __restrict cond, bthread_mutex_t* __restrict mutex); @@ -92,8 +89,8 @@ class ConditionVariable { return rc == ETIMEDOUT ? ETIMEDOUT : 0; } - void notify_one(bool no_signal = false) { - bthread_cond_signal(&_cond, no_signal); + void notify_one() { + bthread_cond_signal(&_cond); } void notify_all() { From 2986f4dc5f74bd49ed47ba1553f2f92ed30c23ee Mon Sep 17 00:00:00 2001 From: KevinChou Date: Wed, 13 Dec 2023 16:39:40 +0800 Subject: [PATCH 14/20] fix the problem that butex_wake does not signal pending tasks (#13) --- src/bthread/condition_variable.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bthread/condition_variable.cpp b/src/bthread/condition_variable.cpp index 667c499b42..e664d24766 100644 --- a/src/bthread/condition_variable.cpp +++ b/src/bthread/condition_variable.cpp @@ -43,6 +43,7 @@ extern "C" { extern int bthread_mutex_unlock(bthread_mutex_t*); extern int bthread_mutex_lock_contended(bthread_mutex_t*); +extern void bthread_flush(); int bthread_cond_init(bthread_cond_t* __restrict c, const bthread_condattr_t*) { @@ -67,6 +68,8 @@ int bthread_cond_signal(bthread_cond_t* c) { // don't touch ic any more bool no_signal = true; bthread::butex_wake(saved_seq, no_signal); + // flush unsignaled tasks manually + bthread_flush(); return 0; } From 846f5ac0eb0b4764e8428c3caaf2c73cbf009df1 Mon Sep 17 00:00:00 2001 From: KevinChou Date: Thu, 14 Dec 2023 17:41:55 +0800 Subject: [PATCH 15/20] Redis transaction support. 
(#12) * change redis txn and support watch * update redis multi unit test --- src/brpc/policy/redis_protocol.cpp | 39 ++++++++++++-- src/brpc/redis.cpp | 5 ++ src/brpc/redis.h | 11 ++++ test/brpc_redis_unittest.cpp | 82 +++++++++++++++++------------- 4 files changed, 97 insertions(+), 40 deletions(-) diff --git a/src/brpc/policy/redis_protocol.cpp b/src/brpc/policy/redis_protocol.cpp index 5e92453ede..de1b5e4bcb 100644 --- a/src/brpc/policy/redis_protocol.cpp +++ b/src/brpc/policy/redis_protocol.cpp @@ -60,6 +60,7 @@ class RedisConnContext : public Destroyable { public: explicit RedisConnContext(const RedisService* rs) : redis_service(rs) + , in_transaction(false) , batched_size(0) {} ~RedisConnContext(); @@ -69,7 +70,10 @@ class RedisConnContext : public Destroyable { const RedisService* redis_service; // If user starts a transaction, transaction_handler indicates the // handler pointer that runs the transaction command. - std::unique_ptr transaction_handler; + std::unique_ptr transaction_handler; + // Whether this connection has begun a transaction. If true, the commands + // received will be handled by transaction_handler. + bool in_transaction; // >0 if command handler is run in batched mode. int batched_size; @@ -83,15 +87,33 @@ int ConsumeCommand(RedisConnContext* ctx, butil::IOBufAppender* appender) { RedisReply output(&ctx->arena); RedisCommandHandlerResult result = REDIS_CMD_HANDLED; - if (ctx->transaction_handler) { + if (ctx->in_transaction) { + assert(ctx->transaction_handler != nullptr); result = ctx->transaction_handler->Run(args, &output, flush_batched); if (result == REDIS_CMD_HANDLED) { ctx->transaction_handler.reset(NULL); + ctx->in_transaction = false; } else if (result == REDIS_CMD_BATCHED) { LOG(ERROR) << "BATCHED should not be returned by a transaction handler."; return -1; } - } else { + } + else if (args[0] == "watch" || args[0] == "unwatch") { + if (!ctx->transaction_handler) { + ctx->transaction_handler.reset(ctx->redis_service->NewTransactionHandler()); + ctx->in_transaction = false; + } + if (!ctx->transaction_handler) { + output.SetError("ERR Transaction not supported."); + } else { + result = ctx->transaction_handler->Run(args, &output, flush_batched); + if (result == REDIS_CMD_BATCHED) { + LOG(ERROR) << "BATCHED should not be returned by a transaction handler."; + return -1; + } + } + } + else { RedisCommandHandler* ch = ctx->redis_service->FindCommandHandler(args[0]); if (!ch) { char buf[64]; @@ -104,7 +126,16 @@ int ConsumeCommand(RedisConnContext* ctx, LOG(ERROR) << "CONTINUE should not be returned in a batched process."; return -1; } - ctx->transaction_handler.reset(ch->NewTransactionHandler()); + if (ctx->transaction_handler == nullptr) { + ctx->transaction_handler.reset(ctx->redis_service->NewTransactionHandler()); + } + if (ctx->transaction_handler != nullptr) { + ctx->transaction_handler->Begin(); + ctx->in_transaction = true; + } + else { + output.SetError("ERR Transaction not supported."); + } } else if (result == REDIS_CMD_BATCHED) { ctx->batched_size++; } diff --git a/src/brpc/redis.cpp b/src/brpc/redis.cpp index 073136102e..24f99abf0d 100644 --- a/src/brpc/redis.cpp +++ b/src/brpc/redis.cpp @@ -467,6 +467,11 @@ RedisCommandHandler* RedisService::FindCommandHandler(const butil::StringPiece& return NULL; } +TransactionHandler* RedisService::NewTransactionHandler() const { + LOG(ERROR) << "NewTransactionHandler is not implemented"; + return NULL; +} + RedisCommandHandler* RedisCommandHandler::NewTransactionHandler() { LOG(ERROR) << 
"NewTransactionHandler is not implemented"; return NULL; diff --git a/src/brpc/redis.h b/src/brpc/redis.h index d02e894121..21d4f47ab8 100644 --- a/src/brpc/redis.h +++ b/src/brpc/redis.h @@ -221,6 +221,7 @@ std::ostream& operator<<(std::ostream& os, const RedisRequest&); std::ostream& operator<<(std::ostream& os, const RedisResponse&); class RedisCommandHandler; +class TransactionHandler; // Container of CommandHandlers. // Assign an instance to ServerOption.redis_service to enable redis support. @@ -231,6 +232,9 @@ class RedisService { // Call this function to register `handler` that can handle command `name`. bool AddCommandHandler(const std::string& name, RedisCommandHandler* handler); + // Create a transaction handler to handle commands inside a transaction. + virtual TransactionHandler* NewTransactionHandler() const; + // This function should not be touched by user and used by brpc deverloper only. RedisCommandHandler* FindCommandHandler(const butil::StringPiece& name) const; @@ -243,6 +247,8 @@ enum RedisCommandHandlerResult { REDIS_CMD_HANDLED = 0, REDIS_CMD_CONTINUE = 1, REDIS_CMD_BATCHED = 2, + REDIS_CMD_TXN_START = 3, + REDIS_CMD_TXN_FINISH = 4, }; // The Command handler for a redis request. User should impletement Run(). @@ -289,6 +295,11 @@ class RedisCommandHandler { virtual RedisCommandHandler* NewTransactionHandler(); }; +class TransactionHandler : public RedisCommandHandler { +public: + virtual bool Begin() = 0; +}; + } // namespace brpc #endif // BRPC_REDIS_H diff --git a/test/brpc_redis_unittest.cpp b/test/brpc_redis_unittest.cpp index 1176676c95..615fefb6bf 100644 --- a/test/brpc_redis_unittest.cpp +++ b/test/brpc_redis_unittest.cpp @@ -811,6 +811,47 @@ butil::Mutex s_mutex; std::unordered_map m; std::unordered_map int_map; +class MultiTransactionHandler : public brpc::TransactionHandler { +public: + brpc::RedisCommandHandlerResult Run(const std::vector& args, + brpc::RedisReply* output, + bool flush_batched) { + if (args[0] == "multi") { + output->SetError("ERR duplicate multi"); + return brpc::REDIS_CMD_CONTINUE; + } + if (args[0] != "exec") { + std::vector comm; + for (int i = 0; i < (int)args.size(); ++i) { + comm.push_back(args[i].as_string()); + } + _commands.push_back(comm); + output->SetStatus("QUEUED"); + return brpc::REDIS_CMD_CONTINUE; + } + output->SetArray(_commands.size()); + s_mutex.lock(); + for (size_t i = 0; i < _commands.size(); ++i) { + if (_commands[i][0] == "incr") { + int64_t value; + value = ++int_map[_commands[i][1]]; + (*output)[i].SetInteger(value); + } else { + (*output)[i].SetStatus("unknown command"); + } + } + s_mutex.unlock(); + return brpc::REDIS_CMD_HANDLED; + } + + bool Begin() override { + return true; + } + +private: + std::vector > _commands; +}; + class RedisServiceImpl : public brpc::RedisService { public: RedisServiceImpl() @@ -862,6 +903,11 @@ class RedisServiceImpl : public brpc::RedisService { } } + brpc::TransactionHandler* NewTransactionHandler() const override { + + return new MultiTransactionHandler; + } + std::vector > _batched_command; int _batch_count; }; @@ -1088,42 +1134,6 @@ class MultiCommandHandler : public brpc::RedisCommandHandler { RedisCommandHandler* NewTransactionHandler() override { return new MultiTransactionHandler; } - - class MultiTransactionHandler : public brpc::RedisCommandHandler { - public: - brpc::RedisCommandHandlerResult Run(const std::vector& args, - brpc::RedisReply* output, - bool flush_batched) { - if (args[0] == "multi") { - output->SetError("ERR duplicate multi"); - return 
brpc::REDIS_CMD_CONTINUE; - } - if (args[0] != "exec") { - std::vector comm; - for (int i = 0; i < (int)args.size(); ++i) { - comm.push_back(args[i].as_string()); - } - _commands.push_back(comm); - output->SetStatus("QUEUED"); - return brpc::REDIS_CMD_CONTINUE; - } - output->SetArray(_commands.size()); - s_mutex.lock(); - for (size_t i = 0; i < _commands.size(); ++i) { - if (_commands[i][0] == "incr") { - int64_t value; - value = ++int_map[_commands[i][1]]; - (*output)[i].SetInteger(value); - } else { - (*output)[i].SetStatus("unknown command"); - } - } - s_mutex.unlock(); - return brpc::REDIS_CMD_HANDLED; - } - private: - std::vector > _commands; - }; }; TEST_F(RedisTest, server_command_continue) { From a70b93f73a43a646eb842151f162dd91deb748b7 Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Wed, 27 Sep 2023 15:55:20 +0800 Subject: [PATCH 16/20] local resume_rq each task group --- src/bthread/task_control.cpp | 6 +++++- src/bthread/task_group.cpp | 6 +++--- src/bthread/task_group.h | 23 ++--------------------- src/bthread/task_group_inl.h | 12 ++++++------ 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index 78001618b8..042b9799b9 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -362,6 +362,10 @@ bool TaskControl::steal_task(bthread_t* tid, size_t* seed, size_t offset) { TaskGroup* g = _groups[s % ngroup]; // g is possibly NULL because of concurrent _destroy_group if (g) { + if (g->pop_resume_task(tid)) { + stolen = true; + break; + } if (g->_rq.steal(tid)) { stolen = true; break; @@ -432,7 +436,7 @@ void TaskControl::print_resume_q_sizes(std::ostream &os) { // ngroup > _ngroup: nums[_ngroup ... ngroup-1] = 0 // ngroup < _ngroup: just ignore _groups[_ngroup ... ngroup-1] for (size_t i = 0; i < ngroup; ++i) { - nums[i] = (_groups[i] ? _groups[i]->_resume_rq_cnt->load(std::memory_order_relaxed) : 0); + nums[i] = (_groups[i] ? _groups[i]->_resume_rq_cnt.load(std::memory_order_relaxed) : 0); } } for (size_t i = 0; i < ngroup; ++i) { diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index ebe45991aa..1f563efcf9 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -197,9 +197,9 @@ TaskGroup::TaskGroup(TaskControl* c) #ifndef NDEBUG , _sched_recursive_guard(0) #endif - , _resume_rq_cnt(ResumeRunQueue::Instance().first) - , _resume_rq(ResumeRunQueue::Instance().second) - , _resume_consumer_token(*_resume_rq) + , _resume_rq_cnt(0) + , _resume_rq(1000) + , _resume_consumer_token(_resume_rq) { _steal_seed = butil::fast_rand(); _steal_offset = OFFSET_TABLE[_steal_seed % ARRAY_SIZE(OFFSET_TABLE)]; diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index cc8c4f1398..f4b0c9db17 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -52,25 +52,6 @@ class ExitException : public std::exception { void* _value; }; -// Global resumed tasks. -class ResumeRunQueue { -public: - static std::pair>, - std::shared_ptr>> Instance() { - static ResumeRunQueue instance; - return {instance.queue_size_, instance.concurrent_queue_}; - } - -private: - ResumeRunQueue() { - queue_size_ = std::make_shared>(0); - concurrent_queue_ = std::make_shared>(10000); - } - - std::shared_ptr> queue_size_; - std::shared_ptr> concurrent_queue_; -}; - // Thread-local group of tasks. // Notice that most methods involving context switching are static otherwise // pointer `this' may change after wakeup. 
The **pg parameters in following @@ -277,8 +258,8 @@ friend class TaskControl; int _sched_recursive_guard; - std::shared_ptr> _resume_rq_cnt; - std::shared_ptr> _resume_rq; + std::atomic _resume_rq_cnt; + moodycamel::ConcurrentQueue _resume_rq; moodycamel::ConsumerToken _resume_consumer_token; }; diff --git a/src/bthread/task_group_inl.h b/src/bthread/task_group_inl.h index f2041e147c..300cccd40d 100644 --- a/src/bthread/task_group_inl.h +++ b/src/bthread/task_group_inl.h @@ -98,21 +98,21 @@ inline void TaskGroup::push_rq(bthread_t tid) { } inline bool TaskGroup::pop_resume_task(bthread_t* tid) { - int tmp_cnt = _resume_rq_cnt->load(std::memory_order_relaxed); - if (tmp_cnt>0 && _resume_rq_cnt->compare_exchange_strong(tmp_cnt, tmp_cnt-1)){ - if(_resume_rq->try_dequeue(_resume_consumer_token, *tid)){ + int tmp_cnt = _resume_rq_cnt.load(std::memory_order_relaxed); + if (tmp_cnt > 0 && _resume_rq_cnt.compare_exchange_strong(tmp_cnt, tmp_cnt-1)){ + if(_resume_rq.try_dequeue(_resume_consumer_token, *tid)){ return true; } else { - (*_resume_rq_cnt) ++; + _resume_rq_cnt++; } } return false; } inline bool TaskGroup::push_resume_task(bthread_t tid){ - if(_resume_rq->enqueue(tid)){ - (*_resume_rq_cnt) ++; + if(_resume_rq.enqueue(tid)){ + _resume_rq_cnt++; return true; } return false; From 6686bfea667220b6e8ad33ec0533b5ebe4199dc9 Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Fri, 13 Oct 2023 18:26:44 +0800 Subject: [PATCH 17/20] wait_task busy loop before waiting on PL --- src/bthread/task_group.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 1f563efcf9..93e8f82d8b 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -116,17 +116,23 @@ bool TaskGroup::is_stopped(bthread_t tid) { } bool TaskGroup::wait_task(bthread_t* tid) { + int64_t poll_start_ms = butil::cpuwide_time_ms(); do { #ifndef BTHREAD_DONT_SAVE_PARKING_STATE if (_last_pl_state.stopped()) { return false; } - if (pop_resume_task(tid)) { + if (pop_resume_task(tid) || steal_task(tid)) { return true; } - _pl->wait(_last_pl_state); + // keep polling for some time before waiting on parking lot + if (butil::cpuwide_time_ms() - poll_start_ms > 100) { + _pl->wait(_last_pl_state); + poll_start_ms = butil::cpuwide_time_ms(); + } + if (steal_task(tid)) { return true; } From 7e81e6e5d65244ac1a29837f7787a1f7850497bb Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Fri, 13 Oct 2023 18:31:01 +0800 Subject: [PATCH 18/20] add bvar ready_to_run_skip_signal_task_per_second --- src/bthread/task_group.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 93e8f82d8b..2e0bb49363 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -571,7 +571,7 @@ void TaskGroup::ending_sched(TaskGroup** pg) { void TaskGroup::sched(TaskGroup** pg) { TaskGroup* g = *pg; bthread_t next_tid = 0; - + if (!g->pop_resume_task(&next_tid)) { // Find next task to run, if none, switch to idle thread of the group. 
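PATCH 17 above changes wait_task() so that a worker which has just run out of work keeps polling its resume queue and the steal path for a bounded window before it finally parks, trading a little idle CPU for a much cheaper wake-up when new work arrives shortly afterwards. The sketch below shows only the shape of that loop; the ParkingLot/futex machinery is replaced by a plain condition variable and the stop/steal details are collapsed into a single try_get_task callable, so treat it as an illustration rather than the bthread code.

    #include <atomic>
    #include <chrono>
    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    // Stand-in for bthread's ParkingLot, used only to keep the sketch self-contained.
    struct Parking {
        std::mutex mu;
        std::condition_variable cv;
        void park_briefly() {
            std::unique_lock<std::mutex> lk(mu);
            cv.wait_for(lk, std::chrono::milliseconds(50));  // woken early by wake()
        }
        void wake() { cv.notify_one(); }
    };

    // Poll for work for up to `poll_budget` before parking. `try_get_task` stands in
    // for "pop_resume_task() || steal_task()" and returns true once *tid is filled.
    template <typename TryGetTask>
    bool wait_task(TryGetTask&& try_get_task, Parking& pl, std::atomic<bool>& stopped,
                   uint64_t* tid,
                   std::chrono::milliseconds poll_budget = std::chrono::milliseconds(100)) {
        auto poll_start = std::chrono::steady_clock::now();
        while (!stopped.load(std::memory_order_relaxed)) {
            if (try_get_task(tid)) {
                return true;
            }
            if (std::chrono::steady_clock::now() - poll_start > poll_budget) {
                pl.park_briefly();                               // give the CPU back
                poll_start = std::chrono::steady_clock::now();   // start a fresh poll window
            }
        }
        return false;
    }

The 100 ms budget matches the hunk above; a later patch in this series shrinks it to 15 ms, which is the usual tuning knob here: a longer window hides wake-up latency better, a shorter one burns less CPU on idle workers.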
#ifndef BTHREAD_FAIR_WSQ @@ -670,10 +670,15 @@ void TaskGroup::destroy_self() { } } +bvar::Adder ready_to_run_skip_cnt; +bvar::PerSecond> ready_to_run_skip_ps( + "ready_to_run_skip_signal_task_per_second", + &ready_to_run_skip_cnt, 2); void TaskGroup::ready_to_run(bthread_t tid, bool nosignal) { push_rq(tid); if (nosignal || ParkingLot::_waiting_worker_count == 0) { ++_num_nosignal; + ready_to_run_skip_cnt << 1; } else { const int additional_signal = _num_nosignal; _num_nosignal = 0; From 8009b201d10d433bef4b891e0f3a05587c7ceb8e Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Tue, 9 Jan 2024 15:50:11 +0800 Subject: [PATCH 19/20] change wait_task busy poll time from 100ms to 15ms --- src/bthread/task_group.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 2e0bb49363..609f25ab1b 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -128,7 +128,7 @@ bool TaskGroup::wait_task(bthread_t* tid) { } // keep polling for some time before waiting on parking lot - if (butil::cpuwide_time_ms() - poll_start_ms > 100) { + if (butil::cpuwide_time_ms() - poll_start_ms > 15) { _pl->wait(_last_pl_state); poll_start_ms = butil::cpuwide_time_ms(); } From 6269bf1fe6e2d2b7b72210ed21b3d7ccd49c787d Mon Sep 17 00:00:00 2001 From: Kevin Chou Date: Tue, 9 Jan 2024 19:19:18 +0800 Subject: [PATCH 20/20] check waiting_worker_num in signal_task --- src/bthread/task_control.cpp | 19 +++++++++++++++++++ src/bthread/task_group.cpp | 9 ++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index 042b9799b9..0d337195c4 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -380,10 +380,29 @@ bool TaskControl::steal_task(bthread_t* tid, size_t* seed, size_t offset) { return stolen; } +bvar::Adder signal_task_skip_cnt; +bvar::PerSecond> signal_task_skip_ps( + "signal_task_skip_signal_task_per_second", + &signal_task_skip_cnt, 2); void TaskControl::signal_task(int num_task) { if (num_task <= 0) { + signal_task_skip_cnt << 1; return; } + if (ParkingLot::_waiting_worker_count.load(butil::memory_order_acquire) == 0) { + if (FLAGS_bthread_min_concurrency > 0 && + _concurrency.load(butil::memory_order_relaxed) < FLAGS_bthread_concurrency) { + // Add worker if all workers are busy and FLAGS_bthread_concurrency is + // not reached. + BAIDU_SCOPED_LOCK(g_task_control_mutex); + if (_concurrency.load(butil::memory_order_acquire) < FLAGS_bthread_concurrency) { + add_workers(1); + } + } + signal_task_skip_cnt << 1; + return; + } + // TODO(gejun): Current algorithm does not guarantee enough threads will // be created to match caller's requests. But in another side, there's also // many useless signalings according to current impl. 
Capping the concurrency diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 609f25ab1b..2ea9e29d68 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -670,15 +670,10 @@ void TaskGroup::destroy_self() { } } -bvar::Adder ready_to_run_skip_cnt; -bvar::PerSecond> ready_to_run_skip_ps( - "ready_to_run_skip_signal_task_per_second", - &ready_to_run_skip_cnt, 2); void TaskGroup::ready_to_run(bthread_t tid, bool nosignal) { push_rq(tid); - if (nosignal || ParkingLot::_waiting_worker_count == 0) { + if (nosignal) { ++_num_nosignal; - ready_to_run_skip_cnt << 1; } else { const int additional_signal = _num_nosignal; _num_nosignal = 0; @@ -701,7 +696,7 @@ void TaskGroup::ready_to_run_remote(bthread_t tid, bool nosignal) { LOG_EVERY_SECOND(ERROR) << "push_resume_rq fail"; ::usleep(1000); } - if (nosignal || ParkingLot::_waiting_worker_count == 0) { + if (nosignal) { ++_remote_num_nosignal; } else { const int additional_signal = _remote_num_nosignal;
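The final patch moves the "is anyone actually waiting?" decision from ready_to_run()/ready_to_run_remote() into signal_task(): if no worker is parked, a signal would be wasted, so the pool is grown instead (up to the concurrency cap) and the signal is skipped. A minimal sketch of that decision follows; the names are illustrative stand-ins (Pool for TaskControl, parked_workers for ParkingLot::_waiting_worker_count), and the FLAGS_bthread_min_concurrency gate plus the skip-counter bvar are omitted.

    #include <atomic>
    #include <cstddef>
    #include <mutex>

    struct Pool {
        std::atomic<int>    parked_workers{0};  // workers currently waiting in the parking lot
        std::atomic<size_t> concurrency{0};     // worker threads started so far
        size_t              max_concurrency{8}; // FLAGS_bthread_concurrency equivalent
        std::mutex          grow_mutex;

        void wake_one()   { /* unpark one worker via the parking lot (elided) */ }
        void add_worker() { /* start one more worker thread (elided) */ }

        void signal_task(int num_task) {
            if (num_task <= 0) {
                return;                                       // nothing to announce
            }
            if (parked_workers.load(std::memory_order_acquire) == 0) {
                // Every worker is busy: waking is pointless. Grow the pool instead,
                // re-checking the cap under a lock (double-checked growth).
                if (concurrency.load(std::memory_order_relaxed) < max_concurrency) {
                    std::lock_guard<std::mutex> lock(grow_mutex);
                    if (concurrency.load(std::memory_order_acquire) < max_concurrency) {
                        add_worker();
                        concurrency.fetch_add(1, std::memory_order_release);
                    }
                }
                return;
            }
            wake_one();  // at least one parked worker can pick the task up
        }
    };

With signal_task() owning this check, ready_to_run() goes back to looking only at the caller-supplied nosignal flag, as the last hunks above show.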