Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / libcxx / docs / DesignDocs / AtomicDesign.rst
blob4b28ab2a8218a96ff41a26d68c028c657c1c0d23
2 ====================
3 ``<atomic>`` Design
4 ====================
6 There were originally 3 designs under consideration. They differ in where most
7 of the implementation work is done. The functionality exposed to the customer
8 should be identical (and conforming) for all three designs.
11 Design A: Minimal work for the library
12 ======================================
13 The compiler supplies all of the intrinsics as described below. This list of
14 intrinsics roughly parallels the requirements of the C and C++ atomics proposals.
15 The C and C++ library implementations simply drop through to these intrinsics.
16 For anything the platform does not support in hardware, the compiler
17 arranges for a (compiler-rt) library call to be made, which will do the job with
18 a mutex; in this case the memory ordering parameter is ignored (effectively
19 implementing ``memory_order_seq_cst``).
21 Ultimate efficiency is preferred over run-time error checking. Undefined
22 behavior is acceptable when the inputs do not conform as defined below.
24 .. code-block:: cpp
26     // In every intrinsic signature below, type* atomic_obj may be a pointer to a
27     // volatile-qualified type. Memory ordering values map to the following meanings:
28     //  memory_order_relaxed == 0
29     //  memory_order_consume == 1
30     //  memory_order_acquire == 2
31     //  memory_order_release == 3
32     //  memory_order_acq_rel == 4
33     //  memory_order_seq_cst == 5
35     // type must be trivially copyable
36     // type represents a "type argument"
37     bool __atomic_is_lock_free(type);
39     // type must be trivially copyable
40     // Behavior is defined for mem_ord = 0, 1, 2, 5
41     type __atomic_load(const type* atomic_obj, int mem_ord);
43     // type must be trivially copyable
44     // Behavior is defined for mem_ord = 0, 3, 5
45     void __atomic_store(type* atomic_obj, type desired, int mem_ord);
47     // type must be trivially copyable
48     // Behavior is defined for mem_ord = [0 ... 5]
49     type __atomic_exchange(type* atomic_obj, type desired, int mem_ord);
51     // type must be trivially copyable
52     // Behavior is defined for mem_success = [0 ... 5],
53     //   mem_failure <= mem_success
54     //   mem_failure != 3
55     //   mem_failure != 4
56     bool __atomic_compare_exchange_strong(type* atomic_obj,
57                                         type* expected, type desired,
58                                         int mem_success, int mem_failure);
60     // type must be trivially copyable
61     // Behavior is defined for mem_success = [0 ... 5],
62     //   mem_failure <= mem_success
63     //   mem_failure != 3
64     //   mem_failure != 4
65     bool __atomic_compare_exchange_weak(type* atomic_obj,
66                                         type* expected, type desired,
67                                         int mem_success, int mem_failure);
69     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
70     //      unsigned int, long, unsigned long, long long, unsigned long long,
71     //      char16_t, char32_t, wchar_t
72     // Behavior is defined for mem_ord = [0 ... 5]
73     type __atomic_fetch_add(type* atomic_obj, type operand, int mem_ord);
75     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
76     //      unsigned int, long, unsigned long, long long, unsigned long long,
77     //      char16_t, char32_t, wchar_t
78     // Behavior is defined for mem_ord = [0 ... 5]
79     type __atomic_fetch_sub(type* atomic_obj, type operand, int mem_ord);
81     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
82     //      unsigned int, long, unsigned long, long long, unsigned long long,
83     //      char16_t, char32_t, wchar_t
84     // Behavior is defined for mem_ord = [0 ... 5]
85     type __atomic_fetch_and(type* atomic_obj, type operand, int mem_ord);
87     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
88     //      unsigned int, long, unsigned long, long long, unsigned long long,
89     //      char16_t, char32_t, wchar_t
90     // Behavior is defined for mem_ord = [0 ... 5]
91     type __atomic_fetch_or(type* atomic_obj, type operand, int mem_ord);
93     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
94     //      unsigned int, long, unsigned long, long long, unsigned long long,
95     //      char16_t, char32_t, wchar_t
96     // Behavior is defined for mem_ord = [0 ... 5]
97     type __atomic_fetch_xor(type* atomic_obj, type operand, int mem_ord);
99     // Behavior is defined for mem_ord = [0 ... 5]
100     void* __atomic_fetch_add(void** atomic_obj, ptrdiff_t operand, int mem_ord);
101     void* __atomic_fetch_sub(void** atomic_obj, ptrdiff_t operand, int mem_ord);
103     // Behavior is defined for mem_ord = [0 ... 5]
104     void __atomic_thread_fence(int mem_ord);
105     void __atomic_signal_fence(int mem_ord);
107 If desired, the intrinsics taking a single ``mem_ord`` parameter can default
108 this argument to 5 (``memory_order_seq_cst``).
110 If desired, the intrinsics taking two ordering parameters can default ``mem_success``
111 to 5 (``memory_order_seq_cst``), and ``mem_failure`` to ``translate_memory_order(mem_success)``, where
112 ``translate_memory_order(mem_success)`` is defined as:
114 .. code-block:: cpp
116     int translate_memory_order(int o) {
117         switch (o) {
118         case 4:
119             return 2;
120         case 3:
121             return 0;
122         }
123         return o;
124     }
126 Below are representative C++ implementations of all of the operations. Their
127 purpose is to document the desired semantics of each operation, assuming
128 ``memory_order_seq_cst``. This is essentially the code that will be called
129 if the front end calls out to compiler-rt.
131 .. code-block:: cpp
133     template <class T>
134     T __atomic_load(T const volatile* obj) {
135         unique_lock<mutex> _(some_mutex);
136         return *obj;
137     }
139     template <class T>
140     void __atomic_store(T volatile* obj, T desr) {
141         unique_lock<mutex> _(some_mutex);
142         *obj = desr;
143     }
145     template <class T>
146     T __atomic_exchange(T volatile* obj, T desr) {
147         unique_lock<mutex> _(some_mutex);
148         T r = *obj;
149         *obj = desr;
150         return r;
151     }
153     template <class T>
154     bool __atomic_compare_exchange_strong(T volatile* obj, T* exp, T desr) {
155         unique_lock<mutex> _(some_mutex);
156         if (std::memcmp(const_cast<T*>(obj), exp, sizeof(T)) == 0) // if (*obj == *exp)
157         {
158             std::memcpy(const_cast<T*>(obj), &desr, sizeof(T)); // *obj = desr;
159             return true;
160         }
161         std::memcpy(exp, const_cast<T*>(obj), sizeof(T)); // *exp = *obj;
162         return false;
163     }
165     // May spuriously return false (even if *obj == *exp)
166     template <class T>
167     bool __atomic_compare_exchange_weak(T volatile* obj, T* exp, T desr) {
168         unique_lock<mutex> _(some_mutex);
169         if (std::memcmp(const_cast<T*>(obj), exp, sizeof(T)) == 0) // if (*obj == *exp)
170         {
171             std::memcpy(const_cast<T*>(obj), &desr, sizeof(T)); // *obj = desr;
172             return true;
173         }
174         std::memcpy(exp, const_cast<T*>(obj), sizeof(T)); // *exp = *obj;
175         return false;
176     }
178     template <class T>
179     T __atomic_fetch_add(T volatile* obj, T operand) {
180         unique_lock<mutex> _(some_mutex);
181         T r = *obj;
182         *obj += operand;
183         return r;
184     }
186     template <class T>
187     T __atomic_fetch_sub(T volatile* obj, T operand) {
188         unique_lock<mutex> _(some_mutex);
189         T r = *obj;
190         *obj -= operand;
191         return r;
192     }
194     template <class T>
195     T __atomic_fetch_and(T volatile* obj, T operand) {
196         unique_lock<mutex> _(some_mutex);
197         T r = *obj;
198         *obj &= operand;
199         return r;
200     }
202     template <class T>
203     T __atomic_fetch_or(T volatile* obj, T operand) {
204         unique_lock<mutex> _(some_mutex);
205         T r = *obj;
206         *obj |= operand;
207         return r;
208     }
210     template <class T>
211     T __atomic_fetch_xor(T volatile* obj, T operand) {
212         unique_lock<mutex> _(some_mutex);
213         T r = *obj;
214         *obj ^= operand;
215         return r;
216     }
218     void* __atomic_fetch_add(void* volatile* obj, ptrdiff_t operand) {
219         unique_lock<mutex> _(some_mutex);
220         void* r = *obj;
221         (char*&)(*obj) += operand;
222         return r;
223     }
225     void* __atomic_fetch_sub(void* volatile* obj, ptrdiff_t operand) {
226         unique_lock<mutex> _(some_mutex);
227         void* r = *obj;
228         (char*&)(*obj) -= operand;
229         return r;
230     }
232     void __atomic_thread_fence() {
233         unique_lock<mutex> _(some_mutex);
234     }
236     void __atomic_signal_fence() {
237         unique_lock<mutex> _(some_mutex);
238     }
241 Design B: Something in between
242 ==============================
243 This is a variation of Design A which puts the burden on the library to arrange
244 for the correct manipulation of the run-time memory ordering arguments, and only
245 calls the compiler for well-defined memory orderings. I think of this design as
246 the worst of A and C, instead of the best of A and C. But I offer it as an
247 option in the spirit of completeness.
249 .. code-block:: cpp
251     // type must be trivially copyable
252     bool __atomic_is_lock_free(const type* atomic_obj);
254     // type must be trivially copyable
255     type __atomic_load_relaxed(const volatile type* atomic_obj);
256     type __atomic_load_consume(const volatile type* atomic_obj);
257     type __atomic_load_acquire(const volatile type* atomic_obj);
258     type __atomic_load_seq_cst(const volatile type* atomic_obj);
260     // type must be trivially copyable
261     type __atomic_store_relaxed(volatile type* atomic_obj, type desired);
262     type __atomic_store_release(volatile type* atomic_obj, type desired);
263     type __atomic_store_seq_cst(volatile type* atomic_obj, type desired);
265     // type must be trivially copyable
266     type __atomic_exchange_relaxed(volatile type* atomic_obj, type desired);
267     type __atomic_exchange_consume(volatile type* atomic_obj, type desired);
268     type __atomic_exchange_acquire(volatile type* atomic_obj, type desired);
269     type __atomic_exchange_release(volatile type* atomic_obj, type desired);
270     type __atomic_exchange_acq_rel(volatile type* atomic_obj, type desired);
271     type __atomic_exchange_seq_cst(volatile type* atomic_obj, type desired);
273     // type must be trivially copyable
274     bool __atomic_compare_exchange_strong_relaxed_relaxed(volatile type* atomic_obj,
275                                                         type* expected,
276                                                         type desired);
277     bool __atomic_compare_exchange_strong_consume_relaxed(volatile type* atomic_obj,
278                                                         type* expected,
279                                                         type desired);
280     bool __atomic_compare_exchange_strong_consume_consume(volatile type* atomic_obj,
281                                                         type* expected,
282                                                         type desired);
283     bool __atomic_compare_exchange_strong_acquire_relaxed(volatile type* atomic_obj,
284                                                         type* expected,
285                                                         type desired);
286     bool __atomic_compare_exchange_strong_acquire_consume(volatile type* atomic_obj,
287                                                         type* expected,
288                                                         type desired);
289     bool __atomic_compare_exchange_strong_acquire_acquire(volatile type* atomic_obj,
290                                                         type* expected,
291                                                         type desired);
292     bool __atomic_compare_exchange_strong_release_relaxed(volatile type* atomic_obj,
293                                                         type* expected,
294                                                         type desired);
295     bool __atomic_compare_exchange_strong_release_consume(volatile type* atomic_obj,
296                                                         type* expected,
297                                                         type desired);
298     bool __atomic_compare_exchange_strong_release_acquire(volatile type* atomic_obj,
299                                                         type* expected,
300                                                         type desired);
301     bool __atomic_compare_exchange_strong_acq_rel_relaxed(volatile type* atomic_obj,
302                                                         type* expected,
303                                                         type desired);
304     bool __atomic_compare_exchange_strong_acq_rel_consume(volatile type* atomic_obj,
305                                                         type* expected,
306                                                         type desired);
307     bool __atomic_compare_exchange_strong_acq_rel_acquire(volatile type* atomic_obj,
308                                                         type* expected,
309                                                         type desired);
310     bool __atomic_compare_exchange_strong_seq_cst_relaxed(volatile type* atomic_obj,
311                                                         type* expected,
312                                                         type desired);
313     bool __atomic_compare_exchange_strong_seq_cst_consume(volatile type* atomic_obj,
314                                                         type* expected,
315                                                         type desired);
316     bool __atomic_compare_exchange_strong_seq_cst_acquire(volatile type* atomic_obj,
317                                                         type* expected,
318                                                         type desired);
319     bool __atomic_compare_exchange_strong_seq_cst_seq_cst(volatile type* atomic_obj,
320                                                         type* expected,
321                                                         type desired);
323     // type must be trivially copyable
324     bool __atomic_compare_exchange_weak_relaxed_relaxed(volatile type* atomic_obj,
325                                                         type* expected,
326                                                         type desired);
327     bool __atomic_compare_exchange_weak_consume_relaxed(volatile type* atomic_obj,
328                                                         type* expected,
329                                                         type desired);
330     bool __atomic_compare_exchange_weak_consume_consume(volatile type* atomic_obj,
331                                                         type* expected,
332                                                         type desired);
333     bool __atomic_compare_exchange_weak_acquire_relaxed(volatile type* atomic_obj,
334                                                         type* expected,
335                                                         type desired);
336     bool __atomic_compare_exchange_weak_acquire_consume(volatile type* atomic_obj,
337                                                         type* expected,
338                                                         type desired);
339     bool __atomic_compare_exchange_weak_acquire_acquire(volatile type* atomic_obj,
340                                                         type* expected,
341                                                         type desired);
342     bool __atomic_compare_exchange_weak_release_relaxed(volatile type* atomic_obj,
343                                                         type* expected,
344                                                         type desired);
345     bool __atomic_compare_exchange_weak_release_consume(volatile type* atomic_obj,
346                                                         type* expected,
347                                                         type desired);
348     bool __atomic_compare_exchange_weak_release_acquire(volatile type* atomic_obj,
349                                                         type* expected,
350                                                         type desired);
351     bool __atomic_compare_exchange_weak_acq_rel_relaxed(volatile type* atomic_obj,
352                                                         type* expected,
353                                                         type desired);
354     bool __atomic_compare_exchange_weak_acq_rel_consume(volatile type* atomic_obj,
355                                                         type* expected,
356                                                         type desired);
357     bool __atomic_compare_exchange_weak_acq_rel_acquire(volatile type* atomic_obj,
358                                                         type* expected,
359                                                         type desired);
360     bool __atomic_compare_exchange_weak_seq_cst_relaxed(volatile type* atomic_obj,
361                                                         type* expected,
362                                                         type desired);
363     bool __atomic_compare_exchange_weak_seq_cst_consume(volatile type* atomic_obj,
364                                                         type* expected,
365                                                         type desired);
366     bool __atomic_compare_exchange_weak_seq_cst_acquire(volatile type* atomic_obj,
367                                                         type* expected,
368                                                         type desired);
369     bool __atomic_compare_exchange_weak_seq_cst_seq_cst(volatile type* atomic_obj,
370                                                         type* expected,
371                                                         type desired);
373     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
374     //      unsigned int, long, unsigned long, long long, unsigned long long,
375     //      char16_t, char32_t, wchar_t
376     type __atomic_fetch_add_relaxed(volatile type* atomic_obj, type operand);
377     type __atomic_fetch_add_consume(volatile type* atomic_obj, type operand);
378     type __atomic_fetch_add_acquire(volatile type* atomic_obj, type operand);
379     type __atomic_fetch_add_release(volatile type* atomic_obj, type operand);
380     type __atomic_fetch_add_acq_rel(volatile type* atomic_obj, type operand);
381     type __atomic_fetch_add_seq_cst(volatile type* atomic_obj, type operand);
383     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
384     //      unsigned int, long, unsigned long, long long, unsigned long long,
385     //      char16_t, char32_t, wchar_t
386     type __atomic_fetch_sub_relaxed(volatile type* atomic_obj, type operand);
387     type __atomic_fetch_sub_consume(volatile type* atomic_obj, type operand);
388     type __atomic_fetch_sub_acquire(volatile type* atomic_obj, type operand);
389     type __atomic_fetch_sub_release(volatile type* atomic_obj, type operand);
390     type __atomic_fetch_sub_acq_rel(volatile type* atomic_obj, type operand);
391     type __atomic_fetch_sub_seq_cst(volatile type* atomic_obj, type operand);
393     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
394     //      unsigned int, long, unsigned long, long long, unsigned long long,
395     //      char16_t, char32_t, wchar_t
396     type __atomic_fetch_and_relaxed(volatile type* atomic_obj, type operand);
397     type __atomic_fetch_and_consume(volatile type* atomic_obj, type operand);
398     type __atomic_fetch_and_acquire(volatile type* atomic_obj, type operand);
399     type __atomic_fetch_and_release(volatile type* atomic_obj, type operand);
400     type __atomic_fetch_and_acq_rel(volatile type* atomic_obj, type operand);
401     type __atomic_fetch_and_seq_cst(volatile type* atomic_obj, type operand);
403     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
404     //      unsigned int, long, unsigned long, long long, unsigned long long,
405     //      char16_t, char32_t, wchar_t
406     type __atomic_fetch_or_relaxed(volatile type* atomic_obj, type operand);
407     type __atomic_fetch_or_consume(volatile type* atomic_obj, type operand);
408     type __atomic_fetch_or_acquire(volatile type* atomic_obj, type operand);
409     type __atomic_fetch_or_release(volatile type* atomic_obj, type operand);
410     type __atomic_fetch_or_acq_rel(volatile type* atomic_obj, type operand);
411     type __atomic_fetch_or_seq_cst(volatile type* atomic_obj, type operand);
413     // type is one of: char, signed char, unsigned char, short, unsigned short, int,
414     //      unsigned int, long, unsigned long, long long, unsigned long long,
415     //      char16_t, char32_t, wchar_t
416     type __atomic_fetch_xor_relaxed(volatile type* atomic_obj, type operand);
417     type __atomic_fetch_xor_consume(volatile type* atomic_obj, type operand);
418     type __atomic_fetch_xor_acquire(volatile type* atomic_obj, type operand);
419     type __atomic_fetch_xor_release(volatile type* atomic_obj, type operand);
420     type __atomic_fetch_xor_acq_rel(volatile type* atomic_obj, type operand);
421     type __atomic_fetch_xor_seq_cst(volatile type* atomic_obj, type operand);
423     void* __atomic_fetch_add_relaxed(void* volatile* atomic_obj, ptrdiff_t operand);
424     void* __atomic_fetch_add_consume(void* volatile* atomic_obj, ptrdiff_t operand);
425     void* __atomic_fetch_add_acquire(void* volatile* atomic_obj, ptrdiff_t operand);
426     void* __atomic_fetch_add_release(void* volatile* atomic_obj, ptrdiff_t operand);
427     void* __atomic_fetch_add_acq_rel(void* volatile* atomic_obj, ptrdiff_t operand);
428     void* __atomic_fetch_add_seq_cst(void* volatile* atomic_obj, ptrdiff_t operand);
430     void* __atomic_fetch_sub_relaxed(void* volatile* atomic_obj, ptrdiff_t operand);
431     void* __atomic_fetch_sub_consume(void* volatile* atomic_obj, ptrdiff_t operand);
432     void* __atomic_fetch_sub_acquire(void* volatile* atomic_obj, ptrdiff_t operand);
433     void* __atomic_fetch_sub_release(void* volatile* atomic_obj, ptrdiff_t operand);
434     void* __atomic_fetch_sub_acq_rel(void* volatile* atomic_obj, ptrdiff_t operand);
435     void* __atomic_fetch_sub_seq_cst(void* volatile* atomic_obj, ptrdiff_t operand);
437     void __atomic_thread_fence_relaxed();
438     void __atomic_thread_fence_consume();
439     void __atomic_thread_fence_acquire();
440     void __atomic_thread_fence_release();
441     void __atomic_thread_fence_acq_rel();
442     void __atomic_thread_fence_seq_cst();
444     void __atomic_signal_fence_relaxed();
445     void __atomic_signal_fence_consume();
446     void __atomic_signal_fence_acquire();
447     void __atomic_signal_fence_release();
448     void __atomic_signal_fence_acq_rel();
449     void __atomic_signal_fence_seq_cst();
451 Design C: Minimal work for the front end
452 ========================================
453 The ``<atomic>`` header is one of the most closely coupled headers to the compiler.
454 Ideally when you invoke any function from ``<atomic>``, it should result in highly
455 optimized assembly being inserted directly into your application -- assembly that
456 is not otherwise representable by higher level C or C++ expressions. The design of
457 the libc++ ``<atomic>`` header started with this goal in mind. A secondary, but
458 still very important, goal is that the compiler should have to do minimal work to
459 facilitate the implementation of ``<atomic>``. Without this second goal,
460 practically speaking, the libc++ ``<atomic>`` header would be doomed to be a
461 barely supported, second-class citizen on almost every platform.
463 Goals:
465 - Optimal code generation for atomic operations
466 - Minimal effort for the compiler to achieve the first goal (optimal code generation) on any given platform
467 - Conformance to the C++0X draft standard
469 The purpose of this document is to inform compiler writers what they need to do
470 to enable a high performance libc++ ``<atomic>`` with minimal effort.
472 The minimal work that must be done for a conforming ``<atomic>``
473 ----------------------------------------------------------------
474 The only "atomic" operations that must actually be lock-free in
475 ``<atomic>`` are represented by the following compiler intrinsics:
477 .. code-block:: cpp
479     __atomic_flag__ __atomic_exchange_seq_cst(__atomic_flag__ volatile* obj, __atomic_flag__ desr) {
480         unique_lock<mutex> _(some_mutex);
481         __atomic_flag__ result = *obj;
482         *obj = desr;
483         return result;
484     }
486     void __atomic_store_seq_cst(__atomic_flag__ volatile* obj, __atomic_flag__ desr) {
487         unique_lock<mutex> _(some_mutex);
488         *obj = desr;
489     }
491 Where:
493 - If ``__has_feature(__atomic_flag)`` evaluates to 1 in the preprocessor then
494   the compiler must define ``__atomic_flag__`` (e.g. as a typedef to ``int``).
495 - If ``__has_feature(__atomic_flag)`` evaluates to 0 in the preprocessor then
496   the library defines ``__atomic_flag__`` as a typedef to ``bool``.
497 - To communicate that the above intrinsics are available, the compiler must
498   arrange for ``__has_feature`` to return 1 when fed the intrinsic name
499   appended with an '_' and the mangled type name of ``__atomic_flag__``.
501 For example if ``__atomic_flag__`` is ``unsigned int``:
503 .. code-block:: cpp
505     // __has_feature(__atomic_flag) == 1
506     // __has_feature(__atomic_exchange_seq_cst_j) == 1
507     // __has_feature(__atomic_store_seq_cst_j) == 1
509     typedef unsigned int __atomic_flag__;
511     unsigned int __atomic_exchange_seq_cst(unsigned int volatile*, unsigned int) {
512         // ...
513     }
515     void __atomic_store_seq_cst(unsigned int volatile*, unsigned int) {
516         // ...
517     }
519 That's it! Compiler writers do the above and you've got a fully conforming
520 (though with sub-par performance) ``<atomic>`` header!
523 Recommended work for a higher performance ``<atomic>``
524 ------------------------------------------------------
525 It would be good if the above intrinsics worked with all integral types plus
526 ``void*``. Because this may not be possible to do in a lock-free manner for
527 all integral types on all platforms, a compiler must communicate each type that
528 an intrinsic works with. For example, if ``__atomic_exchange_seq_cst`` works
529 for all types except for ``long long`` and ``unsigned long long`` then:
531 .. code-block:: cpp
533     __has_feature(__atomic_exchange_seq_cst_b) == 1  // bool
534     __has_feature(__atomic_exchange_seq_cst_c) == 1  // char
535     __has_feature(__atomic_exchange_seq_cst_a) == 1  // signed char
536     __has_feature(__atomic_exchange_seq_cst_h) == 1  // unsigned char
537     __has_feature(__atomic_exchange_seq_cst_Ds) == 1 // char16_t
538     __has_feature(__atomic_exchange_seq_cst_Di) == 1 // char32_t
539     __has_feature(__atomic_exchange_seq_cst_w) == 1  // wchar_t
540     __has_feature(__atomic_exchange_seq_cst_s) == 1  // short
541     __has_feature(__atomic_exchange_seq_cst_t) == 1  // unsigned short
542     __has_feature(__atomic_exchange_seq_cst_i) == 1  // int
543     __has_feature(__atomic_exchange_seq_cst_j) == 1  // unsigned int
544     __has_feature(__atomic_exchange_seq_cst_l) == 1  // long
545     __has_feature(__atomic_exchange_seq_cst_m) == 1  // unsigned long
546     __has_feature(__atomic_exchange_seq_cst_Pv) == 1 // void*
548 Note that only the ``__has_feature`` flag is decorated with the argument
549 type. The name of the compiler intrinsic is not decorated, but instead works
550 like a C++ overloaded function.
552 Additionally, there are other intrinsics besides ``__atomic_exchange_seq_cst``
553 and ``__atomic_store_seq_cst``. They are optional. But if the compiler can
554 generate faster code than provided by the library, then clients will benefit
555 from the compiler writer's expertise and knowledge of the targeted platform.
557 Below is the complete list of *sequentially consistent* intrinsics, and
558 their library implementations. Template syntax is used to indicate the desired
559 overloading for integral and ``void*`` types. The template does not represent a
560 requirement that the intrinsic operate on **any** type!
562 .. code-block:: cpp
564     // T is one of:
565     // bool, char, signed char, unsigned char, short, unsigned short,
566     // int, unsigned int, long, unsigned long,
567     // long long, unsigned long long, char16_t, char32_t, wchar_t, void*
569     template <class T>
570     T __atomic_load_seq_cst(T const volatile* obj) {
571         unique_lock<mutex> _(some_mutex);
572         return *obj;
573     }
575     template <class T>
576     void __atomic_store_seq_cst(T volatile* obj, T desr) {
577         unique_lock<mutex> _(some_mutex);
578         *obj = desr;
579     }
581     template <class T>
582     T __atomic_exchange_seq_cst(T volatile* obj, T desr) {
583         unique_lock<mutex> _(some_mutex);
584         T r = *obj;
585         *obj = desr;
586         return r;
587     }
589     template <class T>
590     bool __atomic_compare_exchange_strong_seq_cst_seq_cst(T volatile* obj, T* exp, T desr) {
591         unique_lock<mutex> _(some_mutex);
592         if (std::memcmp(const_cast<T*>(obj), exp, sizeof(T)) == 0) {
593             std::memcpy(const_cast<T*>(obj), &desr, sizeof(T));
594             return true;
595         }
596         std::memcpy(exp, const_cast<T*>(obj), sizeof(T));
597         return false;
598     }
600     template <class T>
601     bool __atomic_compare_exchange_weak_seq_cst_seq_cst(T volatile* obj, T* exp, T desr) {
602         unique_lock<mutex> _(some_mutex);
603         if (std::memcmp(const_cast<T*>(obj), exp, sizeof(T)) == 0)
604         {
605             std::memcpy(const_cast<T*>(obj), &desr, sizeof(T));
606             return true;
607         }
608         std::memcpy(exp, const_cast<T*>(obj), sizeof(T));
609         return false;
610     }
612     // T is one of:
613     // char, signed char, unsigned char, short, unsigned short,
614     // int, unsigned int, long, unsigned long,
615     // long long, unsigned long long, char16_t, char32_t, wchar_t
617     template <class T>
618     T __atomic_fetch_add_seq_cst(T volatile* obj, T operand) {
619         unique_lock<mutex> _(some_mutex);
620         T r = *obj;
621         *obj += operand;
622         return r;
623     }
625     template <class T>
626     T __atomic_fetch_sub_seq_cst(T volatile* obj, T operand) {
627         unique_lock<mutex> _(some_mutex);
628         T r = *obj;
629         *obj -= operand;
630         return r;
631     }
633     template <class T>
634     T __atomic_fetch_and_seq_cst(T volatile* obj, T operand) {
635         unique_lock<mutex> _(some_mutex);
636         T r = *obj;
637         *obj &= operand;
638         return r;
639     }
641     template <class T>
642     T __atomic_fetch_or_seq_cst(T volatile* obj, T operand) {
643         unique_lock<mutex> guard(some_mutex);  // held until the function returns
644         T previous = *obj;  // value before the bitwise OR
645         *obj |= operand;
646         return previous;
647     }
649     template <class T>
650     T __atomic_fetch_xor_seq_cst(T volatile* obj, T operand) {
651         unique_lock<mutex> guard(some_mutex);  // held until the function returns
652         T previous = *obj;  // value before the bitwise XOR
653         *obj ^= operand;
654         return previous;
655     }
657     void* __atomic_fetch_add_seq_cst(void* volatile* obj, ptrdiff_t operand) {
658         unique_lock<mutex> guard(some_mutex);  // held until the function returns
659         void* previous = *obj;  // pointer value before the adjustment
660         (char*&)(*obj) += operand;  // viewed as char* so the pointer advances by operand bytes
661         return previous;
662     }
664     void* __atomic_fetch_sub_seq_cst(void* volatile* obj, ptrdiff_t operand) {
665         unique_lock<mutex> guard(some_mutex);  // held until the function returns
666         void* previous = *obj;  // pointer value before the adjustment
667         (char*&)(*obj) -= operand;  // viewed as char* so the pointer moves back by operand bytes
668         return previous;
669     }
671     void __atomic_thread_fence_seq_cst() {
672         unique_lock<mutex> guard(some_mutex);  // locks some_mutex and releases it at scope exit; no other work
673     }
675     void __atomic_signal_fence_seq_cst() {
676         unique_lock<mutex> guard(some_mutex);  // locks some_mutex and releases it at scope exit; no other work
677     }
679 One should consult the (currently draft) `C++ Standard <https://wg21.link/n3126>`_
680 for the details of the definitions for these operations. For example,
681 ``__atomic_compare_exchange_weak_seq_cst_seq_cst`` is allowed to fail
682 spuriously while ``__atomic_compare_exchange_strong_seq_cst_seq_cst`` is not.
684 If on your platform the lock-free definition of ``__atomic_compare_exchange_weak_seq_cst_seq_cst``
685 would be the same as ``__atomic_compare_exchange_strong_seq_cst_seq_cst``, you may omit the
686 ``__atomic_compare_exchange_weak_seq_cst_seq_cst`` intrinsic without a performance cost. The
687 library will prefer your implementation of ``__atomic_compare_exchange_strong_seq_cst_seq_cst``
688 over its own definition for implementing ``__atomic_compare_exchange_weak_seq_cst_seq_cst``.
689 That is, the library will arrange for ``__atomic_compare_exchange_weak_seq_cst_seq_cst`` to call
690 ``__atomic_compare_exchange_strong_seq_cst_seq_cst`` if you supply an intrinsic for the strong
691 version but not the weak.
693 Taking advantage of weaker memory synchronization
694 -------------------------------------------------
695 So far, all of the intrinsics presented require a **sequentially consistent** memory ordering.
696 That is, no loads or stores can move across the operation (just as if the library had locked
697 that internal mutex). But ``<atomic>`` supports weaker memory ordering operations. In all,
698 there are six memory orderings (listed here from strongest to weakest):
700 .. code-block:: cpp
702     memory_order_seq_cst
703     memory_order_acq_rel
704     memory_order_release
705     memory_order_acquire
706     memory_order_consume
707     memory_order_relaxed
709 (See the `C++ Standard <https://wg21.link/n3126>`_ for the detailed definitions of each of these orderings).
711 On some platforms, the compiler vendor can offer some or even all of the above
712 intrinsics at one or more weaker levels of memory synchronization. For example,
713 this might avoid issuing an ``mfence`` instruction on x86.
715 If the compiler does not offer a given operation at a given memory ordering
716 level, the library will automatically attempt to call that operation at the next
717 stronger memory ordering. This continues up to ``seq_cst``, and if that doesn't
718 exist, then the library takes over and does the job with a ``mutex``. This
719 is a compile-time search and selection operation. At run time, the application
720 will only see the few inlined assembly instructions for the selected intrinsic.
722 The name of each intrinsic is suffixed with the 7-letter name of the memory
723 ordering it addresses. For example, a ``load`` with ``relaxed`` ordering is defined by:
725 .. code-block:: cpp
727     T __atomic_load_relaxed(const volatile T* obj);
729 And announced with:
731 .. code-block:: cpp
733     __has_feature(__atomic_load_relaxed_b) == 1  // bool
734     __has_feature(__atomic_load_relaxed_c) == 1  // char
735     __has_feature(__atomic_load_relaxed_a) == 1  // signed char
736     ...
738 The ``__atomic_compare_exchange_strong(weak)`` intrinsics are parameterized
739 on two memory orderings. The first ordering applies when the operation returns
740 ``true`` and the second ordering applies when the operation returns ``false``.
742 Not every memory ordering is appropriate for every operation. ``exchange``
743 and the ``fetch_XXX`` operations support all 6. But ``load`` only supports
744 ``relaxed``, ``consume``, ``acquire`` and ``seq_cst``. ``store`` only supports
745 ``relaxed``, ``release``, and ``seq_cst``. The ``compare_exchange`` operations
746 support the following 16 combinations out of the possible 36:
748 .. code-block:: cpp
750     relaxed_relaxed
751     consume_relaxed
752     consume_consume
753     acquire_relaxed
754     acquire_consume
755     acquire_acquire
756     release_relaxed
757     release_consume
758     release_acquire
759     acq_rel_relaxed
760     acq_rel_consume
761     acq_rel_acquire
762     seq_cst_relaxed
763     seq_cst_consume
764     seq_cst_acquire
765     seq_cst_seq_cst
767 Again, the compiler supplies intrinsics only for the strongest orderings where
768 it can make a difference. The library takes care of calling the weakest
769 supplied intrinsic that is at least as strong as what the customer asked for.
771 Note about ABI
772 ==============
773 With any design, the (back end) compiler writer should note that the decision to
774 implement lock-free operations on any given type (or not) is an ABI-binding decision.
775 One cannot change from treating a type as not lock-free to lock-free (or vice versa)
776 without breaking the ABI.
778 For example:
780 **TU1.cpp**:
782 .. code-block:: cpp
784     extern atomic<long long> A;  // declared here; the definition lives elsewhere
785     int foo() { return A.compare_exchange_strong(w, x); }  // bool result is converted to int
788 **TU2.cpp**:
790 .. code-block:: cpp
792     extern atomic<long long> A;  // declared here; the definition lives elsewhere
793     void bar() { A.compare_exchange_strong(y, z); }  // result discarded: a void function cannot return a value
795 If only **one** of these calls to ``compare_exchange_strong`` is implemented with
796 mutex-locked code, then the mutex provides no mutual exclusion against the call
797 implemented in a lock-free manner, so the two operations on ``A`` are no longer atomic with respect to each other.