portable_atomic/imp/atomic128/x86_64.rs

// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
128-bit atomic implementation on x86_64.

This architecture provides the following 128-bit atomic instructions:

- CMPXCHG16B: CAS (CMPXCHG16B)
- VMOVDQA: load/store (Intel, AMD, or Zhaoxin CPU with AVX)

Note: On Miri and ThreadSanitizer, which do not support inline assembly, we don't use
this module and use intrinsics.rs instead.

Refs:
- x86 and amd64 instruction reference https://www.felixcloutier.com/x86
- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit

Generated asm:
- x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51
*/

// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm

include!("macros.rs");

#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
#[path = "../fallback/outline_atomics.rs"]
mod fallback;

#[cfg(not(portable_atomic_no_outline_atomics))]
#[cfg(not(target_env = "sgx"))]
#[cfg_attr(
    not(target_feature = "sse"),
    cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))
)]
#[path = "../detect/x86_64.rs"]
mod detect;

#[cfg(not(portable_atomic_no_asm))]
use core::arch::asm;
use core::sync::atomic::Ordering;

use crate::utils::{Pair, U128};

// Asserts that the function is called in the correct context.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            debug_assert!(detect::detect().has_cmpxchg16b());
        }
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}

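// Asm template modifier for pointer operands: with 64-bit pointers the address
// register is used as-is, while with target_pointer_width = "32" (the x32 ABI)
// the `:e` modifier selects the 32-bit form of the register (e.g. edi instead
// of rdi) to match the pointer size.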
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}

// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements against the currently enabled target features. In the first place,
// x86 assembly has no option for such a case, unlike Arm's .arch_extension,
// RISC-V's .option arch, PowerPC's .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend is added that checks for it in the future, or an option
// is added to the assembler to check for it.
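// Returns the value previously at `dst` and whether the exchange succeeded
// (the ZF result of CMPXCHG16B).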
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
    // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
    //
    // If the value at `dst` (destination operand) and rdx:rax are equal, the
    // 128-bit value in rcx:rbx is stored to `dst`, otherwise the value at
    // `dst` is loaded into rdx:rax.
    //
    // The ZF flag is set if the value at `dst` and rdx:rax are equal,
    // otherwise it is cleared. Other flags are unaffected.
    //
    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
    unsafe {
        // cmpxchg16b is always SeqCst.
        let r: u8;
        let old = U128 { whole: old };
        let new = U128 { whole: new };
        let (prev_lo, prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "sete cl",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) new.pair.lo => _,
                    in("rcx") new.pair.hi,
                    inout("rax") old.pair.lo => prev_lo,
                    inout("rdx") old.pair.hi => prev_hi,
                    in($rdi) dst,
                    lateout("cl") r,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test
        (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
    }
}

// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
//
// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
//
// Use cfg(target_feature = "sse") here -- SSE is included in the x86_64
// baseline and is always available, but the SSE target feature is disabled for
// use cases such as kernels and firmware that should not use vector registers.
// So, do not use vector registers unless the SSE target feature is enabled.
// See also https://github.com/rust-lang/rust/blob/1.84.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    //
    // atomic load by vmovdqa is always SeqCst.
    unsafe {
        let out: core::arch::x86_64::__m128i;
        asm!(
            concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
            src = in(reg) src,
            out = out(xmm_reg) out,
            options(nostack, preserves_flags),
        );
        core::mem::transmute(out)
    }
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
        match order {
            // Relaxed and Release stores are equivalent.
            Ordering::Relaxed | Ordering::Release => {
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    options(nostack, preserves_flags),
                );
            }
            Ordering::SeqCst => {
                let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                    // - https://github.com/taiki-e/portable-atomic/pull/156
                    // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
                    // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                    // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
                    // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    p = inout(reg) p.get() => _,
                    tmp = lateout(reg) _,
                    options(nostack, preserves_flags),
                );
            }
            _ => unreachable!(),
        }
    }
}

#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() { $vmovdqa } else { $cmpxchg16b }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() { $vmovdqa } else { $cmpxchg16b }
        }
    }};
}

#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `src` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See cmpxchg16b function for more.
    //
    // We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
    // lets us omit the store of the condition flags and avoid the use of xchg to handle rbx.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let (out_lo, out_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    "xor rbx, rbx", // zero rbx
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "mov rbx, {rbx_tmp}", // restore rbx
                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
                    rbx_tmp = out(reg) _,
                    in("rcx") 0_u64,
                    inout("rax") 0_u64 => out_lo,
                    inout("rdx") 0_u64 => out_hi,
                    in($rdi) src,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}

#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
    }
}

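// The success/failure orderings are ignored: lock cmpxchg16b is always SeqCst,
// and the fallback used when CMPXCHG16B is not available is SeqCst as well.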
#[inline]
unsafe fn atomic_compare_exchange(
    dst: *mut u128,
    old: u128,
    new: u128,
    _success: Ordering,
    _failure: Ordering,
) -> Result<u128, u128> {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and cfg guarantees that CMPXCHG16B is available at compile-time.
    let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
    let (prev, ok) = unsafe {
        ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
            if detect::detect().has_cmpxchg16b() {
                cmpxchg16b
            } else {
                // Use SeqCst because cmpxchg16b is always SeqCst.
                fallback::atomic_compare_exchange_seqcst
            }
        })
    };
    if ok { Ok(prev) } else { Err(prev) }
}

// cmpxchg16b is always strong.
use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See cmpxchg16b function for more.
    //
    // We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
    // lets us omit the store/compare of the condition flags and reduce the uses of xchg/mov to handle rbx.
    //
    // Do not use atomic_rmw_cas_3 because it would need an extra MOV to implement swap.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let val = U128 { whole: val };
        let (mut prev_lo, mut prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    // These are not single-copy atomic reads, but that is ok because the subsequent
                    // CAS will check for consistency.
                    //
                    // This is based on the code generated for the first load in DW RMWs by LLVM.
                    //
                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
                    // so we must use inline assembly to implement this.
                    // (i.e., byte-wise atomic based on the standard library's atomic types
                    // cannot be used here).
                    concat!("mov rax, qword ptr [", $rdi, "]"),
                    concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                    "2:",
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                        "jne 2b",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) val.pair.lo => _,
                    in("rcx") val.pair.hi,
                    out("rax") prev_lo,
                    out("rdx") prev_hi,
                    in($rdi) dst,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}

/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// lets us omit the store/compare of the condition flags and reduce the uses of xchg/mov to handle rbx.
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but that is ok because the subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// lets us omit the store of the condition flags and avoid the use of xchg to handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but that is ok because the subsequent
                            // CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}

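// The $op blocks below build the new value in rbx (low half) and rcx (high half)
// from the previous value in rax/rdx and (for atomic_rmw_cas_3) val in rsi/r8.
// 128-bit add: add the low halves, then adc adds the high halves plus the carry.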
atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
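// 128-bit sub: subtract the low halves, then sbb subtracts the high halves with the borrow.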
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
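// 128-bit nand: new = !(prev & val), computed half by half.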
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}

atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
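// 128-bit negation: neg the low half (sets CF unless it was zero), then
// compute 0 - high - CF with sbb for the high half.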
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}

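// For min/max, cmp/sbb performs a 128-bit comparison of val (rsi/r8) with the
// previous value (rax/rdx); the sbb result is discarded (rcx is reloaded from
// r8) and only the flags are kept. The cmov instructions then select either
// val or the previous value into rbx/rcx as the new value: cmovl/cmovge use
// the signed result, cmovb/cmovae the unsigned one.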
atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}

macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}

select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}

#[inline]
fn is_lock_free() -> bool {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    {
        // CMPXCHG16B is available at compile-time.
        true
    }
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    {
        detect::detect().has_cmpxchg16b()
    }
}
const IS_ALWAYS_LOCK_FREE: bool =
    cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}