; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}

; i8 cmpxchg through a generic pointer (no addrspace qualifier) with success
; ordering monotonic and failure ordering monotonic.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.cas.b32 retry loop on the enclosing 4-byte-aligned word and
; contains no fence instructions.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_monotonic_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB0_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB0_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB0_1;
; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
    ret i8 %new
}

; i8 cmpxchg through an addrspace(1) pointer with success ordering monotonic
; and failure ordering monotonic.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.global.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.global) and contains no fence instructions.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_monotonic_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB1_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB1_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB1_1;
; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
    ret i8 %new
}

; i8 cmpxchg through an addrspace(3) pointer with success ordering monotonic
; and failure ordering monotonic.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.shared.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.shared) and contains no fence instructions.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_monotonic_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB2_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB2_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB2_1;
; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
    ret i8 %new
}

; i8 cmpxchg through a generic pointer (no addrspace qualifier) with success
; ordering monotonic and failure ordering acquire.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.cas.b32 retry loop on the enclosing 4-byte-aligned word, with a
; fence.acquire.sys emitted at the loop exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_acquire_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB3_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB3_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB3_1;
; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
    ret i8 %new
}

; i8 cmpxchg through an addrspace(1) pointer with success ordering monotonic
; and failure ordering acquire.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.global.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.global), with a fence.acquire.sys emitted at the loop
; exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_acquire_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB4_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB4_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB4_1;
; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
    ret i8 %new
}

; i8 cmpxchg through an addrspace(3) pointer with success ordering monotonic
; and failure ordering acquire.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.shared.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.shared), with a fence.acquire.sys emitted at the loop
; exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_acquire_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB5_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB5_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB5_1;
; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
    ret i8 %new
}

; i8 cmpxchg through a generic pointer (no addrspace qualifier) with success
; ordering monotonic and failure ordering seq_cst.
; The expected PTX below emits fence.sc.sys in the entry block, performs the
; byte-wide operation as a masked atom.relaxed.cas.b32 retry loop on the
; enclosing 4-byte-aligned word, and emits fence.acquire.sys at the loop exit
; block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_seq_cst_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB6_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB6_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB6_1;
; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
    ret i8 %new
}

; i8 cmpxchg through an addrspace(1) pointer with success ordering monotonic
; and failure ordering seq_cst.
; The expected PTX below emits fence.sc.sys in the entry block, performs the
; byte-wide operation as a masked atom.relaxed.global.cas.b32 retry loop on the
; enclosing 4-byte-aligned word (initial load via ld.global), and emits
; fence.acquire.sys at the loop exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_seq_cst_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB7_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB7_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB7_1;
; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
    ret i8 %new
}

; i8 cmpxchg through an addrspace(3) pointer with success ordering monotonic
; and failure ordering seq_cst.
; The expected PTX below emits fence.sc.sys in the entry block, performs the
; byte-wide operation as a masked atom.relaxed.shared.cas.b32 retry loop on the
; enclosing 4-byte-aligned word (initial load via ld.shared), and emits
; fence.acquire.sys at the loop exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: monotonic_seq_cst_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB8_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB8_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB8_1;
; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
    ret i8 %new
}

; i8 cmpxchg through a generic pointer (no addrspace qualifier) with success
; ordering acquire and failure ordering monotonic.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.cas.b32 retry loop on the enclosing 4-byte-aligned word, with a
; fence.acquire.sys emitted at the loop exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_monotonic_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB9_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB9_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB9_1;
; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
    ret i8 %new
}

; i8 cmpxchg through an addrspace(1) pointer with success ordering acquire and
; failure ordering monotonic.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.global.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.global), with a fence.acquire.sys emitted at the loop
; exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_monotonic_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB10_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB10_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB10_1;
; SM90-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
    ret i8 %new
}

; i8 cmpxchg through an addrspace(3) pointer with success ordering acquire and
; failure ordering monotonic.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.shared.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.shared), with a fence.acquire.sys emitted at the loop
; exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_monotonic_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB11_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB11_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB11_1;
; SM90-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
    ret i8 %new
}

; i8 cmpxchg through a generic pointer (no addrspace qualifier) with success
; ordering acquire and failure ordering acquire.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.cas.b32 retry loop on the enclosing 4-byte-aligned word, with a
; fence.acquire.sys emitted at the loop exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_acquire_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB12_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB12_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB12_1;
; SM90-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
    ret i8 %new
}

; i8 cmpxchg through an addrspace(1) pointer with success ordering acquire and
; failure ordering acquire.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.global.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.global), with a fence.acquire.sys emitted at the loop
; exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_acquire_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB13_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB13_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB13_1;
; SM90-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
    ret i8 %new
}

; i8 cmpxchg through an addrspace(3) pointer with success ordering acquire and
; failure ordering acquire.
; The expected PTX below performs the byte-wide operation as a masked
; atom.relaxed.shared.cas.b32 retry loop on the enclosing 4-byte-aligned word
; (initial load via ld.shared), with a fence.acquire.sys emitted at the loop
; exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_acquire_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB14_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB14_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB14_1;
; SM90-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
    ret i8 %new
}

; i8 cmpxchg through a generic pointer (no addrspace qualifier) with success
; ordering acquire and failure ordering seq_cst.
; The expected PTX below emits fence.sc.sys in the entry block, performs the
; byte-wide operation as a masked atom.relaxed.cas.b32 retry loop on the
; enclosing 4-byte-aligned word, and emits fence.acquire.sys at the loop exit
; block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_seq_cst_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB15_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB15_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB15_1;
; SM90-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
    ret i8 %new
}

; i8 cmpxchg through an addrspace(1) pointer with success ordering acquire and
; failure ordering seq_cst.
; The expected PTX below emits fence.sc.sys in the entry block, performs the
; byte-wide operation as a masked atom.relaxed.global.cas.b32 retry loop on the
; enclosing 4-byte-aligned word (initial load via ld.global), and emits
; fence.acquire.sys at the loop exit block.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_seq_cst_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB16_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB16_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB16_1;
; SM90-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
    ret i8 %new
}

; i8 cmpxchg on a shared (addrspace(3)) pointer with success ordering acquire
; and failure ordering seq_cst. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.shared.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.sc.sys precedes the loop and
; a fence.acquire.sys follows it. The function returns %new, so the cmpxchg
; result %pairold is unused.
define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acquire_seq_cst_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB17_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB17_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB17_1;
; SM90-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
    ret i8 %new
}

; i8 cmpxchg on a generic pointer with success ordering release and failure
; ordering monotonic. The expected PTX expands the byte-wide operation into a
; retry loop around a 32-bit atom.relaxed.cas (no state-space qualifier) on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop; no fence is expected after it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_monotonic_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB18_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB18_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB18_1;
; SM90-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
    ret i8 %new
}

; i8 cmpxchg on a global (addrspace(1)) pointer with success ordering release
; and failure ordering monotonic. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.global.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop; no fence is expected after it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_monotonic_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB19_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB19_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB19_1;
; SM90-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
    ret i8 %new
}

; i8 cmpxchg on a shared (addrspace(3)) pointer with success ordering release
; and failure ordering monotonic. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.shared.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop; no fence is expected after it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_monotonic_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB20_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB20_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB20_1;
; SM90-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
    ret i8 %new
}

; i8 cmpxchg on a generic pointer with success ordering release and failure
; ordering acquire. The expected PTX expands the byte-wide operation into a
; retry loop around a 32-bit atom.relaxed.cas (no state-space qualifier) on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_acquire_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB21_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB21_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB21_1;
; SM90-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
    ret i8 %new
}

; i8 cmpxchg on a global (addrspace(1)) pointer with success ordering release
; and failure ordering acquire. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.global.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_acquire_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB22_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB22_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB22_1;
; SM90-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
    ret i8 %new
}

; i8 cmpxchg on a shared (addrspace(3)) pointer with success ordering release
; and failure ordering acquire. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.shared.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_acquire_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB23_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB23_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB23_1;
; SM90-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
    ret i8 %new
}

; i8 cmpxchg on a generic pointer with success ordering release and failure
; ordering seq_cst. The expected PTX expands the byte-wide operation into a
; retry loop around a 32-bit atom.relaxed.cas (no state-space qualifier) on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.sc.sys precedes the loop and
; a fence.acquire.sys follows it. The function returns %new, so the cmpxchg
; result %pairold is unused.
define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_seq_cst_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB24_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB24_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB24_1;
; SM90-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
    ret i8 %new
}

; i8 cmpxchg on a global (addrspace(1)) pointer with success ordering release
; and failure ordering seq_cst. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.global.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.sc.sys precedes the loop and
; a fence.acquire.sys follows it. The function returns %new, so the cmpxchg
; result %pairold is unused.
define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_seq_cst_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB25_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB25_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB25_1;
; SM90-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
    ret i8 %new
}

; i8 cmpxchg on a shared (addrspace(3)) pointer with success ordering release
; and failure ordering seq_cst. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.shared.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.sc.sys precedes the loop and
; a fence.acquire.sys follows it. The function returns %new, so the cmpxchg
; result %pairold is unused.
define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: release_seq_cst_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB26_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB26_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB26_1;
; SM90-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
    ret i8 %new
}

; i8 cmpxchg on a generic pointer with success ordering acq_rel and failure
; ordering monotonic. The expected PTX expands the byte-wide operation into a
; retry loop around a 32-bit atom.relaxed.cas (no state-space qualifier) on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_monotonic_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB27_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB27_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB27_1;
; SM90-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
    ret i8 %new
}

; i8 cmpxchg on a global (addrspace(1)) pointer with success ordering acq_rel
; and failure ordering monotonic. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.global.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_monotonic_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB28_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB28_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB28_1;
; SM90-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
    ret i8 %new
}

; i8 cmpxchg on a shared (addrspace(3)) pointer with success ordering acq_rel
; and failure ordering monotonic. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.shared.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_monotonic_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB29_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB29_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB29_1;
; SM90-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
    ret i8 %new
}

; i8 cmpxchg on a generic pointer with success ordering acq_rel and failure
; ordering acquire. The expected PTX expands the byte-wide operation into a
; retry loop around a 32-bit atom.relaxed.cas (no state-space qualifier) on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_acquire_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB30_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB30_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB30_1;
; SM90-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
    ret i8 %new
}

; i8 cmpxchg on a global (addrspace(1)) pointer with success ordering acq_rel
; and failure ordering acquire. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.global.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_acquire_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB31_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB31_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB31_1;
; SM90-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
    ret i8 %new
}

; i8 cmpxchg on a shared (addrspace(3)) pointer with success ordering acq_rel
; and failure ordering acquire. The expected PTX expands the byte-wide
; operation into a retry loop around a 32-bit atom.relaxed.shared.cas on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.release.sys precedes the
; loop and a fence.acquire.sys follows it. The function returns %new, so the
; cmpxchg result %pairold is unused.
define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_acquire_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB32_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB32_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB32_1;
; SM90-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
    ret i8 %new
}

; i8 cmpxchg on a generic pointer with success ordering acq_rel and failure
; ordering seq_cst. The expected PTX expands the byte-wide operation into a
; retry loop around a 32-bit atom.relaxed.cas (no state-space qualifier) on the
; containing 4-byte-aligned word (address masked with -4, byte selected by a
; shift of (addr & 3) * 8 and a 255 mask). A fence.sc.sys precedes the loop and
; a fence.acquire.sys follows it. The function returns %new, so the cmpxchg
; result %pairold is unused.
define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_seq_cst_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB33_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB33_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB33_1;
; SM90-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
    ret i8 %new
}

; i8 cmpxchg on an addrspace(1) pointer with success ordering acq_rel and
; failure ordering seq_cst. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .global forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_seq_cst_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB34_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB34_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB34_1;
; SM90-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
    ret i8 %new
}

; i8 cmpxchg on an addrspace(3) pointer with success ordering acq_rel and
; failure ordering seq_cst. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .shared forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: acq_rel_seq_cst_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB35_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB35_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB35_1;
; SM90-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
    ret i8 %new
}

; i8 cmpxchg through a generic pointer with success ordering seq_cst and
; failure ordering monotonic. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop on the 4-byte-aligned word containing the byte (the
; byte lane is isolated with a shifted 255 mask) and fence.acquire.sys after
; the loop. The cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_monotonic_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB36_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB36_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB36_1;
; SM90-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
    ret i8 %new
}

; i8 cmpxchg on an addrspace(1) pointer with success ordering seq_cst and
; failure ordering monotonic. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .global forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_monotonic_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB37_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB37_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB37_1;
; SM90-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
    ret i8 %new
}

; i8 cmpxchg on an addrspace(3) pointer with success ordering seq_cst and
; failure ordering monotonic. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .shared forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_monotonic_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB38_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB38_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB38_1;
; SM90-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
    ret i8 %new
}

; i8 cmpxchg through a generic pointer with success ordering seq_cst and
; failure ordering acquire. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop on the 4-byte-aligned word containing the byte (the
; byte lane is isolated with a shifted 255 mask) and fence.acquire.sys after
; the loop. The cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_acquire_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB39_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB39_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB39_1;
; SM90-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
    ret i8 %new
}

; i8 cmpxchg on an addrspace(1) pointer with success ordering seq_cst and
; failure ordering acquire. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .global forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_acquire_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB40_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB40_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB40_1;
; SM90-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
    ret i8 %new
}

; i8 cmpxchg on an addrspace(3) pointer with success ordering seq_cst and
; failure ordering acquire. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .shared forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_acquire_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB41_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB41_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB41_1;
; SM90-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
    ret i8 %new
}

; i8 cmpxchg through a generic pointer with seq_cst as both the success and
; the failure ordering. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop on the 4-byte-aligned word containing the byte (the
; byte lane is isolated with a shifted 255 mask) and fence.acquire.sys after
; the loop. The cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_seq_cst_i8_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB42_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB42_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB42_1;
; SM90-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
    ret i8 %new
}

; i8 cmpxchg on an addrspace(1) pointer with seq_cst as both the success and
; the failure ordering. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .global forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_seq_cst_i8_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB43_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB43_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB43_1;
; SM90-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
    ret i8 %new
}

; i8 cmpxchg on an addrspace(3) pointer with seq_cst as both the success and
; the failure ordering. The expected PTX places fence.sc.sys ahead of a
; relaxed 32-bit CAS loop that uses the .shared forms of ld and atom on the
; 4-byte-aligned containing word, and fence.acquire.sys after the loop. The
; cmpxchg result is unused; the function returns %new.
define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-LABEL: seq_cst_seq_cst_i8_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<21>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 255;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    and.b32 %r15, %r14, 255;
; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
; SM90-NEXT:    and.b32 %r20, %r16, %r2;
; SM90-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r17, %r20, %r3;
; SM90-NEXT:    or.b32 %r18, %r20, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r18;
; SM90-NEXT:    @%p1 bra $L__BB44_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB44_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r20, %r8;
; SM90-NEXT:    mov.b32 %r20, %r8;
; SM90-NEXT:    @%p2 bra $L__BB44_1;
; SM90-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
    ret i8 %new
}

; i16 cmpxchg through a generic pointer with monotonic as both the success
; and the failure ordering. The expected PTX is a relaxed 32-bit CAS loop on
; the 4-byte-aligned word containing the halfword (lane isolated with a
; shifted 65535 mask), with no fence before or after the loop. The cmpxchg
; result is unused; the function returns %new.
define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_monotonic_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB45_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB45_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB45_1;
; SM90-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
    ret i16 %new
}

; i16 cmpxchg on an addrspace(1) pointer with monotonic as both the success
; and the failure ordering. The expected PTX is a relaxed 32-bit CAS loop
; that uses the .global forms of ld and atom on the 4-byte-aligned containing
; word (lane isolated with a shifted 65535 mask), with no fence before or
; after the loop. The cmpxchg result is unused; the function returns %new.
define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_monotonic_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB46_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB46_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB46_1;
; SM90-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
    ret i16 %new
}

; i16 cmpxchg on an addrspace(3) pointer with monotonic as both the success
; and the failure ordering. The expected PTX is a relaxed 32-bit CAS loop
; that uses the .shared forms of ld and atom on the 4-byte-aligned containing
; word (lane isolated with a shifted 65535 mask), with no fence before or
; after the loop. The cmpxchg result is unused; the function returns %new.
define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_monotonic_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB47_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB47_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB47_1;
; SM90-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
    ret i16 %new
}

; i16 cmpxchg through a generic pointer with success ordering monotonic and
; failure ordering acquire. The expected PTX is a relaxed 32-bit CAS loop on
; the 4-byte-aligned word containing the halfword (lane isolated with a
; shifted 65535 mask), with no fence before the loop and fence.acquire.sys
; after it. The cmpxchg result is unused; the function returns %new.
define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_acquire_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB48_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB48_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB48_1;
; SM90-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
    ret i16 %new
}

; i16 cmpxchg on an addrspace(1) pointer with success ordering monotonic and
; failure ordering acquire. The expected PTX is a relaxed 32-bit CAS loop
; that uses the .global forms of ld and atom on the 4-byte-aligned containing
; word (lane isolated with a shifted 65535 mask), with no fence before the
; loop and fence.acquire.sys after it. The cmpxchg result is unused; the
; function returns %new.
define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_acquire_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB49_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB49_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB49_1;
; SM90-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
    ret i16 %new
}

; i16 cmpxchg on an addrspace(3) pointer with success ordering monotonic and
; failure ordering acquire. The expected PTX is a relaxed 32-bit CAS loop
; that uses the .shared forms of ld and atom on the 4-byte-aligned containing
; word (lane isolated with a shifted 65535 mask), with no fence before the
; loop and fence.acquire.sys after it. The cmpxchg result is unused; the
; function returns %new.
define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_acquire_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB50_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB50_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB50_1;
; SM90-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
    ret i16 %new
}

; i16 cmpxchg through a generic (default address space) pointer with success
; ordering monotonic and failure ordering seq_cst.
; Expected PTX, per the autogenerated checks below: fence.sc.sys is emitted
; before the address/mask setup, then a loop retries atom.relaxed.cas.b32 (no
; state-space qualifier) on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_seq_cst_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB51_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB51_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB51_1;
; SM90-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer (the "_global" variant) with
; success ordering monotonic and failure ordering seq_cst.
; Expected PTX, per the autogenerated checks below: fence.sc.sys is emitted
; before the address/mask setup, then a loop retries
; atom.relaxed.global.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_seq_cst_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB52_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB52_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB52_1;
; SM90-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer (the "_shared" variant) with
; success ordering monotonic and failure ordering seq_cst.
; Expected PTX, per the autogenerated checks below: fence.sc.sys is emitted
; before the address/mask setup, then a loop retries
; atom.relaxed.shared.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: monotonic_seq_cst_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB53_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB53_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB53_1;
; SM90-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
    ret i16 %new
}

; i16 cmpxchg through a generic (default address space) pointer with success
; ordering acquire and failure ordering monotonic.
; Expected PTX, per the autogenerated checks below: the pointer is masked with
; -4 to address the containing 32-bit word, the i16 operands are shifted into
; position by (addr & 3) * 8, and a loop retries atom.relaxed.cas.b32 (no
; state-space qualifier) until it succeeds or the bits outside the 16-bit
; field stop changing. One fence.acquire.sys follows the loop; none precedes it.
define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_monotonic_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB54_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB54_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB54_1;
; SM90-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer (the "_global" variant) with
; success ordering acquire and failure ordering monotonic.
; Expected PTX, per the autogenerated checks below: the pointer is masked with
; -4 to address the containing 32-bit word, the i16 operands are shifted into
; position by (addr & 3) * 8, and a loop retries atom.relaxed.global.cas.b32
; until it succeeds or the bits outside the 16-bit field stop changing.
; One fence.acquire.sys follows the loop; no fence precedes it.
define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_monotonic_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB55_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB55_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB55_1;
; SM90-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer (the "_shared" variant) with
; success ordering acquire and failure ordering monotonic.
; Expected PTX, per the autogenerated checks below: the pointer is masked with
; -4 to address the containing 32-bit word, the i16 operands are shifted into
; position by (addr & 3) * 8, and a loop retries atom.relaxed.shared.cas.b32
; until it succeeds or the bits outside the 16-bit field stop changing.
; One fence.acquire.sys follows the loop; no fence precedes it.
define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_monotonic_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB56_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB56_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB56_1;
; SM90-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
    ret i16 %new
}

; i16 cmpxchg through a generic (default address space) pointer with both
; success and failure ordering set to acquire.
; Expected PTX, per the autogenerated checks below: the pointer is masked with
; -4 to address the containing 32-bit word, the i16 operands are shifted into
; position by (addr & 3) * 8, and a loop retries atom.relaxed.cas.b32 (no
; state-space qualifier) until it succeeds or the bits outside the 16-bit
; field stop changing. One fence.acquire.sys follows the loop; none precedes it.
define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_acquire_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB57_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB57_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB57_1;
; SM90-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer (the "_global" variant) with
; both success and failure ordering set to acquire.
; Expected PTX, per the autogenerated checks below: the pointer is masked with
; -4 to address the containing 32-bit word, the i16 operands are shifted into
; position by (addr & 3) * 8, and a loop retries atom.relaxed.global.cas.b32
; until it succeeds or the bits outside the 16-bit field stop changing.
; One fence.acquire.sys follows the loop; no fence precedes it.
define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_acquire_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB58_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB58_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB58_1;
; SM90-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer (the "_shared" variant) with
; both success and failure ordering set to acquire.
; Expected PTX, per the autogenerated checks below: the pointer is masked with
; -4 to address the containing 32-bit word, the i16 operands are shifted into
; position by (addr & 3) * 8, and a loop retries atom.relaxed.shared.cas.b32
; until it succeeds or the bits outside the 16-bit field stop changing.
; One fence.acquire.sys follows the loop; no fence precedes it.
define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_acquire_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB59_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB59_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB59_1;
; SM90-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
    ret i16 %new
}

; i16 cmpxchg through a generic (default address space) pointer with success
; ordering acquire and failure ordering seq_cst.
; Expected PTX, per the autogenerated checks below: fence.sc.sys is emitted
; before the address/mask setup, then a loop retries atom.relaxed.cas.b32 (no
; state-space qualifier) on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_seq_cst_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB60_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB60_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB60_1;
; SM90-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer (the "_global" variant) with
; success ordering acquire and failure ordering seq_cst.
; Expected PTX, per the autogenerated checks below: fence.sc.sys is emitted
; before the address/mask setup, then a loop retries
; atom.relaxed.global.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_seq_cst_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB61_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB61_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB61_1;
; SM90-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer (the "_shared" variant) with
; success ordering acquire and failure ordering seq_cst.
; Expected PTX, per the autogenerated checks below: fence.sc.sys is emitted
; before the address/mask setup, then a loop retries
; atom.relaxed.shared.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acquire_seq_cst_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB62_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB62_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB62_1;
; SM90-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
    ret i16 %new
}

; i16 cmpxchg through a generic (default address space) pointer with success
; ordering release and failure ordering monotonic.
; Expected PTX, per the autogenerated checks below: fence.release.sys is
; emitted before the address/mask setup, then a loop retries
; atom.relaxed.cas.b32 (no state-space qualifier) on the 4-byte-aligned
; containing word until it succeeds or the bits outside the 16-bit field stop
; changing. No fence is expected after the loop.
define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_monotonic_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB63_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB63_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB63_1;
; SM90-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer (the "_global" variant) with
; success ordering release and failure ordering monotonic.
; Expected PTX, per the autogenerated checks below: fence.release.sys is
; emitted before the address/mask setup, then a loop retries
; atom.relaxed.global.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing.
; No fence is expected after the loop.
define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_monotonic_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB64_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB64_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB64_1;
; SM90-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer (the "_shared" variant) with
; success ordering release and failure ordering monotonic.
; Expected PTX, per the autogenerated checks below: fence.release.sys is
; emitted before the address/mask setup, then a loop retries
; atom.relaxed.shared.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing.
; No fence is expected after the loop.
define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_monotonic_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB65_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB65_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB65_1;
; SM90-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
    ret i16 %new
}

; i16 cmpxchg through a generic (default address space) pointer with success
; ordering release and failure ordering acquire.
; Expected PTX, per the autogenerated checks below: fence.release.sys is
; emitted before the address/mask setup, then a loop retries
; atom.relaxed.cas.b32 (no state-space qualifier) on the 4-byte-aligned
; containing word until it succeeds or the bits outside the 16-bit field stop
; changing, and fence.acquire.sys follows the loop.
define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_acquire_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB66_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB66_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB66_1;
; SM90-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer (the "_global" variant) with
; success ordering release and failure ordering acquire.
; Expected PTX, per the autogenerated checks below: fence.release.sys is
; emitted before the address/mask setup, then a loop retries
; atom.relaxed.global.cas.b32 on the 4-byte-aligned containing word until it
; succeeds or the bits outside the 16-bit field stop changing, and
; fence.acquire.sys follows the loop.
define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_acquire_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB67_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB67_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB67_1;
; SM90-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    ; The cmpxchg result (%pairold) is not used; the function returns %new.
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
    ret i16 %new
}

; i16 cmpxchg on a shared pointer (addrspace(3)); success ordering release,
; failure ordering acquire.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.shared.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_acquire_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB68_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB68_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB68_1;
; SM90-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
    ret i16 %new
}

; i16 cmpxchg on a generic pointer (no addrspace); success ordering release,
; failure ordering seq_cst.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_seq_cst_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB69_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB69_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB69_1;
; SM90-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
    ret i16 %new
}

; i16 cmpxchg on a global pointer (addrspace(1)); success ordering release,
; failure ordering seq_cst.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.global.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_seq_cst_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB70_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB70_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB70_1;
; SM90-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
    ret i16 %new
}

; i16 cmpxchg on a shared pointer (addrspace(3)); success ordering release,
; failure ordering seq_cst.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.shared.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: release_seq_cst_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB71_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB71_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB71_1;
; SM90-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
    ret i16 %new
}

; i16 cmpxchg on a generic pointer (no addrspace); success ordering acq_rel,
; failure ordering monotonic.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_monotonic_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB72_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB72_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB72_1;
; SM90-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
    ret i16 %new
}

; i16 cmpxchg on a global pointer (addrspace(1)); success ordering acq_rel,
; failure ordering monotonic.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.global.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_monotonic_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB73_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB73_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB73_1;
; SM90-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
    ret i16 %new
}

; i16 cmpxchg on a shared pointer (addrspace(3)); success ordering acq_rel,
; failure ordering monotonic.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.shared.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_monotonic_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB74_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB74_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB74_1;
; SM90-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
    ret i16 %new
}

; i16 cmpxchg on a generic pointer (no addrspace); success ordering acq_rel,
; failure ordering acquire.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_acquire_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB75_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB75_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB75_1;
; SM90-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
    ret i16 %new
}

; i16 cmpxchg on a global pointer (addrspace(1)); success ordering acq_rel,
; failure ordering acquire.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.global.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_acquire_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB76_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB76_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB76_1;
; SM90-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
    ret i16 %new
}

; i16 cmpxchg on a shared pointer (addrspace(3)); success ordering acq_rel,
; failure ordering acquire.
; Expected PTX: a leading fence.release.sys, then a loop around a 32-bit
; atom.relaxed.shared.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_acquire_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
; SM90-NEXT:    fence.release.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB77_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB77_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB77_1;
; SM90-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
    ret i16 %new
}

; i16 cmpxchg on a generic pointer (no addrspace); success ordering acq_rel,
; failure ordering seq_cst.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_seq_cst_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB78_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB78_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB78_1;
; SM90-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
    ret i16 %new
}

; i16 cmpxchg on a global pointer (addrspace(1)); success ordering acq_rel,
; failure ordering seq_cst.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.global.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_seq_cst_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB79_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB79_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB79_1;
; SM90-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
    ret i16 %new
}

; i16 cmpxchg on a shared pointer (addrspace(3)); success ordering acq_rel,
; failure ordering seq_cst.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.shared.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: acq_rel_seq_cst_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB80_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB80_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB80_1;
; SM90-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
    ret i16 %new
}

; i16 cmpxchg on a generic pointer (no addrspace); success ordering seq_cst,
; failure ordering monotonic.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_monotonic_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB81_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB81_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB81_1;
; SM90-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
    ret i16 %new
}

; i16 cmpxchg on a global pointer (addrspace(1)); success ordering seq_cst,
; failure ordering monotonic.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.global.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_monotonic_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB82_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB82_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB82_1;
; SM90-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
    ret i16 %new
}

; i16 cmpxchg on a shared pointer (addrspace(3)); success ordering seq_cst,
; failure ordering monotonic.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.shared.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_monotonic_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB83_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB83_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB83_1;
; SM90-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
    ret i16 %new
}

; i16 cmpxchg on a generic pointer (no addrspace); success ordering seq_cst,
; failure ordering acquire.
; Expected PTX: a leading fence.sc.sys, then a loop around a 32-bit
; atom.relaxed.cas on the 4-byte-aligned word holding the halfword
; (address masked with -4), and a trailing fence.acquire.sys.
; The cmpxchg result (%pairold) is unused; the function returns %new.
define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_acquire_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB84_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB84_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB84_1;
; SM90-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer with success ordering seq_cst
; and failure ordering acquire.
; Expected PTX is a partword expansion on the containing 4-byte word (address
; and-ed with -4, 16-bit lane selected with a shifted 65535 mask) whose loop
; retries atom.relaxed.global.cas.b32; the initial word load is ld.global.b32.
; The loop is preceded by fence.sc.sys and followed by fence.acquire.sys.
; The cmpxchg result %pairold is not used; the function returns %new.
define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_acquire_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB85_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB85_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB85_1;
; SM90-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer with success ordering seq_cst
; and failure ordering acquire.
; Expected PTX is a partword expansion on the containing 4-byte word (address
; and-ed with -4, 16-bit lane selected with a shifted 65535 mask) whose loop
; retries atom.relaxed.shared.cas.b32; the initial word load is ld.shared.b32.
; The loop is preceded by fence.sc.sys and followed by fence.acquire.sys.
; The cmpxchg result %pairold is not used; the function returns %new.
define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_acquire_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB86_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB86_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB86_1;
; SM90-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
    ret i16 %new
}

; i16 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering seq_cst and failure ordering seq_cst.
; Expected PTX is a partword expansion: the address is rounded down to its
; containing 4-byte word (and with -4), the 16-bit lane is selected with a
; shifted 65535 mask, and a loop retries a 32-bit atom.relaxed.cas.b32.
; The loop is preceded by fence.sc.sys and followed by fence.acquire.sys.
; The cmpxchg result %pairold is not used; the function returns %new.
define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_seq_cst_i16_generic(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB87_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB87_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB87_1;
; SM90-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
    ret i16 %new
}

; i16 cmpxchg through an addrspace(1) pointer with success ordering seq_cst
; and failure ordering seq_cst.
; Expected PTX is a partword expansion on the containing 4-byte word (address
; and-ed with -4, 16-bit lane selected with a shifted 65535 mask) whose loop
; retries atom.relaxed.global.cas.b32; the initial word load is ld.global.b32.
; The loop is preceded by fence.sc.sys and followed by fence.acquire.sys.
; The cmpxchg result %pairold is not used; the function returns %new.
define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_seq_cst_i16_global(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB88_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB88_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB88_1;
; SM90-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
    ret i16 %new
}

; i16 cmpxchg through an addrspace(3) pointer with success ordering seq_cst
; and failure ordering seq_cst.
; Expected PTX is a partword expansion on the containing 4-byte word (address
; and-ed with -4, 16-bit lane selected with a shifted 65535 mask) whose loop
; retries atom.relaxed.shared.cas.b32; the initial word load is ld.shared.b32.
; The loop is preceded by fence.sc.sys and followed by fence.acquire.sys.
; The cmpxchg result %pairold is not used; the function returns %new.
define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
; SM90-LABEL: seq_cst_seq_cst_i16_shared(
; SM90:       {
; SM90-NEXT:    .reg .pred %p<3>;
; SM90-NEXT:    .reg .b16 %rs<2>;
; SM90-NEXT:    .reg .b32 %r<20>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
; SM90-NEXT:    and.b32 %r11, %r10, 3;
; SM90-NEXT:    shl.b32 %r1, %r11, 3;
; SM90-NEXT:    mov.b32 %r12, 65535;
; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
; SM90-NEXT:    not.b32 %r2, %r13;
; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
; SM90-NEXT:    and.b32 %r19, %r15, %r2;
; SM90-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
; SM90-NEXT:    or.b32 %r16, %r19, %r3;
; SM90-NEXT:    or.b32 %r17, %r19, %r4;
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
; SM90-NEXT:    setp.eq.b32 %p1, %r7, %r17;
; SM90-NEXT:    @%p1 bra $L__BB89_3;
; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT:    // in Loop: Header=BB89_1 Depth=1
; SM90-NEXT:    and.b32 %r8, %r7, %r2;
; SM90-NEXT:    setp.ne.b32 %p2, %r19, %r8;
; SM90-NEXT:    mov.b32 %r19, %r8;
; SM90-NEXT:    @%p2 bra $L__BB89_1;
; SM90-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
; SM90-NEXT:    fence.acquire.sys;
; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
    ret i16 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering monotonic and failure ordering monotonic.
; Expected PTX: a single atom.relaxed.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_monotonic_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
; SM90-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering monotonic
; and failure ordering monotonic.
; Expected PTX: a single atom.relaxed.global.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_monotonic_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
; SM90-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering monotonic
; and failure ordering monotonic.
; Expected PTX: a single atom.relaxed.shared.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_monotonic_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering monotonic and failure ordering acquire.
; Expected PTX: a single atom.acquire.cas.b32 with no fence instruction; the
; atom carries the acquire qualifier although the success ordering is monotonic.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_acquire_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering monotonic
; and failure ordering acquire.
; Expected PTX: a single atom.acquire.global.cas.b32 with no fence instruction;
; the atom carries the acquire qualifier although the success ordering is
; monotonic.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_acquire_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering monotonic
; and failure ordering acquire.
; Expected PTX: a single atom.acquire.shared.cas.b32 with no fence instruction;
; the atom carries the acquire qualifier although the success ordering is
; monotonic.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_acquire_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering monotonic and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.cas.b32, with no
; fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_seq_cst_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering monotonic
; and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.global.cas.b32,
; with no fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_seq_cst_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering monotonic
; and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.shared.cas.b32,
; with no fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: monotonic_seq_cst_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering acquire and failure ordering monotonic.
; Expected PTX: a single atom.acquire.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_monotonic_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering acquire
; and failure ordering monotonic.
; Expected PTX: a single atom.acquire.global.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_monotonic_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering acquire
; and failure ordering monotonic.
; Expected PTX: a single atom.acquire.shared.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_monotonic_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering acquire and failure ordering acquire.
; Expected PTX: a single atom.acquire.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_acquire_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering acquire
; and failure ordering acquire.
; Expected PTX: a single atom.acquire.global.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_acquire_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering acquire
; and failure ordering acquire.
; Expected PTX: a single atom.acquire.shared.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_acquire_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering acquire and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.cas.b32, with no
; fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_seq_cst_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering acquire
; and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.global.cas.b32,
; with no fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_seq_cst_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering acquire
; and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.shared.cas.b32,
; with no fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acquire_seq_cst_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering release and failure ordering monotonic.
; Expected PTX: a single atom.release.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_monotonic_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
; SM90-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering release
; and failure ordering monotonic.
; Expected PTX: a single atom.release.global.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_monotonic_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
; SM90-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering release
; and failure ordering monotonic.
; Expected PTX: a single atom.release.shared.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_monotonic_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
; SM90-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering release and failure ordering acquire.
; Expected PTX: a single atom.acq_rel.cas.b32 with no fence instruction; the
; atom carries acq_rel rather than the release success ordering alone.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_acquire_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering release
; and failure ordering acquire.
; Expected PTX: a single atom.acq_rel.global.cas.b32 with no fence instruction;
; the atom carries acq_rel rather than the release success ordering alone.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_acquire_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering release
; and failure ordering acquire.
; Expected PTX: a single atom.acq_rel.shared.cas.b32 with no fence instruction;
; the atom carries acq_rel rather than the release success ordering alone.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_acquire_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering release and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.cas.b32, with no
; fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_seq_cst_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering release
; and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.global.cas.b32,
; with no fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_seq_cst_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering release
; and failure ordering seq_cst.
; Expected PTX: fence.sc.sys ahead of a single atom.acquire.shared.cas.b32,
; with no fence after the atom.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: release_seq_cst_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering acq_rel and failure ordering monotonic.
; Expected PTX: a single atom.acq_rel.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_monotonic_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(1) pointer with success ordering acq_rel
; and failure ordering monotonic.
; Expected PTX: a single atom.acq_rel.global.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_monotonic_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
    ret i32 %new
}

; i32 cmpxchg through an addrspace(3) pointer with success ordering acq_rel
; and failure ordering monotonic.
; Expected PTX: a single atom.acq_rel.shared.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_monotonic_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
    ret i32 %new
}

; i32 cmpxchg through a plain `ptr` (no addrspace; the _generic variant) with
; success ordering acq_rel and failure ordering acquire.
; Expected PTX: a single atom.acq_rel.cas.b32 with no fence instruction.
; The cmpxchg result %pairold is not used; the function returns %new.
define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_acquire_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
    ret i32 %new
}

; cmpxchg test: i32, global pointer (ptr addrspace(1)), success=acq_rel,
; failure=acquire. The checks expect one atom.acq_rel.global.cas.b32 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_acquire_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
    ret i32 %new
}

; cmpxchg test: i32, shared pointer (ptr addrspace(3)), success=acq_rel,
; failure=acquire. The checks expect one atom.acq_rel.shared.cas.b32 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_acquire_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
    ret i32 %new
}

; cmpxchg test: i32, generic pointer (ptr), success=acq_rel, failure=seq_cst.
; The checks expect fence.sc.sys followed by one atom.acquire.cas.b32 (no
; address-space qualifier). The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_seq_cst_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
    ret i32 %new
}

; cmpxchg test: i32, global pointer (ptr addrspace(1)), success=acq_rel,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_seq_cst_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
    ret i32 %new
}

; cmpxchg test: i32, shared pointer (ptr addrspace(3)), success=acq_rel,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: acq_rel_seq_cst_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
    ret i32 %new
}

; cmpxchg test: i32, generic pointer (ptr), success=seq_cst,
; failure=monotonic. The checks expect fence.sc.sys followed by one
; atom.acquire.cas.b32 (no address-space qualifier). The cmpxchg result
; %pairold is unused; the function returns %new.
define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_monotonic_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
    ret i32 %new
}

; cmpxchg test: i32, global pointer (ptr addrspace(1)), success=seq_cst,
; failure=monotonic. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_monotonic_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
    ret i32 %new
}

; cmpxchg test: i32, shared pointer (ptr addrspace(3)), success=seq_cst,
; failure=monotonic. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_monotonic_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
    ret i32 %new
}

; cmpxchg test: i32, generic pointer (ptr), success=seq_cst, failure=acquire.
; The checks expect fence.sc.sys followed by one atom.acquire.cas.b32 (no
; address-space qualifier). The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_acquire_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
    ret i32 %new
}

; cmpxchg test: i32, global pointer (ptr addrspace(1)), success=seq_cst,
; failure=acquire. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_acquire_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
    ret i32 %new
}

; cmpxchg test: i32, shared pointer (ptr addrspace(3)), success=seq_cst,
; failure=acquire. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_acquire_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
    ret i32 %new
}

; cmpxchg test: i32, generic pointer (ptr), success=seq_cst, failure=seq_cst.
; The checks expect fence.sc.sys followed by one atom.acquire.cas.b32 (no
; address-space qualifier). The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_seq_cst_i32_generic(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
    ret i32 %new
}

; cmpxchg test: i32, global pointer (ptr addrspace(1)), success=seq_cst,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_seq_cst_i32_global(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
    ret i32 %new
}

; cmpxchg test: i32, shared pointer (ptr addrspace(3)), success=seq_cst,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b32. The cmpxchg result %pairold is unused; the
; function returns %new.
define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
; SM90-LABEL: seq_cst_seq_cst_i32_shared(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<4>;
; SM90-NEXT:    .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
    ret i32 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=monotonic,
; failure=monotonic. The checks expect one atom.relaxed.cas.b64 (no
; address-space qualifier) and no fence. The cmpxchg result %pairold is
; unused; the function returns %new.
define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_monotonic_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
; SM90-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=monotonic,
; failure=monotonic. The checks expect one atom.relaxed.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_monotonic_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
; SM90-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=monotonic,
; failure=monotonic. The checks expect one atom.relaxed.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_monotonic_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
; SM90-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=monotonic,
; failure=acquire. The checks expect one atom.acquire.cas.b64 (no
; address-space qualifier) and no fence. The cmpxchg result %pairold is
; unused; the function returns %new.
define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_acquire_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=monotonic,
; failure=acquire. The checks expect one atom.acquire.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_acquire_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=monotonic,
; failure=acquire. The checks expect one atom.acquire.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_acquire_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=monotonic,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.cas.b64 (no address-space qualifier). The cmpxchg result
; %pairold is unused; the function returns %new.
define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_seq_cst_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=monotonic,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b64. The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_seq_cst_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=monotonic,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b64. The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: monotonic_seq_cst_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=acquire,
; failure=monotonic. The checks expect one atom.acquire.cas.b64 (no
; address-space qualifier) and no fence. The cmpxchg result %pairold is
; unused; the function returns %new.
define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_monotonic_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=acquire,
; failure=monotonic. The checks expect one atom.acquire.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_monotonic_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=acquire,
; failure=monotonic. The checks expect one atom.acquire.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_monotonic_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=acquire, failure=acquire.
; The checks expect one atom.acquire.cas.b64 (no address-space qualifier) and
; no fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_acquire_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=acquire,
; failure=acquire. The checks expect one atom.acquire.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_acquire_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=acquire,
; failure=acquire. The checks expect one atom.acquire.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_acquire_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=acquire, failure=seq_cst.
; The checks expect fence.sc.sys followed by one atom.acquire.cas.b64 (no
; address-space qualifier). The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_seq_cst_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=acquire,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b64. The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_seq_cst_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=acquire,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b64. The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acquire_seq_cst_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=release,
; failure=monotonic. The checks expect one atom.release.cas.b64 (no
; address-space qualifier) and no fence. The cmpxchg result %pairold is
; unused; the function returns %new.
define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_monotonic_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
; SM90-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=release,
; failure=monotonic. The checks expect one atom.release.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_monotonic_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
; SM90-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=release,
; failure=monotonic. The checks expect one atom.release.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_monotonic_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
; SM90-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=release, failure=acquire.
; The checks expect one atom.acq_rel.cas.b64 (no address-space qualifier) and
; no fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_acquire_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=release,
; failure=acquire. The checks expect one atom.acq_rel.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_acquire_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=release,
; failure=acquire. The checks expect one atom.acq_rel.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_acquire_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=release, failure=seq_cst.
; The checks expect fence.sc.sys followed by one atom.acquire.cas.b64 (no
; address-space qualifier). The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_seq_cst_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=release,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.global.cas.b64. The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_seq_cst_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=release,
; failure=seq_cst. The checks expect fence.sc.sys followed by one
; atom.acquire.shared.cas.b64. The cmpxchg result %pairold is unused; the
; function returns %new.
define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: release_seq_cst_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=acq_rel,
; failure=monotonic. The checks expect one atom.acq_rel.cas.b64 (no
; address-space qualifier) and no fence. The cmpxchg result %pairold is
; unused; the function returns %new.
define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_monotonic_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=acq_rel,
; failure=monotonic. The checks expect one atom.acq_rel.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_monotonic_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
    ret i64 %new
}

; cmpxchg test: i64, shared pointer (ptr addrspace(3)), success=acq_rel,
; failure=monotonic. The checks expect one atom.acq_rel.shared.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_monotonic_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
    ret i64 %new
}

; cmpxchg test: i64, generic pointer (ptr), success=acq_rel, failure=acquire.
; The checks expect one atom.acq_rel.cas.b64 (no address-space qualifier) and
; no fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_acquire_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
    ret i64 %new
}

; cmpxchg test: i64, global pointer (ptr addrspace(1)), success=acq_rel,
; failure=acquire. The checks expect one atom.acq_rel.global.cas.b64 and no
; fence. The cmpxchg result %pairold is unused; the function returns %new.
define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_acquire_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
    ret i64 %new
}

; i64 cmpxchg through an addrspace(3) pointer with success ordering acq_rel and
; failure ordering acquire. The expected PTX is a single
; atom.acq_rel.shared.cas.b64 with no fence in front of it.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_acquire_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
    ret i64 %new
}

; i64 cmpxchg through a plain `ptr` (no addrspace qualifier) with success
; ordering acq_rel and failure ordering seq_cst. The expected PTX is a
; fence.sc.sys followed by atom.acquire.cas.b64 (no state-space suffix).
; NOTE(review): presumably the seq_cst failure ordering is what selects the
; fence + acquire form instead of atom.acq_rel - confirm against the lowering.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_seq_cst_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
    ret i64 %new
}

; i64 cmpxchg through an addrspace(1) pointer with success ordering acq_rel and
; failure ordering seq_cst. The expected PTX is a fence.sc.sys followed by
; atom.acquire.global.cas.b64.
; NOTE(review): presumably the seq_cst failure ordering is what selects the
; fence + acquire form instead of atom.acq_rel - confirm against the lowering.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_seq_cst_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
    ret i64 %new
}

; i64 cmpxchg through an addrspace(3) pointer with success ordering acq_rel and
; failure ordering seq_cst. The expected PTX is a fence.sc.sys followed by
; atom.acquire.shared.cas.b64.
; NOTE(review): presumably the seq_cst failure ordering is what selects the
; fence + acquire form instead of atom.acq_rel - confirm against the lowering.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: acq_rel_seq_cst_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
    ret i64 %new
}

; i64 cmpxchg through a plain `ptr` (no addrspace qualifier) with success
; ordering seq_cst and failure ordering monotonic. The expected PTX is a
; fence.sc.sys followed by atom.acquire.cas.b64 (no state-space suffix).
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_monotonic_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
    ret i64 %new
}

; i64 cmpxchg through an addrspace(1) pointer with success ordering seq_cst and
; failure ordering monotonic. The expected PTX is a fence.sc.sys followed by
; atom.acquire.global.cas.b64.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_monotonic_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
    ret i64 %new
}

; i64 cmpxchg through an addrspace(3) pointer with success ordering seq_cst and
; failure ordering monotonic. The expected PTX is a fence.sc.sys followed by
; atom.acquire.shared.cas.b64.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_monotonic_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
    ret i64 %new
}

; i64 cmpxchg through a plain `ptr` (no addrspace qualifier) with success
; ordering seq_cst and failure ordering acquire. The expected PTX is a
; fence.sc.sys followed by atom.acquire.cas.b64 (no state-space suffix).
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_acquire_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
    ret i64 %new
}

; i64 cmpxchg through an addrspace(1) pointer with success ordering seq_cst and
; failure ordering acquire. The expected PTX is a fence.sc.sys followed by
; atom.acquire.global.cas.b64.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_acquire_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
    ret i64 %new
}

; i64 cmpxchg through an addrspace(3) pointer with success ordering seq_cst and
; failure ordering acquire. The expected PTX is a fence.sc.sys followed by
; atom.acquire.shared.cas.b64.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_acquire_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
    ret i64 %new
}

; i64 cmpxchg through a plain `ptr` (no addrspace qualifier) with seq_cst for
; both the success and failure orderings. The expected PTX is a fence.sc.sys
; followed by atom.acquire.cas.b64 (no state-space suffix).
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_seq_cst_i64_generic(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
    ret i64 %new
}

; i64 cmpxchg through an addrspace(1) pointer with seq_cst for both the success
; and failure orderings. The expected PTX is a fence.sc.sys followed by
; atom.acquire.global.cas.b64.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_seq_cst_i64_global(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
    ret i64 %new
}

; i64 cmpxchg through an addrspace(3) pointer with seq_cst for both the success
; and failure orderings. The expected PTX is a fence.sc.sys followed by
; atom.acquire.shared.cas.b64.
; The cmpxchg result (%pairold) is never read: the function returns %new, which
; is why the expected output stores %rd3 (loaded from param_2) to func_retval0.
define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
; SM90-LABEL: seq_cst_seq_cst_i64_shared(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
; SM90-NEXT:    fence.sc.sys;
; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
; SM90-NEXT:    ret;
    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
    ret i64 %new
}

