test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll

   1 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
   2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
   3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s
   4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s
   5
   6 ; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic:
     ; cmpxchg monotonic/monotonic at syncscope "one-as": expects a bare flat_atomic_cmpswap, with
     ; no zero-count s_waitcnt before or after it and no cache invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
   7 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
   8 ; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
   9 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
  10 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
  11 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
  12 ; GFX8-NOT:  buffer_wbinvl1_vol
  13 ; GFX10-NOT: buffer_gl{{[01]}}_inv
  14 ; GFX10:         .amdhsa_kernel system_one_as_monotonic_monotonic
  15 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
  16 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
  17 ; GFX10-NOT:     .amdhsa_memory_ordered 0
  18 define amdgpu_kernel void @system_one_as_monotonic_monotonic(
  19     i32* %out, i32 %in, i32 %old) {
  20 entry:
  21   %gep = getelementptr i32, i32* %out, i32 4
  22   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: {{^}}system_one_as_acquire_monotonic:
     ; cmpxchg acquire/monotonic at syncscope "one-as": no zero-count s_waitcnt may precede the
     ; atomic; right after it gfx803 expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010
     ; expects s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
  27 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
  28 ; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
  29 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
  30 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
  31 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
  32 ; GFX8-NEXT:  buffer_wbinvl1_vol
  33 ; GFX10-NEXT: buffer_gl0_inv
  34 ; GFX10-NEXT: buffer_gl1_inv
  35 ; GFX10:         .amdhsa_kernel system_one_as_acquire_monotonic
  36 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
  37 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
  38 ; GFX10-NOT:     .amdhsa_memory_ordered 0
  39 define amdgpu_kernel void @system_one_as_acquire_monotonic(
  40     i32* %out, i32 %in, i32 %old) {
  41 entry:
  42   %gep = getelementptr i32, i32* %out, i32 4
  43   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
  44   ret void
  45 }
  46
  47 ; GCN-LABEL: {{^}}system_one_as_release_monotonic:
     ; cmpxchg release/monotonic at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) must directly precede the atomic; afterwards no zero-count
     ; s_waitcnt and no cache invalidate may appear.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
  48 ; GCN:        s_waitcnt vmcnt(0){{$}}
  49 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
  50 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
  51 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
  52 ; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
  53 ; GFX8-NOT:   buffer_wbinvl1_vol
  54 ; GFX10-NOT:  buffer_gl{{[01]}}_inv
  55 ; GFX10:         .amdhsa_kernel system_one_as_release_monotonic
  56 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
  57 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
  58 ; GFX10-NOT:     .amdhsa_memory_ordered 0
  59 define amdgpu_kernel void @system_one_as_release_monotonic(
  60     i32* %out, i32 %in, i32 %old) {
  61 entry:
  62   %gep = getelementptr i32, i32* %out, i32 4
  63   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
  64   ret void
  65 }
  66
  67 ; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic:
     ; cmpxchg acq_rel/monotonic at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803 expects
     ; s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects s_waitcnt_vscnt null, 0x0 +
     ; buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
  68 ; GCN:        s_waitcnt vmcnt(0){{$}}
  69 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
  70 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
  71 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
  72 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
  73 ; GFX8-NEXT:  buffer_wbinvl1_vol
  74 ; GFX10-NEXT: buffer_gl0_inv
  75 ; GFX10-NEXT: buffer_gl1_inv
  76 ; GFX10:         .amdhsa_kernel system_one_as_acq_rel_monotonic
  77 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
  78 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
  79 ; GFX10-NOT:     .amdhsa_memory_ordered 0
  80 define amdgpu_kernel void @system_one_as_acq_rel_monotonic(
  81     i32* %out, i32 %in, i32 %old) {
  82 entry:
  83   %gep = getelementptr i32, i32* %out, i32 4
  84   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
  85   ret void
  86 }
  87
  88 ; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic:
     ; cmpxchg seq_cst/monotonic at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803 expects
     ; s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects s_waitcnt_vscnt null, 0x0 +
     ; buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
  89 ; GCN:        s_waitcnt vmcnt(0){{$}}
  90 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
  91 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
  92 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
  93 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
  94 ; GFX8-NEXT:  buffer_wbinvl1_vol
  95 ; GFX10-NEXT: buffer_gl0_inv
  96 ; GFX10-NEXT: buffer_gl1_inv
  97 ; GFX10:         .amdhsa_kernel system_one_as_seq_cst_monotonic
  98 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
  99 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 100 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 101 define amdgpu_kernel void @system_one_as_seq_cst_monotonic(
 102     i32* %out, i32 %in, i32 %old) {
 103 entry:
 104   %gep = getelementptr i32, i32* %out, i32 4
 105   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
 106   ret void
 107 }
 108
 109 ; GCN-LABEL: {{^}}system_one_as_acquire_acquire:
     ; cmpxchg acquire/acquire at syncscope "one-as": no zero-count s_waitcnt may precede the
     ; atomic; right after it gfx803 expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010
     ; expects s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 110 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 111 ; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 112 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 113 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 114 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 115 ; GFX8-NEXT:  buffer_wbinvl1_vol
 116 ; GFX10-NEXT: buffer_gl0_inv
 117 ; GFX10-NEXT: buffer_gl1_inv
 118 ; GFX10:         .amdhsa_kernel system_one_as_acquire_acquire
 119 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 120 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 121 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 122 define amdgpu_kernel void @system_one_as_acquire_acquire(
 123     i32* %out, i32 %in, i32 %old) {
 124 entry:
 125   %gep = getelementptr i32, i32* %out, i32 4
 126   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
 127   ret void
 128 }
 129
 130 ; GCN-LABEL: {{^}}system_one_as_release_acquire:
     ; cmpxchg release/acquire at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803 expects
     ; s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects s_waitcnt_vscnt null, 0x0 +
     ; buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 131 ; GCN:        s_waitcnt vmcnt(0){{$}}
 132 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 133 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 134 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 135 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 136 ; GFX8-NEXT:  buffer_wbinvl1_vol
 137 ; GFX10-NEXT: buffer_gl0_inv
 138 ; GFX10-NEXT: buffer_gl1_inv
 139 ; GFX10:         .amdhsa_kernel system_one_as_release_acquire
 140 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 141 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 142 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 143 define amdgpu_kernel void @system_one_as_release_acquire(
 144     i32* %out, i32 %in, i32 %old) {
 145 entry:
 146   %gep = getelementptr i32, i32* %out, i32 4
 147   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
 148   ret void
 149 }
 150
 151 ; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire:
     ; cmpxchg acq_rel/acquire at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803 expects
     ; s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects s_waitcnt_vscnt null, 0x0 +
     ; buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 152 ; GCN:        s_waitcnt vmcnt(0){{$}}
 153 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 154 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 155 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 156 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 157 ; GFX8-NEXT:  buffer_wbinvl1_vol
 158 ; GFX10-NEXT: buffer_gl0_inv
 159 ; GFX10-NEXT: buffer_gl1_inv
 160 ; GFX10:         .amdhsa_kernel system_one_as_acq_rel_acquire
 161 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 162 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 163 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 164 define amdgpu_kernel void @system_one_as_acq_rel_acquire(
 165     i32* %out, i32 %in, i32 %old) {
 166 entry:
 167   %gep = getelementptr i32, i32* %out, i32 4
 168   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
 169   ret void
 170 }
 171
 172 ; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire:
     ; cmpxchg seq_cst/acquire at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803 expects
     ; s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects s_waitcnt_vscnt null, 0x0 +
     ; buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 173 ; GCN:        s_waitcnt vmcnt(0){{$}}
 174 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 175 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 176 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 177 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 178 ; GFX8-NEXT:  buffer_wbinvl1_vol
 179 ; GFX10-NEXT: buffer_gl0_inv
 180 ; GFX10-NEXT: buffer_gl1_inv
 181 ; GFX10:         .amdhsa_kernel system_one_as_seq_cst_acquire
 182 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 183 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 184 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 185 define amdgpu_kernel void @system_one_as_seq_cst_acquire(
 186     i32* %out, i32 %in, i32 %old) {
 187 entry:
 188   %gep = getelementptr i32, i32* %out, i32 4
 189   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
 190   ret void
 191 }
 192
 193 ; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst:
     ; cmpxchg seq_cst/seq_cst at syncscope "one-as": s_waitcnt vmcnt(0) (then, on gfx1010,
     ; s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803 expects
     ; s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects s_waitcnt_vscnt null, 0x0 +
     ; buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 194 ; GCN:        s_waitcnt vmcnt(0){{$}}
 195 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 196 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 197 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 198 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 199 ; GFX8-NEXT:  buffer_wbinvl1_vol
 200 ; GFX10-NEXT: buffer_gl0_inv
 201 ; GFX10-NEXT: buffer_gl1_inv
 202 ; GFX10:         .amdhsa_kernel system_one_as_seq_cst_seq_cst
 203 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 204 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 205 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 206 define amdgpu_kernel void @system_one_as_seq_cst_seq_cst(
 207     i32* %out, i32 %in, i32 %old) {
 208 entry:
 209   %gep = getelementptr i32, i32* %out, i32 4
 210   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
 211   ret void
 212 }
 213
 214 ; GCN-LABEL: {{^}}singlethread_one_as_monotonic_monotonic:
     ; cmpxchg monotonic/monotonic at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 215 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 216 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 217 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 218 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 219 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 220 ; GFX8-NOT:  buffer_wbinvl1_vol
 221 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 222 ; GFX10:         .amdhsa_kernel singlethread_one_as_monotonic_monotonic
 223 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 224 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 225 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 226 define amdgpu_kernel void @singlethread_one_as_monotonic_monotonic(
 227     i32* %out, i32 %in, i32 %old) {
 228 entry:
 229   %gep = getelementptr i32, i32* %out, i32 4
 230   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
 231   ret void
 232 }
 233
 234 ; GCN-LABEL: {{^}}singlethread_one_as_acquire_monotonic:
     ; cmpxchg acquire/monotonic at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 235 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 236 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 237 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 238 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 239 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 240 ; GFX8-NOT:  buffer_wbinvl1_vol
 241 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 242 ; GFX10:         .amdhsa_kernel singlethread_one_as_acquire_monotonic
 243 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 244 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 245 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 246 define amdgpu_kernel void @singlethread_one_as_acquire_monotonic(
 247     i32* %out, i32 %in, i32 %old) {
 248 entry:
 249   %gep = getelementptr i32, i32* %out, i32 4
 250   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
 251   ret void
 252 }
 253
 254 ; GCN-LABEL: {{^}}singlethread_one_as_release_monotonic:
     ; cmpxchg release/monotonic at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 255 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 256 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 257 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 258 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 259 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 260 ; GFX8-NOT:  buffer_wbinvl1_vol
 261 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 262 ; GFX10:         .amdhsa_kernel singlethread_one_as_release_monotonic
 263 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 264 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 265 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 266 define amdgpu_kernel void @singlethread_one_as_release_monotonic(
 267     i32* %out, i32 %in, i32 %old) {
 268 entry:
 269   %gep = getelementptr i32, i32* %out, i32 4
 270   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
 271   ret void
 272 }
 273
 274 ; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_monotonic:
     ; cmpxchg acq_rel/monotonic at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 275 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 276 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 277 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 278 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 279 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 280 ; GFX8-NOT:  buffer_wbinvl1_vol
 281 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 282 ; GFX10:         .amdhsa_kernel singlethread_one_as_acq_rel_monotonic
 283 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 284 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 285 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 286 define amdgpu_kernel void @singlethread_one_as_acq_rel_monotonic(
 287     i32* %out, i32 %in, i32 %old) {
 288 entry:
 289   %gep = getelementptr i32, i32* %out, i32 4
 290   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
 291   ret void
 292 }
 293
 294 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_monotonic:
     ; cmpxchg seq_cst/monotonic at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 295 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 296 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 297 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 298 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 299 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 300 ; GFX8-NOT:  buffer_wbinvl1_vol
 301 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 302 ; GFX10:         .amdhsa_kernel singlethread_one_as_seq_cst_monotonic
 303 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 304 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 305 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 306 define amdgpu_kernel void @singlethread_one_as_seq_cst_monotonic(
 307     i32* %out, i32 %in, i32 %old) {
 308 entry:
 309   %gep = getelementptr i32, i32* %out, i32 4
 310   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
 311   ret void
 312 }
 313
 314 ; GCN-LABEL: {{^}}singlethread_one_as_acquire_acquire:
     ; cmpxchg acquire/acquire at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 315 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 316 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 317 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 318 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 319 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 320 ; GFX8-NOT:  buffer_wbinvl1_vol
 321 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 322 ; GFX10:         .amdhsa_kernel singlethread_one_as_acquire_acquire
 323 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 324 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 325 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 326 define amdgpu_kernel void @singlethread_one_as_acquire_acquire(
 327     i32* %out, i32 %in, i32 %old) {
 328 entry:
 329   %gep = getelementptr i32, i32* %out, i32 4
 330   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
 331   ret void
 332 }
 333
 334 ; GCN-LABEL: {{^}}singlethread_one_as_release_acquire:
     ; cmpxchg release/acquire at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 335 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 336 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 337 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 338 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 339 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 340 ; GFX8-NOT:  buffer_wbinvl1_vol
 341 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 342 ; GFX10:         .amdhsa_kernel singlethread_one_as_release_acquire
 343 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 344 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 345 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 346 define amdgpu_kernel void @singlethread_one_as_release_acquire(
 347     i32* %out, i32 %in, i32 %old) {
 348 entry:
 349   %gep = getelementptr i32, i32* %out, i32 4
 350   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
 351   ret void
 352 }
 353
 354 ; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_acquire:
     ; cmpxchg acq_rel/acquire at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 355 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 356 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 357 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 358 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 359 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 360 ; GFX8-NOT:  buffer_wbinvl1_vol
 361 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 362 ; GFX10:         .amdhsa_kernel singlethread_one_as_acq_rel_acquire
 363 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 364 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 365 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 366 define amdgpu_kernel void @singlethread_one_as_acq_rel_acquire(
 367     i32* %out, i32 %in, i32 %old) {
 368 entry:
 369   %gep = getelementptr i32, i32* %out, i32 4
 370   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
 371   ret void
 372 }
 373
 374 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_acquire:
     ; cmpxchg seq_cst/acquire at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 375 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 376 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 377 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 378 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 379 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 380 ; GFX8-NOT:  buffer_wbinvl1_vol
 381 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 382 ; GFX10:         .amdhsa_kernel singlethread_one_as_seq_cst_acquire
 383 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 384 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 385 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 386 define amdgpu_kernel void @singlethread_one_as_seq_cst_acquire(
 387     i32* %out, i32 %in, i32 %old) {
 388 entry:
 389   %gep = getelementptr i32, i32* %out, i32 4
 390   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
 391   ret void
 392 }
 393
 394 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_seq_cst:
     ; cmpxchg seq_cst/seq_cst at syncscope "singlethread-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 395 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 396 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 397 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 398 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 399 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 400 ; GFX8-NOT:  buffer_wbinvl1_vol
 401 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 402 ; GFX10:         .amdhsa_kernel singlethread_one_as_seq_cst_seq_cst
 403 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 404 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 405 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 406 define amdgpu_kernel void @singlethread_one_as_seq_cst_seq_cst(
 407     i32* %out, i32 %in, i32 %old) {
 408 entry:
 409   %gep = getelementptr i32, i32* %out, i32 4
 410   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
 411   ret void
 412 }
 413
 414 ; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic:
     ; cmpxchg monotonic/monotonic at syncscope "agent-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 415 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 416 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 417 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 418 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 419 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 420 ; GFX8-NOT:  buffer_wbinvl1_vol
 421 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 422 ; GFX10:         .amdhsa_kernel agent_one_as_monotonic_monotonic
 423 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 424 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 425 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 426 define amdgpu_kernel void @agent_one_as_monotonic_monotonic(
 427     i32* %out, i32 %in, i32 %old) {
 428 entry:
 429   %gep = getelementptr i32, i32* %out, i32 4
 430   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
 431   ret void
 432 }
 433
 434 ; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic:
     ; cmpxchg acquire/monotonic at syncscope "agent-one-as": no zero-count s_waitcnt may precede
     ; the atomic; right after it gfx803 expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and
     ; gfx1010 expects s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 435 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 436 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 437 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 438 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 439 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 440 ; GFX8-NEXT:  buffer_wbinvl1_vol
 441 ; GFX10-NEXT: buffer_gl0_inv
 442 ; GFX10-NEXT: buffer_gl1_inv
 443 ; GFX10:         .amdhsa_kernel agent_one_as_acquire_monotonic
 444 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 445 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 446 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 447 define amdgpu_kernel void @agent_one_as_acquire_monotonic(
 448     i32* %out, i32 %in, i32 %old) {
 449 entry:
 450   %gep = getelementptr i32, i32* %out, i32 4
 451   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
 452   ret void
 453 }
 454
 455 ; GCN-LABEL: {{^}}agent_one_as_release_monotonic:
     ; cmpxchg release/monotonic at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) must directly precede the atomic; afterwards no
     ; zero-count s_waitcnt and no cache invalidate may appear.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 456 ; GCN:        s_waitcnt vmcnt(0){{$}}
 457 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 458 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 459 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 460 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 461 ; GCN-NOT:    buffer_{{wbinvl1_vol|gl._inv}}
 462 ; GFX10:         .amdhsa_kernel agent_one_as_release_monotonic
 463 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 464 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 465 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 466 define amdgpu_kernel void @agent_one_as_release_monotonic(
 467     i32* %out, i32 %in, i32 %old) {
 468 entry:
 469   %gep = getelementptr i32, i32* %out, i32 4
 470   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
 471   ret void
 472 }
 473
 474 ; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic:
     ; cmpxchg acq_rel/monotonic at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803
     ; expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 475 ; GCN:        s_waitcnt vmcnt(0){{$}}
 476 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 477 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 478 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 479 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 480 ; GFX8-NEXT:  buffer_wbinvl1_vol
 481 ; GFX10-NEXT: buffer_gl0_inv
 482 ; GFX10-NEXT: buffer_gl1_inv
 483 ; GFX10:         .amdhsa_kernel agent_one_as_acq_rel_monotonic
 484 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 485 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 486 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 487 define amdgpu_kernel void @agent_one_as_acq_rel_monotonic(
 488     i32* %out, i32 %in, i32 %old) {
 489 entry:
 490   %gep = getelementptr i32, i32* %out, i32 4
 491   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
 492   ret void
 493 }
 494
 495 ; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic:
     ; cmpxchg seq_cst/monotonic at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803
     ; expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 496 ; GCN:        s_waitcnt vmcnt(0){{$}}
 497 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 498 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 499 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 500 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 501 ; GFX8-NEXT:  buffer_wbinvl1_vol
 502 ; GFX10-NEXT: buffer_gl0_inv
 503 ; GFX10-NEXT: buffer_gl1_inv
 504 ; GFX10:         .amdhsa_kernel agent_one_as_seq_cst_monotonic
 505 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 506 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 507 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 508 define amdgpu_kernel void @agent_one_as_seq_cst_monotonic(
 509     i32* %out, i32 %in, i32 %old) {
 510 entry:
 511   %gep = getelementptr i32, i32* %out, i32 4
 512   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
 513   ret void
 514 }
 515
 516 ; GCN-LABEL: {{^}}agent_one_as_acquire_acquire:
     ; cmpxchg acquire/acquire at syncscope "agent-one-as": no zero-count s_waitcnt may precede
     ; the atomic; right after it gfx803 expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and
     ; gfx1010 expects s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 517 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 518 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 519 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 520 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 521 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 522 ; GFX8-NEXT:  buffer_wbinvl1_vol
 523 ; GFX10-NEXT: buffer_gl0_inv
 524 ; GFX10-NEXT: buffer_gl1_inv
 525 ; GFX10:         .amdhsa_kernel agent_one_as_acquire_acquire
 526 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 527 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 528 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 529 define amdgpu_kernel void @agent_one_as_acquire_acquire(
 530     i32* %out, i32 %in, i32 %old) {
 531 entry:
 532   %gep = getelementptr i32, i32* %out, i32 4
 533   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
 534   ret void
 535 }
 536
 537 ; GCN-LABEL: {{^}}agent_one_as_release_acquire:
     ; cmpxchg release/acquire at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803
     ; expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 538 ; GCN:        s_waitcnt vmcnt(0){{$}}
 539 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 540 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 541 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 542 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 543 ; GFX8-NEXT:  buffer_wbinvl1_vol
 544 ; GFX10-NEXT: buffer_gl0_inv
 545 ; GFX10-NEXT: buffer_gl1_inv
 546 ; GFX10:         .amdhsa_kernel agent_one_as_release_acquire
 547 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 548 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 549 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 550 define amdgpu_kernel void @agent_one_as_release_acquire(
 551     i32* %out, i32 %in, i32 %old) {
 552 entry:
 553   %gep = getelementptr i32, i32* %out, i32 4
 554   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
 555   ret void
 556 }
 557
 558 ; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire:
     ; cmpxchg acq_rel/acquire at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803
     ; expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 559 ; GCN:        s_waitcnt vmcnt(0){{$}}
 560 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 561 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 562 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 563 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 564 ; GFX8-NEXT:  buffer_wbinvl1_vol
 565 ; GFX10-NEXT: buffer_gl0_inv
 566 ; GFX10-NEXT: buffer_gl1_inv
 567 ; GFX10:         .amdhsa_kernel agent_one_as_acq_rel_acquire
 568 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 569 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 570 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 571 define amdgpu_kernel void @agent_one_as_acq_rel_acquire(
 572     i32* %out, i32 %in, i32 %old) {
 573 entry:
 574   %gep = getelementptr i32, i32* %out, i32 4
 575   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
 576   ret void
 577 }
 578
 579 ; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire:
     ; cmpxchg seq_cst/acquire at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803
     ; expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 580 ; GCN:        s_waitcnt vmcnt(0){{$}}
 581 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 582 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 583 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 584 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 585 ; GFX8-NEXT:  buffer_wbinvl1_vol
 586 ; GFX10-NEXT: buffer_gl0_inv
 587 ; GFX10-NEXT: buffer_gl1_inv
 588 ; GFX10:         .amdhsa_kernel agent_one_as_seq_cst_acquire
 589 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 590 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 591 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 592 define amdgpu_kernel void @agent_one_as_seq_cst_acquire(
 593     i32* %out, i32 %in, i32 %old) {
 594 entry:
 595   %gep = getelementptr i32, i32* %out, i32 4
 596   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
 597   ret void
 598 }
 599
 600 ; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst:
     ; cmpxchg seq_cst/seq_cst at syncscope "agent-one-as": s_waitcnt vmcnt(0) (then, on
     ; gfx1010, s_waitcnt_vscnt null, 0x0) directly precedes the atomic; right after it gfx803
     ; expects s_waitcnt vmcnt(0) + buffer_wbinvl1_vol, and gfx1010 expects
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv + buffer_gl1_inv.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 601 ; GCN:        s_waitcnt vmcnt(0){{$}}
 602 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 603 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 604 ; GFX8-NEXT:  s_waitcnt vmcnt(0){{$}}
 605 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 606 ; GFX8-NEXT:  buffer_wbinvl1_vol
 607 ; GFX10-NEXT: buffer_gl0_inv
 608 ; GFX10-NEXT: buffer_gl1_inv
 609 ; GFX10:         .amdhsa_kernel agent_one_as_seq_cst_seq_cst
 610 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 611 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 612 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 613 define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst(
 614     i32* %out, i32 %in, i32 %old) {
 615 entry:
 616   %gep = getelementptr i32, i32* %out, i32 4
 617   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
 618   ret void
 619 }
 620
 621 ; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic:
     ; cmpxchg monotonic/monotonic at syncscope "workgroup-one-as": expects a bare
     ; flat_atomic_cmpswap, with no zero-count s_waitcnt before or after it and no cache
     ; invalidate after it.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 622 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 623 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 624 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 625 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 626 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 627 ; GFX8-NOT:  buffer_wbinvl1_vol
 628 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 629 ; GFX10:         .amdhsa_kernel workgroup_one_as_monotonic_monotonic
 630 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 631 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 632 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 633 define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic(
 634     i32* %out, i32 %in, i32 %old) {
 635 entry:
 636   %gep = getelementptr i32, i32* %out, i32 4
 637   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
 638   ret void
 639 }
 640
 641 ; GCN-LABEL:     {{^}}workgroup_one_as_acquire_monotonic:
     ; cmpxchg acquire/monotonic at syncscope "workgroup-one-as": only gfx1010 without +cumode
     ; expects synchronization, namely s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv directly after
     ; the atomic; gfx803 and gfx1010 with +cumode must emit neither that wait nor an invalidate.
     ; On gfx1010, workgroup_processor_mode 0 is expected only with +cumode; memory_ordered 0 never.
 642 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
 643 ; GFX10-NOT:     s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 644 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 645 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 646 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 647 ; GFX10WGP-NEXT: buffer_gl0_inv
 648 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 649 ; GFX10CU-NOT:   buffer_gl0_inv
 650 ; GFX8-NOT:      buffer_wbinvl1_vol
 651 ; GFX10:         .amdhsa_kernel workgroup_one_as_acquire_monotonic
 652 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 653 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 654 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 655 define amdgpu_kernel void @workgroup_one_as_acquire_monotonic(
 656     i32* %out, i32 %in, i32 %old) {
 657 entry:
 658   %gep = getelementptr i32, i32* %out, i32 4
 659   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
 660   ret void
 661 }
 662
 663 ; GCN-LABEL:     {{^}}workgroup_one_as_release_monotonic:
     ; Release/monotonic cmpxchg at syncscope("workgroup-one-as"). Before the
     ; atomic: on gfx10 in WGP mode, s_waitcnt vmcnt(0) immediately followed by
     ; s_waitcnt_vscnt null, 0x0; on gfx10 with +cumode and on gfx8, no such
     ; waits. After the atomic no zero-count wait and no cache invalidate is
     ; expected on any checked target.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 664 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 665 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 666 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 667 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 668 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 669 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 670 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
 671 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 672 ; GCN-NOT:       buffer_{{wbinvl1_vol|gl._inv}}
 673 ; GFX10:         .amdhsa_kernel workgroup_one_as_release_monotonic
 674 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 675 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 676 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 677 define amdgpu_kernel void @workgroup_one_as_release_monotonic(
 678     i32* %out, i32 %in, i32 %old) {
 679 entry:
 680   %gep = getelementptr i32, i32* %out, i32 4
 681   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
 682   ret void
 683 }
 684
 685 ; GCN-LABEL:     {{^}}workgroup_one_as_acq_rel_monotonic:
     ; Acq_rel/monotonic cmpxchg at syncscope("workgroup-one-as"). On gfx10 in
     ; WGP mode the atomic is expected to be preceded by s_waitcnt vmcnt(0) +
     ; s_waitcnt_vscnt null, 0x0 and immediately followed by
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv. On gfx10 with +cumode and on
     ; gfx8 none of these waits or invalidates are expected.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 686 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 687 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 688 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 689 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 690 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 691 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 692 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 693 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 694 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 695 ; GFX8-NOT:      buffer_wbinvl1_vol
 696 ; GFX10WGP-NEXT: buffer_gl0_inv
 697 ; GFX10CU-NOT:   buffer_gl0_inv
 698 ; GFX10:         .amdhsa_kernel workgroup_one_as_acq_rel_monotonic
 699 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 700 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 701 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 702 define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic(
 703     i32* %out, i32 %in, i32 %old) {
 704 entry:
 705   %gep = getelementptr i32, i32* %out, i32 4
 706   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
 707   ret void
 708 }
 709
 710 ; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst_monotonic:
     ; Seq_cst/monotonic cmpxchg at syncscope("workgroup-one-as"). On gfx10 in
     ; WGP mode the atomic is expected to be preceded by s_waitcnt vmcnt(0) +
     ; s_waitcnt_vscnt null, 0x0 and immediately followed by
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv. On gfx10 with +cumode and on
     ; gfx8 none of these waits or invalidates are expected.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 711 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 712 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 713 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 714 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 715 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 716 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 717 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 718 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 719 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 720 ; GFX8-NOT:      buffer_wbinvl1_vol
 721 ; GFX10WGP-NEXT: buffer_gl0_inv
 722 ; GFX10CU-NOT:   buffer_gl0_inv
 723 ; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst_monotonic
 724 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 725 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 726 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 727 define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic(
 728     i32* %out, i32 %in, i32 %old) {
 729 entry:
 730   %gep = getelementptr i32, i32* %out, i32 4
 731   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
 732   ret void
 733 }
 734
 735 ; GCN-LABEL:     {{^}}workgroup_one_as_acquire_acquire:
     ; Acquire/acquire cmpxchg at syncscope("workgroup-one-as"). No zero-count
     ; wait is expected before the atomic. After it: on gfx10 in WGP mode,
     ; s_waitcnt_vscnt null, 0x0 immediately followed by buffer_gl0_inv; on
     ; gfx10 with +cumode and on gfx8, neither a wait nor an invalidate.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 736 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
 737 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 738 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 739 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 740 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 741 ; GFX10WGP-NEXT: buffer_gl0_inv
 742 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 743 ; GFX10CU-NOT:   buffer_gl0_inv
 744 ; GFX8-NOT:      buffer_wbinvl1_vol
 745 ; GFX10:         .amdhsa_kernel workgroup_one_as_acquire_acquire
 746 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 747 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 748 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 749 define amdgpu_kernel void @workgroup_one_as_acquire_acquire(
 750     i32* %out, i32 %in, i32 %old) {
 751 entry:
 752   %gep = getelementptr i32, i32* %out, i32 4
 753   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
 754   ret void
 755 }
 756
 757 ; GCN-LABEL:     {{^}}workgroup_one_as_release_acquire:
     ; Release/acquire cmpxchg at syncscope("workgroup-one-as"). On gfx10 in
     ; WGP mode the atomic is expected to be preceded by s_waitcnt vmcnt(0) +
     ; s_waitcnt_vscnt null, 0x0 and immediately followed by
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv. On gfx10 with +cumode and on
     ; gfx8 none of these waits or invalidates are expected.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 758 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 759 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 760 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 761 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 762 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 763 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 764 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 765 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 766 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 767 ; GFX8-NOT:      buffer_wbinvl1_vol
 768 ; GFX10WGP-NEXT: buffer_gl0_inv
 769 ; GFX10CU-NOT:   buffer_gl0_inv
 770 ; GFX10:         .amdhsa_kernel workgroup_one_as_release_acquire
 771 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 772 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 773 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 774 define amdgpu_kernel void @workgroup_one_as_release_acquire(
 775     i32* %out, i32 %in, i32 %old) {
 776 entry:
 777   %gep = getelementptr i32, i32* %out, i32 4
 778   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
 779   ret void
 780 }
 781
 782 ; GCN-LABEL:     {{^}}workgroup_one_as_acq_rel_acquire:
     ; Acq_rel/acquire cmpxchg at syncscope("workgroup-one-as"). On gfx10 in
     ; WGP mode the atomic is expected to be preceded by s_waitcnt vmcnt(0) +
     ; s_waitcnt_vscnt null, 0x0 and immediately followed by
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv. On gfx10 with +cumode and on
     ; gfx8 none of these waits or invalidates are expected.
     ; The invalidate is checked with a "next line" directive so that nothing
     ; may be scheduled between the wait and the invalidate.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 783 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 784 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 785 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 786 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 787 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 788 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 789 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 790 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 791 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 792 ; GFX8-NOT:      buffer_wbinvl1_vol
 793 ; GFX10WGP-NEXT: buffer_gl0_inv
 794 ; GFX10CU-NOT:   buffer_gl0_inv
 795 ; GFX10:         .amdhsa_kernel workgroup_one_as_acq_rel_acquire
 796 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 797 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 798 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 799 define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire(
 800     i32* %out, i32 %in, i32 %old) {
 801 entry:
 802   %gep = getelementptr i32, i32* %out, i32 4
 803   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
 804   ret void
 805 }
 806
 807 ; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst_acquire:
     ; Seq_cst/acquire cmpxchg at syncscope("workgroup-one-as"). On gfx10 in
     ; WGP mode the atomic is expected to be preceded by s_waitcnt vmcnt(0) +
     ; s_waitcnt_vscnt null, 0x0 and immediately followed by
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv. On gfx10 with +cumode and on
     ; gfx8 none of these waits or invalidates are expected.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 808 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 809 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 810 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 811 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 812 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 813 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 814 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 815 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 816 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 817 ; GFX8-NOT:      buffer_wbinvl1_vol
 818 ; GFX10WGP-NEXT: buffer_gl0_inv
 819 ; GFX10CU-NOT:   buffer_gl0_inv
 820 ; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst_acquire
 821 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 822 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 823 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 824 define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire(
 825     i32* %out, i32 %in, i32 %old) {
 826 entry:
 827   %gep = getelementptr i32, i32* %out, i32 4
 828   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
 829   ret void
 830 }
 831
 832 ; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst_seq_cst:
     ; Seq_cst/seq_cst cmpxchg at syncscope("workgroup-one-as"). On gfx10 in
     ; WGP mode the atomic is expected to be preceded by s_waitcnt vmcnt(0) +
     ; s_waitcnt_vscnt null, 0x0 and immediately followed by
     ; s_waitcnt_vscnt null, 0x0 + buffer_gl0_inv. On gfx10 with +cumode and on
     ; gfx8 none of these waits or invalidates are expected.
     ; The invalidate is checked with a "next line" directive so that nothing
     ; may be scheduled between the wait and the invalidate.
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 833 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 834 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
 835 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 836 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
 837 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 838 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 839 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
 840 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
 841 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
 842 ; GFX8-NOT:      buffer_wbinvl1_vol
 843 ; GFX10WGP-NEXT: buffer_gl0_inv
 844 ; GFX10CU-NOT:   buffer_gl0_inv
 845 ; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst_seq_cst
 846 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 847 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 848 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 849 define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst(
 850     i32* %out, i32 %in, i32 %old) {
 851 entry:
 852   %gep = getelementptr i32, i32* %out, i32 4
 853   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
 854   ret void
 855 }
 856
 857 ; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic:
     ; Monotonic/monotonic cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 858 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 859 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 860 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 861 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 862 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 863 ; GFX8-NOT:  buffer_wbinvl1_vol
 864 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 865 ; GFX10:         .amdhsa_kernel wavefront_one_as_monotonic_monotonic
 866 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 867 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 868 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 869 define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic(
 870     i32* %out, i32 %in, i32 %old) {
 871 entry:
 872   %gep = getelementptr i32, i32* %out, i32 4
 873   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
 874   ret void
 875 }
 876
 877 ; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic:
     ; Acquire/monotonic cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 878 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 879 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 880 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 881 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 882 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 883 ; GFX8-NOT:  buffer_wbinvl1_vol
 884 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 885 ; GFX10:         .amdhsa_kernel wavefront_one_as_acquire_monotonic
 886 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 887 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 888 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 889 define amdgpu_kernel void @wavefront_one_as_acquire_monotonic(
 890     i32* %out, i32 %in, i32 %old) {
 891 entry:
 892   %gep = getelementptr i32, i32* %out, i32 4
 893   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
 894   ret void
 895 }
 896
 897 ; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic:
     ; Release/monotonic cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 898 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 899 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 900 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 901 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 902 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 903 ; GFX8-NOT:  buffer_wbinvl1_vol
 904 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 905 ; GFX10:         .amdhsa_kernel wavefront_one_as_release_monotonic
 906 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 907 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 908 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 909 define amdgpu_kernel void @wavefront_one_as_release_monotonic(
 910     i32* %out, i32 %in, i32 %old) {
 911 entry:
 912   %gep = getelementptr i32, i32* %out, i32 4
 913   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
 914   ret void
 915 }
 916
 917 ; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic:
     ; Acq_rel/monotonic cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 918 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 919 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 920 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 921 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 922 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 923 ; GFX8-NOT:  buffer_wbinvl1_vol
 924 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 925 ; GFX10:         .amdhsa_kernel wavefront_one_as_acq_rel_monotonic
 926 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 927 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 928 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 929 define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic(
 930     i32* %out, i32 %in, i32 %old) {
 931 entry:
 932   %gep = getelementptr i32, i32* %out, i32 4
 933   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
 934   ret void
 935 }
 936
 937 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic:
     ; Seq_cst/monotonic cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 938 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 939 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 940 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 941 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 942 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 943 ; GFX8-NOT:  buffer_wbinvl1_vol
 944 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 945 ; GFX10:         .amdhsa_kernel wavefront_one_as_seq_cst_monotonic
 946 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 947 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 948 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 949 define amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic(
 950     i32* %out, i32 %in, i32 %old) {
 951 entry:
 952   %gep = getelementptr i32, i32* %out, i32 4
 953   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
 954   ret void
 955 }
 956
 957 ; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire:
     ; Acquire/acquire cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 958 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 959 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 960 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 961 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 962 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 963 ; GFX8-NOT:  buffer_wbinvl1_vol
 964 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 965 ; GFX10:         .amdhsa_kernel wavefront_one_as_acquire_acquire
 966 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 967 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 968 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 969 define amdgpu_kernel void @wavefront_one_as_acquire_acquire(
 970     i32* %out, i32 %in, i32 %old) {
 971 entry:
 972   %gep = getelementptr i32, i32* %out, i32 4
 973   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
 974   ret void
 975 }
 976
 977 ; GCN-LABEL: {{^}}wavefront_one_as_release_acquire:
     ; Release/acquire cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 978 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 979 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 980 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 981 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 982 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 983 ; GFX8-NOT:  buffer_wbinvl1_vol
 984 ; GFX10-NOT: buffer_gl{{[01]}}_inv
 985 ; GFX10:         .amdhsa_kernel wavefront_one_as_release_acquire
 986 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
 987 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
 988 ; GFX10-NOT:     .amdhsa_memory_ordered 0
 989 define amdgpu_kernel void @wavefront_one_as_release_acquire(
 990     i32* %out, i32 %in, i32 %old) {
 991 entry:
 992   %gep = getelementptr i32, i32* %out, i32 4
 993   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
 994   ret void
 995 }
 996
 997 ; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire:
     ; Acq_rel/acquire cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
 998 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 999 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1000 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1001 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1002 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1003 ; GFX8-NOT:  buffer_wbinvl1_vol
1004 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1005 ; GFX10:         .amdhsa_kernel wavefront_one_as_acq_rel_acquire
1006 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1007 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1008 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1009 define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire(
1010     i32* %out, i32 %in, i32 %old) {
1011 entry:
1012   %gep = getelementptr i32, i32* %out, i32 4
1013   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
1014   ret void
1015 }
1016
1017 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire:
     ; Seq_cst/acquire cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1018 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1019 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1020 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1021 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1022 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1023 ; GFX8-NOT:  buffer_wbinvl1_vol
1024 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1025 ; GFX10:         .amdhsa_kernel wavefront_one_as_seq_cst_acquire
1026 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1027 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1028 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1029 define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire(
1030     i32* %out, i32 %in, i32 %old) {
1031 entry:
1032   %gep = getelementptr i32, i32* %out, i32 4
1033   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
1034   ret void
1035 }
1036
1037 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst:
     ; Seq_cst/seq_cst cmpxchg at syncscope("wavefront-one-as"): the checks
     ; expect a bare flat_atomic_cmpswap with no zero-count waits around it and
     ; no cache invalidate after it, on gfx8 and gfx10 alike.
     ; FileCheck matches text outside {{ }} literally, so the gfx10 invalidate
     ; pattern is spelled without any stray '.' between the regex and "_inv".
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1038 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1039 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1040 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1041 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1042 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1043 ; GFX8-NOT:  buffer_wbinvl1_vol
1044 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1045 ; GFX10:         .amdhsa_kernel wavefront_one_as_seq_cst_seq_cst
1046 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1047 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1048 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1049 define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst(
1050     i32* %out, i32 %in, i32 %old) {
1051 entry:
1052   %gep = getelementptr i32, i32* %out, i32 4
1053   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
1054   ret void
1055 }
1056
1057 ; GCN-LABEL: {{^}}system_one_as_acquire_monotonic_ret:
     ; Acquire/monotonic cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: no zero-count
     ; wait before the atomic; the returning (glc) form of flat_atomic_cmpswap;
     ; then immediately s_waitcnt vmcnt(0) and a cache invalidate
     ; (buffer_wbinvl1_vol on gfx8; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1058 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1059 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1060 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1061 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1062 ; GFX8-NEXT:  buffer_wbinvl1_vol
1063 ; GFX10-NEXT: buffer_gl0_inv
1064 ; GFX10-NEXT: buffer_gl1_inv
1065 ; GFX10:         .amdhsa_kernel system_one_as_acquire_monotonic_ret
1066 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1067 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1068 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1069 define amdgpu_kernel void @system_one_as_acquire_monotonic_ret(
1070     i32* %out, i32 %in, i32 %old) {
1071 entry:
1072   %gep = getelementptr i32, i32* %out, i32 4
1073   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
1074   %val0 = extractvalue { i32, i1 } %val, 0
1075   store i32 %val0, i32* %out, align 4
1076   ret void
1077 }
1078
1079 ; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic_ret:
     ; Acq_rel/monotonic cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1080 ; GCN:        s_waitcnt vmcnt(0){{$}}
1081 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1082 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1083 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1084 ; GFX8-NEXT:  buffer_wbinvl1_vol
1085 ; GFX10-NEXT: buffer_gl0_inv
1086 ; GFX10-NEXT: buffer_gl1_inv
1087 ; GFX10:         .amdhsa_kernel system_one_as_acq_rel_monotonic_ret
1088 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1089 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1090 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1091 define amdgpu_kernel void @system_one_as_acq_rel_monotonic_ret(
1092     i32* %out, i32 %in, i32 %old) {
1093 entry:
1094   %gep = getelementptr i32, i32* %out, i32 4
1095   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
1096   %val0 = extractvalue { i32, i1 } %val, 0
1097   store i32 %val0, i32* %out, align 4
1098   ret void
1099 }
1100
1101 ; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic_ret:
     ; Seq_cst/monotonic cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1102 ; GCN:        s_waitcnt vmcnt(0){{$}}
1103 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1104 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1105 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1106 ; GFX8-NEXT:  buffer_wbinvl1_vol
1107 ; GFX10-NEXT: buffer_gl0_inv
1108 ; GFX10-NEXT: buffer_gl1_inv
1109 ; GFX10:         .amdhsa_kernel system_one_as_seq_cst_monotonic_ret
1110 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1111 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1112 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1113 define amdgpu_kernel void @system_one_as_seq_cst_monotonic_ret(
1114     i32* %out, i32 %in, i32 %old) {
1115 entry:
1116   %gep = getelementptr i32, i32* %out, i32 4
1117   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
1118   %val0 = extractvalue { i32, i1 } %val, 0
1119   store i32 %val0, i32* %out, align 4
1120   ret void
1121 }
1122
1123 ; GCN-LABEL: {{^}}system_one_as_acquire_acquire_ret:
     ; Acquire/acquire cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: no zero-count
     ; wait before the atomic; the returning (glc) form of flat_atomic_cmpswap;
     ; then immediately s_waitcnt vmcnt(0) and a cache invalidate
     ; (buffer_wbinvl1_vol on gfx8; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1124 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1125 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1126 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1127 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1128 ; GFX8-NEXT:  buffer_wbinvl1_vol
1129 ; GFX10-NEXT: buffer_gl0_inv
1130 ; GFX10-NEXT: buffer_gl1_inv
1131 ; GFX10:         .amdhsa_kernel system_one_as_acquire_acquire_ret
1132 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1133 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1134 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1135 define amdgpu_kernel void @system_one_as_acquire_acquire_ret(
1136     i32* %out, i32 %in, i32 %old) {
1137 entry:
1138   %gep = getelementptr i32, i32* %out, i32 4
1139   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
1140   %val0 = extractvalue { i32, i1 } %val, 0
1141   store i32 %val0, i32* %out, align 4
1142   ret void
1143 }
1144
1145 ; GCN-LABEL: {{^}}system_one_as_release_acquire_ret:
     ; Release/acquire cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1146 ; GCN:        s_waitcnt vmcnt(0){{$}}
1147 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1148 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1149 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1150 ; GFX8-NEXT:  buffer_wbinvl1_vol
1151 ; GFX10-NEXT: buffer_gl0_inv
1152 ; GFX10-NEXT: buffer_gl1_inv
1153 ; GFX10:         .amdhsa_kernel system_one_as_release_acquire_ret
1154 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1155 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1156 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1157 define amdgpu_kernel void @system_one_as_release_acquire_ret(
1158     i32* %out, i32 %in, i32 %old) {
1159 entry:
1160   %gep = getelementptr i32, i32* %out, i32 4
1161   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
1162   %val0 = extractvalue { i32, i1 } %val, 0
1163   store i32 %val0, i32* %out, align 4
1164   ret void
1165 }
1166
1167 ; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire_ret:
     ; Acq_rel/acquire cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1168 ; GCN:        s_waitcnt vmcnt(0){{$}}
1169 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1170 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1171 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1172 ; GFX8-NEXT:  buffer_wbinvl1_vol
1173 ; GFX10-NEXT: buffer_gl0_inv
1174 ; GFX10-NEXT: buffer_gl1_inv
1175 ; GFX10:         .amdhsa_kernel system_one_as_acq_rel_acquire_ret
1176 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1177 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1178 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1179 define amdgpu_kernel void @system_one_as_acq_rel_acquire_ret(
1180     i32* %out, i32 %in, i32 %old) {
1181 entry:
1182   %gep = getelementptr i32, i32* %out, i32 4
1183   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
1184   %val0 = extractvalue { i32, i1 } %val, 0
1185   store i32 %val0, i32* %out, align 4
1186   ret void
1187 }
1188
1189 ; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire_ret:
     ; Seq_cst/acquire cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1190 ; GCN:        s_waitcnt vmcnt(0){{$}}
1191 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1192 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1193 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1194 ; GFX8-NEXT:  buffer_wbinvl1_vol
1195 ; GFX10-NEXT: buffer_gl0_inv
1196 ; GFX10-NEXT: buffer_gl1_inv
1197 ; GFX10:         .amdhsa_kernel system_one_as_seq_cst_acquire_ret
1198 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1199 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1200 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1201 define amdgpu_kernel void @system_one_as_seq_cst_acquire_ret(
1202     i32* %out, i32 %in, i32 %old) {
1203 entry:
1204   %gep = getelementptr i32, i32* %out, i32 4
1205   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
1206   %val0 = extractvalue { i32, i1 } %val, 0
1207   store i32 %val0, i32* %out, align 4
1208   ret void
1209 }
1210
1211 ; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst_ret:
     ; Seq_cst/seq_cst cmpxchg at syncscope("one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1212 ; GCN:        s_waitcnt vmcnt(0){{$}}
1213 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1214 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1215 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1216 ; GFX8-NEXT:  buffer_wbinvl1_vol
1217 ; GFX10-NEXT: buffer_gl0_inv
1218 ; GFX10-NEXT: buffer_gl1_inv
1219 ; GFX10:         .amdhsa_kernel system_one_as_seq_cst_seq_cst_ret
1220 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1221 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1222 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1223 define amdgpu_kernel void @system_one_as_seq_cst_seq_cst_ret(
1224     i32* %out, i32 %in, i32 %old) {
1225 entry:
1226   %gep = getelementptr i32, i32* %out, i32 4
1227   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
1228   %val0 = extractvalue { i32, i1 } %val, 0
1229   store i32 %val0, i32* %out, align 4
1230   ret void
1231 }
1232
1233 ; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic_ret:
     ; Acquire/monotonic cmpxchg at syncscope("agent-one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: no zero-count
     ; wait before the atomic; the returning (glc) form of flat_atomic_cmpswap;
     ; then immediately s_waitcnt vmcnt(0) and a cache invalidate
     ; (buffer_wbinvl1_vol on gfx8; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1234 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1235 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1236 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1237 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1238 ; GFX8-NEXT:  buffer_wbinvl1_vol
1239 ; GFX10-NEXT: buffer_gl0_inv
1240 ; GFX10-NEXT: buffer_gl1_inv
1241 ; GFX10:         .amdhsa_kernel agent_one_as_acquire_monotonic_ret
1242 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1243 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1244 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1245 define amdgpu_kernel void @agent_one_as_acquire_monotonic_ret(
1246     i32* %out, i32 %in, i32 %old) {
1247 entry:
1248   %gep = getelementptr i32, i32* %out, i32 4
1249   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
1250   %val0 = extractvalue { i32, i1 } %val, 0
1251   store i32 %val0, i32* %out, align 4
1252   ret void
1253 }
1254
1255 ; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic_ret:
     ; Acq_rel/monotonic cmpxchg at syncscope("agent-one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1256 ; GCN:        s_waitcnt vmcnt(0){{$}}
1257 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1258 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1259 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1260 ; GFX8-NEXT:  buffer_wbinvl1_vol
1261 ; GFX10-NEXT: buffer_gl0_inv
1262 ; GFX10-NEXT: buffer_gl1_inv
1263 ; GFX10:         .amdhsa_kernel agent_one_as_acq_rel_monotonic_ret
1264 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1265 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1266 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1267 define amdgpu_kernel void @agent_one_as_acq_rel_monotonic_ret(
1268     i32* %out, i32 %in, i32 %old) {
1269 entry:
1270   %gep = getelementptr i32, i32* %out, i32 4
1271   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
1272   %val0 = extractvalue { i32, i1 } %val, 0
1273   store i32 %val0, i32* %out, align 4
1274   ret void
1275 }
1276
1277 ; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic_ret:
     ; Seq_cst/monotonic cmpxchg at syncscope("agent-one-as") whose loaded value
     ; (field 0 of the result) is stored back to %out. Expected: s_waitcnt
     ; vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
     ; returning (glc) flat_atomic_cmpswap, then directly after it s_waitcnt
     ; vmcnt(0) and a cache invalidate (buffer_wbinvl1_vol on gfx8;
     ; buffer_gl0_inv then buffer_gl1_inv on gfx10).
     ; Descriptor checks on gfx10: workgroup_processor_mode 0 appears only with
     ; +cumode, and memory_ordered 0 must not appear.
1278 ; GCN:        s_waitcnt vmcnt(0){{$}}
1279 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1280 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1281 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1282 ; GFX8-NEXT:  buffer_wbinvl1_vol
1283 ; GFX10-NEXT: buffer_gl0_inv
1284 ; GFX10-NEXT: buffer_gl1_inv
1285 ; GFX10:         .amdhsa_kernel agent_one_as_seq_cst_monotonic_ret
1286 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1287 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1288 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1289 define amdgpu_kernel void @agent_one_as_seq_cst_monotonic_ret(
1290     i32* %out, i32 %in, i32 %old) {
1291 entry:
1292   %gep = getelementptr i32, i32* %out, i32 4
1293   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
1294   %val0 = extractvalue { i32, i1 } %val, 0
1295   store i32 %val0, i32* %out, align 4
1296   ret void
1297 }
1298
1299 ; GCN-LABEL: {{^}}agent_one_as_acquire_acquire_ret:
1300 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1301 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1302 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1303 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1304 ; GFX8-NEXT:  buffer_wbinvl1_vol
1305 ; GFX10-NEXT: buffer_gl0_inv
1306 ; GFX10-NEXT: buffer_gl1_inv
1307 ; GFX10:         .amdhsa_kernel agent_one_as_acquire_acquire_ret
1308 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1309 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1310 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1311 define amdgpu_kernel void @agent_one_as_acquire_acquire_ret(
1312     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1313 entry: ; "agent-one-as" cmpxchg, success=acquire, failure=acquire; the loaded value is written to %out
1314   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1315   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
1316   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1317   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1318   ret void
1319 }
1320
1321 ; GCN-LABEL: {{^}}agent_one_as_release_acquire_ret:
1322 ; GCN:        s_waitcnt vmcnt(0){{$}}
1323 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1324 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1325 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1326 ; GFX8-NEXT:  buffer_wbinvl1_vol
1327 ; GFX10-NEXT: buffer_gl0_inv
1328 ; GFX10-NEXT: buffer_gl1_inv
1329 ; GFX10:         .amdhsa_kernel agent_one_as_release_acquire_ret
1330 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1331 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1332 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1333 define amdgpu_kernel void @agent_one_as_release_acquire_ret(
1334     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1335 entry: ; "agent-one-as" cmpxchg, success=release, failure=acquire; the loaded value is written to %out
1336   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1337   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent-one-as") release acquire
1338   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1339   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1340   ret void
1341 }
1342
1343 ; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire_ret:
1344 ; GCN:        s_waitcnt vmcnt(0){{$}}
1345 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1346 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1347 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1348 ; GFX8-NEXT:  buffer_wbinvl1_vol
1349 ; GFX10-NEXT: buffer_gl0_inv
1350 ; GFX10-NEXT: buffer_gl1_inv
1351 ; GFX10:         .amdhsa_kernel agent_one_as_acq_rel_acquire_ret
1352 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1353 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1354 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1355 define amdgpu_kernel void @agent_one_as_acq_rel_acquire_ret(
1356     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1357 entry: ; "agent-one-as" cmpxchg, success=acq_rel, failure=acquire; the loaded value is written to %out
1358   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1359   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
1360   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1361   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1362   ret void
1363 }
1364
1365 ; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire_ret:
1366 ; GCN:        s_waitcnt vmcnt(0){{$}}
1367 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1368 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1369 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1370 ; GFX8-NEXT:  buffer_wbinvl1_vol
1371 ; GFX10-NEXT: buffer_gl0_inv
1372 ; GFX10-NEXT: buffer_gl1_inv
1373 ; GFX10:         .amdhsa_kernel agent_one_as_seq_cst_acquire_ret
1374 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1375 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1376 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1377 define amdgpu_kernel void @agent_one_as_seq_cst_acquire_ret(
1378     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1379 entry: ; "agent-one-as" cmpxchg, success=seq_cst, failure=acquire; the loaded value is written to %out
1380   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1381   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
1382   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1383   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1384   ret void
1385 }
1386
1387 ; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst_ret:
1388 ; GCN:        s_waitcnt vmcnt(0){{$}}
1389 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1390 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1391 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
1392 ; GFX8-NEXT:  buffer_wbinvl1_vol
1393 ; GFX10-NEXT: buffer_gl0_inv
1394 ; GFX10-NEXT: buffer_gl1_inv
1395 ; GFX10:         .amdhsa_kernel agent_one_as_seq_cst_seq_cst_ret
1396 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1397 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1398 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1399 define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst_ret(
1400     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1401 entry: ; "agent-one-as" cmpxchg, success=seq_cst, failure=seq_cst; the loaded value is written to %out
1402   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1403   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
1404   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1405   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1406   ret void
1407 }
1408
1409 ; GCN-LABEL:     {{^}}workgroup_one_as_acquire_monotonic_ret:
1410 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
1411 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1412 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1413 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1414 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1415 ; GFX10WGP-NEXT: buffer_gl0_inv
1416 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1417 ; GFX10CU-NOT:   buffer_gl0_inv
1418 ; GFX8-NOT:      buffer_wbinvl1_vol
1419 ; GFX10:         .amdhsa_kernel workgroup_one_as_acquire_monotonic_ret
1420 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1421 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1422 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1423 define amdgpu_kernel void @workgroup_one_as_acquire_monotonic_ret(
1424     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1425 entry: ; "workgroup-one-as" cmpxchg, success=acquire, failure=monotonic; the loaded value is written to %out
1426   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1427   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
1428   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1429   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1430   ret void
1431 }
1432
1433 ; GCN-LABEL:     {{^}}workgroup_one_as_acq_rel_monotonic_ret:
1434 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1435 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
1436 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1437 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1438 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
1439 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1440 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1441 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1442 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1443 ; GFX8-NOT:      buffer_wbinvl1_vol
1444 ; GFX10WGP-NEXT: buffer_gl0_inv
1445 ; GFX10CU-NOT:   buffer_gl0_inv
1446 ; GFX10:         .amdhsa_kernel workgroup_one_as_acq_rel_monotonic_ret
1447 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1448 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1449 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1450 define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic_ret(
1451     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1452 entry: ; "workgroup-one-as" cmpxchg, success=acq_rel, failure=monotonic; the loaded value is written to %out
1453   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1454   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
1455   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1456   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1457   ret void
1458 }
1459
1460 ; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst_monotonic_ret:
1461 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1462 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
1463 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1464 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1465 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
1466 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1467 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1468 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1469 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1470 ; GFX8-NOT:      buffer_wbinvl1_vol
1471 ; GFX10WGP-NEXT: buffer_gl0_inv
1472 ; GFX10CU-NOT:   buffer_gl0_inv
1473 ; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst_monotonic_ret
1474 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1475 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1476 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1477 define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic_ret(
1478     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1479 entry: ; "workgroup-one-as" cmpxchg, success=seq_cst, failure=monotonic; the loaded value is written to %out
1480   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1481   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
1482   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1483   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1484   ret void
1485 }
1486
1487 ; GCN-LABEL:     {{^}}workgroup_one_as_acquire_acquire_ret:
1488 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
1489 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1490 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1491 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1492 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1493 ; GFX10WGP-NEXT: buffer_gl0_inv
1494 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1495 ; GFX10CU-NOT:   buffer_gl0_inv
1496 ; GFX8-NOT:      buffer_wbinvl1_vol
1497 ; GFX10:         .amdhsa_kernel workgroup_one_as_acquire_acquire_ret
1498 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1499 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1500 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1501 define amdgpu_kernel void @workgroup_one_as_acquire_acquire_ret(
1502     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1503 entry: ; "workgroup-one-as" cmpxchg, success=acquire, failure=acquire; the loaded value is written to %out
1504   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1505   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
1506   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1507   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1508   ret void
1509 }
1510
1511 ; GCN-LABEL:     {{^}}workgroup_one_as_release_acquire_ret:
1512 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1513 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
1514 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1515 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1516 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
1517 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1518 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1519 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1520 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1521 ; GFX8-NOT:      buffer_wbinvl1_vol
1522 ; GFX10WGP-NEXT: buffer_gl0_inv
1523 ; GFX10CU-NOT:   buffer_gl0_inv
1524 ; GFX10:         .amdhsa_kernel workgroup_one_as_release_acquire_ret
1525 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1526 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1527 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1528 define amdgpu_kernel void @workgroup_one_as_release_acquire_ret(
1529     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1530 entry: ; "workgroup-one-as" cmpxchg, success=release, failure=acquire; the loaded value is written to %out
1531   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1532   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
1533   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1534   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1535   ret void
1536 }
1537
1538 ; GCN-LABEL:     {{^}}workgroup_one_as_acq_rel_acquire_ret:
1539 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1540 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
1541 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1542 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1543 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
1544 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1545 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1546 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1547 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1548 ; GFX8-NOT:      buffer_wbinvl1_vol
1549 ; GFX10WGP-NEXT: buffer_gl0_inv
1550 ; GFX10CU-NOT:   buffer_gl0_inv
1551 ; GFX10:         .amdhsa_kernel workgroup_one_as_acq_rel_acquire_ret
1552 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1553 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1554 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1555 define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire_ret(
1556     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1557 entry: ; "workgroup-one-as" cmpxchg, success=acq_rel, failure=acquire; the loaded value is written to %out
1558   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1559   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
1560   %val0 = extractvalue { i32, i1 } %val, 0 ; i32 member of the result pair
1561   store i32 %val0, i32* %out, align 4 ; out[0] = that value
1562   ret void
1563 }
1564
1565 ; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst_acquire_ret:
1566 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1567 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
1568 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1569 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1570 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
1571 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1572 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1573 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1574 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1575 ; GFX8-NOT:      buffer_wbinvl1_vol
1576 ; GFX10WGP-NEXT: buffer_gl0_inv
1577 ; GFX10CU-NOT:   buffer_gl0_inv
1578 ; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst_acquire_ret
1579 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1580 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1581 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1582 define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire_ret(
1583     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1584 entry: ; "workgroup-one-as" cmpxchg, success=seq_cst, failure=acquire; the loaded value is written to %out
1585   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1586   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
1587   %prev = extractvalue { i32, i1 } %xchg, 0 ; i32 member of the result pair
1588   store i32 %prev, i32* %out, align 4 ; out[0] = that value
1589   ret void
1590 }
1591
1592 ; GCN-LABEL:     {{^}}workgroup_one_as_seq_cst_seq_cst_ret:
1593 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1594 ; GFX10WGP:      s_waitcnt vmcnt(0){{$}}
1595 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1596 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1597 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
1598 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
1599 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
1600 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
1601 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
1602 ; GFX8-NOT:      buffer_wbinvl1_vol
1603 ; GFX10WGP-NEXT: buffer_gl0_inv
1604 ; GFX10CU-NOT:   buffer_gl0_inv
1605 ; GFX10:         .amdhsa_kernel workgroup_one_as_seq_cst_seq_cst_ret
1606 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1607 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1608 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1609 define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst_ret(
1610     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1611 entry: ; "workgroup-one-as" cmpxchg, success=seq_cst, failure=seq_cst; the loaded value is written to %out
1612   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1613   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
1614   %val0 = extractvalue { i32, i1 } %val, 0 ; i32 member of the result pair
1615   store i32 %val0, i32* %out, align 4 ; out[0] = that value
1616   ret void
1617 }
1618
1619 ; GCN-LABEL: {{^}}system_monotonic_monotonic:
1620 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1621 ; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1622 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1623 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1624 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1625 ; GFX8-NOT:  buffer_wbinvl1_vol
1626 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1627 ; GFX10:         .amdhsa_kernel system_monotonic_monotonic
1628 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1629 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1630 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1631 define amdgpu_kernel void @system_monotonic_monotonic(
1632     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1633 entry: ; cmpxchg with no syncscope clause, success=monotonic, failure=monotonic; result is unused
1634   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1635   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in monotonic monotonic
1636   ret void
1637 }
1638
1639 ; GCN-LABEL: {{^}}system_acquire_monotonic:
1640 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1641 ; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1642 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1643 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1644 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1645 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1646 ; GFX8-NEXT:  buffer_wbinvl1_vol
1647 ; GFX10-NEXT: buffer_gl0_inv
1648 ; GFX10-NEXT: buffer_gl1_inv
1649 ; GFX10:         .amdhsa_kernel system_acquire_monotonic
1650 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1651 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1652 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1653 define amdgpu_kernel void @system_acquire_monotonic(
1654     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1655 entry: ; cmpxchg with no syncscope clause, success=acquire, failure=monotonic; result is unused
1656   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1657   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in acquire monotonic
1658   ret void
1659 }
1660
1661 ; GCN-LABEL: {{^}}system_release_monotonic:
1662 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1663 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1664 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1665 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1666 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1667 ; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1668 ; GFX8-NOT:   buffer_wbinvl1_vol
1669 ; GFX10-NOT:  buffer_gl{{[01]}}_inv
1670 ; GFX10:         .amdhsa_kernel system_release_monotonic
1671 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1672 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1673 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1674 define amdgpu_kernel void @system_release_monotonic(
1675     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1676 entry: ; cmpxchg with no syncscope clause, success=release, failure=monotonic; result is unused
1677   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1678   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic
1679   ret void
1680 }
1681
1682 ; GCN-LABEL: {{^}}system_acq_rel_monotonic:
1683 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1684 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1685 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1686 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1687 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1688 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1689 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1690 ; GFX8-NEXT:  buffer_wbinvl1_vol
1691 ; GFX10-NEXT: buffer_gl0_inv
1692 ; GFX10-NEXT: buffer_gl1_inv
1693 ; GFX10:         .amdhsa_kernel system_acq_rel_monotonic
1694 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1695 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1696 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1697 define amdgpu_kernel void @system_acq_rel_monotonic(
1698     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1699 entry: ; cmpxchg with no syncscope clause, success=acq_rel, failure=monotonic; result is unused
1700   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1701   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in acq_rel monotonic
1702   ret void
1703 }
1704
1705 ; GCN-LABEL: {{^}}system_seq_cst_monotonic:
1706 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1707 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1708 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1709 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1710 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1711 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1712 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1713 ; GFX8-NEXT:  buffer_wbinvl1_vol
1714 ; GFX10-NEXT: buffer_gl0_inv
1715 ; GFX10-NEXT: buffer_gl1_inv
1716 ; GFX10:         .amdhsa_kernel system_seq_cst_monotonic
1717 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1718 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1719 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1720 define amdgpu_kernel void @system_seq_cst_monotonic(
1721     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1722 entry: ; cmpxchg with no syncscope clause, success=seq_cst, failure=monotonic; result is unused
1723   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1724   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in seq_cst monotonic
1725   ret void
1726 }
1727
1728 ; GCN-LABEL: {{^}}system_acquire_acquire:
1729 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
1730 ; GFX10-NOT:  s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1731 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1732 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1733 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1734 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1735 ; GFX8-NEXT:  buffer_wbinvl1_vol
1736 ; GFX10-NEXT: buffer_gl0_inv
1737 ; GFX10-NEXT: buffer_gl1_inv
1738 ; GFX10:         .amdhsa_kernel system_acquire_acquire
1739 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1740 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1741 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1742 define amdgpu_kernel void @system_acquire_acquire(
1743     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1744 entry: ; cmpxchg with no syncscope clause, success=acquire, failure=acquire; result is unused
1745   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1746   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in acquire acquire
1747   ret void
1748 }
1749
1750 ; GCN-LABEL: {{^}}system_release_acquire:
1751 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1752 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1753 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1754 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1755 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1756 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1757 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1758 ; GFX8-NEXT:  buffer_wbinvl1_vol
1759 ; GFX10-NEXT: buffer_gl0_inv
1760 ; GFX10-NEXT: buffer_gl1_inv
1761 ; GFX10:         .amdhsa_kernel system_release_acquire
1762 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1763 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1764 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1765 define amdgpu_kernel void @system_release_acquire(
1766     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1767 entry: ; cmpxchg with no syncscope clause, success=release, failure=acquire; result is unused
1768   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1769   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in release acquire
1770   ret void
1771 }
1772
1773 ; GCN-LABEL: {{^}}system_acq_rel_acquire:
1774 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1775 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1776 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1777 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1778 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1779 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1780 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1781 ; GFX8-NEXT:  buffer_wbinvl1_vol
1782 ; GFX10-NEXT: buffer_gl0_inv
1783 ; GFX10-NEXT: buffer_gl1_inv
1784 ; GFX10:         .amdhsa_kernel system_acq_rel_acquire
1785 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1786 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1787 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1788 define amdgpu_kernel void @system_acq_rel_acquire(
1789     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1790 entry: ; cmpxchg with no syncscope clause, success=acq_rel, failure=acquire; result is unused
1791   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1792   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in acq_rel acquire
1793   ret void
1794 }
1795
1796 ; GCN-LABEL: {{^}}system_seq_cst_acquire:
1797 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1798 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1799 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1800 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1801 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1802 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1803 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1804 ; GFX8-NEXT:  buffer_wbinvl1_vol
1805 ; GFX10-NEXT: buffer_gl0_inv
1806 ; GFX10-NEXT: buffer_gl1_inv
1807 ; GFX10:         .amdhsa_kernel system_seq_cst_acquire
1808 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1809 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1810 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1811 define amdgpu_kernel void @system_seq_cst_acquire(
1812     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1813 entry: ; cmpxchg with no syncscope clause, success=seq_cst, failure=acquire; result is unused
1814   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1815   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in seq_cst acquire
1816   ret void
1817 }
1818
1819 ; GCN-LABEL: {{^}}system_seq_cst_seq_cst:
1820 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1821 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
1822 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
1823 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1824 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
1825 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
1826 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
1827 ; GFX8-NEXT:  buffer_wbinvl1_vol
1828 ; GFX10-NEXT: buffer_gl0_inv
1829 ; GFX10-NEXT: buffer_gl1_inv
1830 ; GFX10:         .amdhsa_kernel system_seq_cst_seq_cst
1831 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1832 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1833 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1834 define amdgpu_kernel void @system_seq_cst_seq_cst(
1835     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1836 entry: ; cmpxchg with no syncscope clause, success=seq_cst, failure=seq_cst; result is unused
1837   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1838   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in seq_cst seq_cst
1839   ret void
1840 }
1841
1842 ; GCN-LABEL: {{^}}singlethread_monotonic_monotonic:
1843 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1844 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1845 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1846 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1847 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1848 ; GFX8-NOT:  buffer_wbinvl1_vol
1849 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1850 ; GFX10:         .amdhsa_kernel singlethread_monotonic_monotonic
1851 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1852 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1853 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1854 define amdgpu_kernel void @singlethread_monotonic_monotonic(
1855     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1856 entry: ; "singlethread" cmpxchg, success=monotonic, failure=monotonic; result is unused
1857   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1858   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
1859   ret void
1860 }
1861
1862 ; GCN-LABEL: {{^}}singlethread_acquire_monotonic:
1863 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1864 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1865 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1866 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1867 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1868 ; GFX8-NOT:  buffer_wbinvl1_vol
1869 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1870 ; GFX10:         .amdhsa_kernel singlethread_acquire_monotonic
1871 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1872 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1873 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1874 define amdgpu_kernel void @singlethread_acquire_monotonic(
1875     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1876 entry: ; "singlethread" cmpxchg, success=acquire, failure=monotonic; result is unused
1877   %slot = getelementptr i32, i32* %out, i32 4 ; &out[4]
1878   %xchg = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
1879   ret void
1880 }
1881
1882 ; GCN-LABEL: {{^}}singlethread_release_monotonic:
1883 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1884 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1885 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1886 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1887 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1888 ; GFX8-NOT:  buffer_wbinvl1_vol
1889 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1890 ; GFX10:         .amdhsa_kernel singlethread_release_monotonic
1891 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1892 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1893 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1894 define amdgpu_kernel void @singlethread_release_monotonic(
1895     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1896 entry: ; "singlethread" cmpxchg, success=release, failure=monotonic; result is unused
1897   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1898   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
1899   ret void
1900 }
1901
1902 ; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic:
1903 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1904 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1905 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1906 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1907 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1908 ; GFX8-NOT:  buffer_wbinvl1_vol
1909 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1910 ; GFX10:         .amdhsa_kernel singlethread_acq_rel_monotonic
1911 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1912 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1913 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1914 define amdgpu_kernel void @singlethread_acq_rel_monotonic(
1915     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1916 entry: ; "singlethread" cmpxchg, success=acq_rel, failure=monotonic; result is unused
1917   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1918   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
1919   ret void
1920 }
1921
1922 ; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic:
1923 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1924 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1925 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1926 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1927 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1928 ; GFX8-NOT:  buffer_wbinvl1_vol
1929 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1930 ; GFX10:         .amdhsa_kernel singlethread_seq_cst_monotonic
1931 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1932 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1933 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1934 define amdgpu_kernel void @singlethread_seq_cst_monotonic(
1935     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1936 entry: ; "singlethread" cmpxchg, success=seq_cst, failure=monotonic; result is unused
1937   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1938   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
1939   ret void
1940 }
1941
1942 ; GCN-LABEL: {{^}}singlethread_acquire_acquire:
1943 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1944 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1945 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1946 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1947 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1948 ; GFX8-NOT:  buffer_wbinvl1_vol
1949 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1950 ; GFX10:         .amdhsa_kernel singlethread_acquire_acquire
1951 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1952 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1953 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1954 define amdgpu_kernel void @singlethread_acquire_acquire(
1955     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1956 entry: ; "singlethread" cmpxchg, success=acquire, failure=acquire; result is unused
1957   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1958   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
1959   ret void
1960 }
1961
1962 ; GCN-LABEL: {{^}}singlethread_release_acquire:
1963 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1964 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1965 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1966 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1967 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1968 ; GFX8-NOT:  buffer_wbinvl1_vol
1969 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1970 ; GFX10:         .amdhsa_kernel singlethread_release_acquire
1971 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1972 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1973 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1974 define amdgpu_kernel void @singlethread_release_acquire(
1975     i32* %out, i32 %in, i32 %old) { ; %out: base pointer, %in: new value, %old: expected value
1976 entry: ; "singlethread" cmpxchg, success=release, failure=acquire; result is unused
1977   %gep = getelementptr i32, i32* %out, i32 4 ; &out[4]
1978   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
1979   ret void
1980 }
1981
1982 ; GCN-LABEL: {{^}}singlethread_acq_rel_acquire:
1983 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1984 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1985 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
1986 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
1987 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1988 ; GFX8-NOT:  buffer_wbinvl1_vol
1989 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1990 ; GFX10:         .amdhsa_kernel singlethread_acq_rel_acquire
1991 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
1992 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
1993 ; GFX10-NOT:     .amdhsa_memory_ordered 0
1994 define amdgpu_kernel void @singlethread_acq_rel_acquire( ; syncscope("singlethread") cmpxchg, success acq_rel / failure acquire; expected to lower to flat_atomic_cmpswap
1995     i32* %out, i32 %in, i32 %old) {
1996 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
1997   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
1998   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire ; result is unused, only the emitted code is checked
1999   ret void
2000 }
2001
2002 ; GCN-LABEL: {{^}}singlethread_seq_cst_acquire:
2003 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2004 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2005 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2006 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2007 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2008 ; GFX8-NOT:  buffer_wbinvl1_vol
2009 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2010 ; GFX10:         .amdhsa_kernel singlethread_seq_cst_acquire
2011 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2012 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2013 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2014 define amdgpu_kernel void @singlethread_seq_cst_acquire( ; syncscope("singlethread") cmpxchg, success seq_cst / failure acquire; expected to lower to flat_atomic_cmpswap
2015     i32* %out, i32 %in, i32 %old) {
2016 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2017   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2018   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire ; result is unused, only the emitted code is checked
2019   ret void
2020 }
2021
2022 ; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst:
2023 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2024 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2025 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2026 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2027 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2028 ; GFX8-NOT:  buffer_wbinvl1_vol
2029 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2030 ; GFX10:         .amdhsa_kernel singlethread_seq_cst_seq_cst
2031 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2032 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2033 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2034 define amdgpu_kernel void @singlethread_seq_cst_seq_cst( ; syncscope("singlethread") cmpxchg, success seq_cst / failure seq_cst; expected to lower to flat_atomic_cmpswap
2035     i32* %out, i32 %in, i32 %old) {
2036 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2037   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2038   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ; result is unused, only the emitted code is checked
2039   ret void
2040 }
2041
2042 ; GCN-LABEL: {{^}}agent_monotonic_monotonic:
2043 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2044 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2045 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2046 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2047 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2048 ; GFX8-NOT:  buffer_wbinvl1_vol
2049 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2050 ; GFX10:         .amdhsa_kernel agent_monotonic_monotonic
2051 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2052 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2053 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2054 define amdgpu_kernel void @agent_monotonic_monotonic( ; syncscope("agent") cmpxchg, success monotonic / failure monotonic; expected to lower to flat_atomic_cmpswap
2055     i32* %out, i32 %in, i32 %old) {
2056 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2057   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2058   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic ; result is unused, only the emitted code is checked
2059   ret void
2060 }
2061
2062 ; GCN-LABEL: {{^}}agent_acquire_monotonic:
2063 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2064 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2065 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2066 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2067 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2068 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2069 ; GFX8-NEXT:  buffer_wbinvl1_vol
2070 ; GFX10-NEXT: buffer_gl0_inv
2071 ; GFX10-NEXT: buffer_gl1_inv
2072 ; GFX10:         .amdhsa_kernel agent_acquire_monotonic
2073 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2074 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2075 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2076 define amdgpu_kernel void @agent_acquire_monotonic( ; agent-scope cmpxchg, success acquire / failure monotonic
2077     i32* %out, i32 %in, i32 %old) {
2078 body: ; the checks above expect waits and cache invalidates directly after the atomic, none before it
2079   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2080   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") acquire monotonic ; result intentionally unused
2081   ret void
2082 }
2083
2084 ; GCN-LABEL: {{^}}agent_release_monotonic:
2085 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2086 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2087 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2088 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2089 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2090 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2091 ; GCN-NOT:    buffer_{{wbinvl1_vol|gl._inv}}
2092 ; GFX10:         .amdhsa_kernel agent_release_monotonic
2093 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2094 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2095 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2096 define amdgpu_kernel void @agent_release_monotonic( ; agent-scope cmpxchg, success release / failure monotonic
2097     i32* %out, i32 %in, i32 %old) {
2098 body: ; the checks above expect waits directly before the atomic and no waits or invalidates after it
2099   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2100   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") release monotonic ; result intentionally unused
2101   ret void
2102 }
2103
2104 ; GCN-LABEL: {{^}}agent_acq_rel_monotonic:
2105 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2106 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2107 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2108 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2109 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2110 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2111 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2112 ; GFX8-NEXT:  buffer_wbinvl1_vol
2113 ; GFX10-NEXT: buffer_gl0_inv
2114 ; GFX10-NEXT: buffer_gl1_inv
2115 ; GFX10:         .amdhsa_kernel agent_acq_rel_monotonic
2116 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2117 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2118 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2119 define amdgpu_kernel void @agent_acq_rel_monotonic( ; agent-scope cmpxchg, success acq_rel / failure monotonic
2120     i32* %out, i32 %in, i32 %old) {
2121 body: ; the checks above expect waits before the atomic, then waits and cache invalidates directly after it
2122   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2123   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") acq_rel monotonic ; result intentionally unused
2124   ret void
2125 }
2126
2127 ; GCN-LABEL: {{^}}agent_seq_cst_monotonic:
2128 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2129 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2130 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2131 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2132 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2133 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2134 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2135 ; GFX8-NEXT:  buffer_wbinvl1_vol
2136 ; GFX10-NEXT: buffer_gl0_inv
2137 ; GFX10-NEXT: buffer_gl1_inv
2138 ; GFX10:         .amdhsa_kernel agent_seq_cst_monotonic
2139 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2140 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2141 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2142 define amdgpu_kernel void @agent_seq_cst_monotonic( ; agent-scope cmpxchg, success seq_cst / failure monotonic
2143     i32* %out, i32 %in, i32 %old) {
2144 body: ; the checks above expect waits before the atomic, then waits and cache invalidates directly after it
2145   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2146   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") seq_cst monotonic ; result intentionally unused
2147   ret void
2148 }
2149
2150 ; GCN-LABEL: {{^}}agent_acquire_acquire:
2151 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2152 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2153 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2154 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2156 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2157 ; GFX8-NEXT:  buffer_wbinvl1_vol
2158 ; GFX10-NEXT: buffer_gl0_inv
2159 ; GFX10-NEXT: buffer_gl1_inv
2160 ; GFX10:         .amdhsa_kernel agent_acquire_acquire
2161 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2162 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2163 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2164 define amdgpu_kernel void @agent_acquire_acquire( ; agent-scope cmpxchg, success acquire / failure acquire
2165     i32* %out, i32 %in, i32 %old) {
2166 body: ; the checks above expect waits and cache invalidates directly after the atomic, none before it
2167   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2168   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") acquire acquire ; result intentionally unused
2169   ret void
2170 }
2171
2172 ; GCN-LABEL: {{^}}agent_release_acquire:
2173 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2174 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2175 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2176 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2177 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2178 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2179 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2180 ; GFX8-NEXT:  buffer_wbinvl1_vol
2181 ; GFX10-NEXT: buffer_gl0_inv
2182 ; GFX10-NEXT: buffer_gl1_inv
2183 ; GFX10:         .amdhsa_kernel agent_release_acquire
2184 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2185 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2186 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2187 define amdgpu_kernel void @agent_release_acquire( ; agent-scope cmpxchg, success release / failure acquire
2188     i32* %out, i32 %in, i32 %old) {
2189 body: ; the checks above expect waits before the atomic, then waits and cache invalidates directly after it
2190   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2191   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") release acquire ; result intentionally unused
2192   ret void
2193 }
2194
2195 ; GCN-LABEL: {{^}}agent_acq_rel_acquire:
2196 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2197 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2198 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2199 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2200 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2201 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2202 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2203 ; GFX8-NEXT:  buffer_wbinvl1_vol
2204 ; GFX10-NEXT: buffer_gl0_inv
2205 ; GFX10-NEXT: buffer_gl1_inv
2206 ; GFX10:         .amdhsa_kernel agent_acq_rel_acquire
2207 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2208 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2209 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2210 define amdgpu_kernel void @agent_acq_rel_acquire( ; agent-scope cmpxchg, success acq_rel / failure acquire
2211     i32* %out, i32 %in, i32 %old) {
2212 body: ; the checks above expect waits before the atomic, then waits and cache invalidates directly after it
2213   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2214   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") acq_rel acquire ; result intentionally unused
2215   ret void
2216 }
2217
2218 ; GCN-LABEL: {{^}}agent_seq_cst_acquire:
2219 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2220 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2221 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2222 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2223 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2224 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2225 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2226 ; GFX8-NEXT:  buffer_wbinvl1_vol
2227 ; GFX10-NEXT: buffer_gl0_inv
2228 ; GFX10-NEXT: buffer_gl1_inv
2229 ; GFX10:         .amdhsa_kernel agent_seq_cst_acquire
2230 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2231 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2232 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2233 define amdgpu_kernel void @agent_seq_cst_acquire( ; agent-scope cmpxchg, success seq_cst / failure acquire
2234     i32* %out, i32 %in, i32 %old) {
2235 body: ; the checks above expect waits before the atomic, then waits and cache invalidates directly after it
2236   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2237   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") seq_cst acquire ; result intentionally unused
2238   ret void
2239 }
2240
2241 ; GCN-LABEL: {{^}}agent_seq_cst_seq_cst:
2242 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2243 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2244 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2245 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2246 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2247 ; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
2248 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2249 ; GFX8-NEXT:  buffer_wbinvl1_vol
2250 ; GFX10-NEXT: buffer_gl0_inv
2251 ; GFX10-NEXT: buffer_gl1_inv
2252 ; GFX10:         .amdhsa_kernel agent_seq_cst_seq_cst
2253 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2254 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2255 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2256 define amdgpu_kernel void @agent_seq_cst_seq_cst( ; agent-scope cmpxchg, success seq_cst / failure seq_cst
2257     i32* %out, i32 %in, i32 %old) {
2258 body: ; the checks above expect waits before the atomic, then waits and cache invalidates directly after it
2259   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2260   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ; result intentionally unused
2261   ret void
2262 }
2263
2264 ; GCN-LABEL: {{^}}workgroup_monotonic_monotonic:
2265 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2266 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2267 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2268 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2269 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2270 ; GFX8-NOT:  buffer_wbinvl1_vol
2271 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2272 ; GFX10:         .amdhsa_kernel workgroup_monotonic_monotonic
2273 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2274 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2275 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2276 define amdgpu_kernel void @workgroup_monotonic_monotonic( ; syncscope("workgroup") cmpxchg, success monotonic / failure monotonic; expected to lower to flat_atomic_cmpswap
2277     i32* %out, i32 %in, i32 %old) {
2278 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2279   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2280   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic ; result is unused, only the emitted code is checked
2281   ret void
2282 }
2283
2284 ; GCN-LABEL:     {{^}}workgroup_acquire_monotonic:
2285 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
2286 ; GFX10-NOT:     s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2287 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2288 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2289 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2290 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2291 ; GFX10WGP-NEXT: buffer_gl0_inv
2292 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2293 ; GFX10CU-NOT:   buffer_gl0_inv
2294 ; GFX8-NOT:      buffer_wbinvl1_vol
2295 ; GFX10:         .amdhsa_kernel workgroup_acquire_monotonic
2296 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2297 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2298 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2299 define amdgpu_kernel void @workgroup_acquire_monotonic( ; workgroup-scope cmpxchg, success acquire / failure monotonic
2300     i32* %out, i32 %in, i32 %old) {
2301 body: ; only the GFX10 WGP-mode run expects waits and buffer_gl0_inv after the atomic; the GFX8 and CU-mode runs carry NOT checks instead
2302   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2303   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") acquire monotonic ; result intentionally unused
2304   ret void
2305 }
2306
2307 ; GCN-LABEL:     {{^}}workgroup_release_monotonic:
2308 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2309 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2310 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2311 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
2312 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2313 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2314 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
2315 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2316 ; GCN-NOT:       buffer_{{wbinvl1_vol|gl._inv}}
2317 ; GFX10:         .amdhsa_kernel workgroup_release_monotonic
2318 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2319 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2320 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2321 define amdgpu_kernel void @workgroup_release_monotonic( ; workgroup-scope cmpxchg, success release / failure monotonic
2322     i32* %out, i32 %in, i32 %old) {
2323 body: ; only the GFX10 WGP-mode run expects waits before the atomic; every run expects no waits or invalidates after it
2324   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2325   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") release monotonic ; result intentionally unused
2326   ret void
2327 }
2328
2329 ; GCN-LABEL:     {{^}}workgroup_acq_rel_monotonic:
2330 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2331 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2332 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2333 ; GFX10CU-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2334 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2335 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2336 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2337 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2338 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2339 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2340 ; GFX8-NOT:      buffer_wbinvl1_vol
2341 ; GFX10WGP-NEXT: buffer_gl0_inv
2342 ; GFX10CU-NOT:   buffer_gl0_inv
2343 ; GFX10:         .amdhsa_kernel workgroup_acq_rel_monotonic
2344 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2345 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2346 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2347 define amdgpu_kernel void @workgroup_acq_rel_monotonic( ; workgroup-scope cmpxchg, success acq_rel / failure monotonic
2348     i32* %out, i32 %in, i32 %old) {
2349 body: ; only the GFX10 WGP-mode run expects waits on both sides of the atomic plus buffer_gl0_inv after it; the GFX8 and CU-mode runs carry NOT checks instead
2350   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2351   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic ; result intentionally unused
2352   ret void
2353 }
2354
2355 ; GCN-LABEL:     {{^}}workgroup_seq_cst_monotonic:
2356 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2357 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2358 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2359 ; GFX10CU-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2360 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2361 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2362 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2363 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2364 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2365 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2366 ; GFX8-NOT:      buffer_wbinvl1_vol
2367 ; GFX10WGP-NEXT: buffer_gl0_inv
2368 ; GFX10CU-NOT:   buffer_gl0_inv
2369 ; GFX10:         .amdhsa_kernel workgroup_seq_cst_monotonic
2370 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2371 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2372 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2373 define amdgpu_kernel void @workgroup_seq_cst_monotonic( ; workgroup-scope cmpxchg, success seq_cst / failure monotonic
2374     i32* %out, i32 %in, i32 %old) {
2375 body: ; only the GFX10 WGP-mode run expects waits on both sides of the atomic plus buffer_gl0_inv after it; the GFX8 and CU-mode runs carry NOT checks instead
2376   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2377   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic ; result intentionally unused
2378   ret void
2379 }
2380
2381 ; GCN-LABEL:     {{^}}workgroup_acquire_acquire:
2382 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
2383 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2384 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2385 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2386 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2387 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2388 ; GFX10WGP-NEXT: buffer_gl0_inv
2389 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2390 ; GFX10CU-NOT:   buffer_gl0_inv
2391 ; GFX8-NOT:      buffer_wbinvl1_vol
2392 ; GFX10:         .amdhsa_kernel workgroup_acquire_acquire
2393 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2394 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2395 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2396 define amdgpu_kernel void @workgroup_acquire_acquire( ; workgroup-scope cmpxchg, success acquire / failure acquire
2397     i32* %out, i32 %in, i32 %old) {
2398 body: ; only the GFX10 WGP-mode run expects waits and buffer_gl0_inv after the atomic; the GFX8 and CU-mode runs carry NOT checks instead
2399   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2400   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") acquire acquire ; result intentionally unused
2401   ret void
2402 }
2403
2404 ; GCN-LABEL:     {{^}}workgroup_release_acquire:
2405 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2406 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2407 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2408 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
2409 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2410 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2411 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2412 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2413 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2414 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2415 ; GFX8-NOT:      buffer_wbinvl1_vol
2416 ; GFX10WGP-NEXT: buffer_gl0_inv
2417 ; GFX10CU-NOT:   buffer_gl0_inv
2418 ; GFX10:         .amdhsa_kernel workgroup_release_acquire
2419 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2420 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2421 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2422 define amdgpu_kernel void @workgroup_release_acquire( ; workgroup-scope cmpxchg, success release / failure acquire
2423     i32* %out, i32 %in, i32 %old) {
2424 body: ; only the GFX10 WGP-mode run expects waits on both sides of the atomic plus buffer_gl0_inv after it; the GFX8 and CU-mode runs carry NOT checks instead
2425   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2426   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") release acquire ; result intentionally unused
2427   ret void
2428 }
2429
2430 ; GCN-LABEL:     {{^}}workgroup_acq_rel_acquire:
2431 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2432 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2433 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2434 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
2435 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2436 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2437 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2438 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2439 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2440 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2441 ; GFX8-NOT:      buffer_wbinvl1_vol
2442 ; GFX10WGP-NEXT: buffer_gl0_inv
2443 ; GFX10CU-NOT:   buffer_gl0_inv
2444 ; GFX10:         .amdhsa_kernel workgroup_acq_rel_acquire
2445 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2446 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2447 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2448 define amdgpu_kernel void @workgroup_acq_rel_acquire( ; syncscope("workgroup") cmpxchg, success acq_rel / failure acquire; expected to lower to flat_atomic_cmpswap
2449     i32* %out, i32 %in, i32 %old) {
2450 entry: ; only the GFX10 WGP-mode run expects waits on both sides of the atomic, with buffer_gl0_inv on the line right after the trailing waits
2451   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2452   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire ; result is unused, only the emitted code is checked
2453   ret void
2454 }
2455
2456 ; GCN-LABEL:     {{^}}workgroup_seq_cst_acquire:
2457 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2458 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2459 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2460 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
2461 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2462 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2463 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2464 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2465 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2466 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2467 ; GFX8-NOT:      buffer_wbinvl1_vol
2468 ; GFX10WGP-NEXT: buffer_gl0_inv
2469 ; GFX10CU-NOT:   buffer_gl0_inv
2470 ; GFX10:         .amdhsa_kernel workgroup_seq_cst_acquire
2471 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2472 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2473 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2474 define amdgpu_kernel void @workgroup_seq_cst_acquire( ; workgroup-scope cmpxchg, success seq_cst / failure acquire
2475     i32* %out, i32 %in, i32 %old) {
2476 body: ; only the GFX10 WGP-mode run expects waits on both sides of the atomic plus buffer_gl0_inv after it; the GFX8 and CU-mode runs carry NOT checks instead
2477   %slot = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2478   %pair = cmpxchg volatile i32* %slot, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire ; result intentionally unused
2479   ret void
2480 }
2481
2482 ; GCN-LABEL:     {{^}}workgroup_seq_cst_seq_cst:
2483 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2484 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2485 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2486 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
2487 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2488 ; GCN:           flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2489 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
2490 ; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}}
2491 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
2492 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
2493 ; GFX8-NOT:      buffer_wbinvl1_vol
2494 ; GFX10WGP-NEXT: buffer_gl0_inv
2495 ; GFX10CU-NOT:   buffer_gl0_inv
2496 ; GFX10:         .amdhsa_kernel workgroup_seq_cst_seq_cst
2497 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2498 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2499 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2500 define amdgpu_kernel void @workgroup_seq_cst_seq_cst( ; syncscope("workgroup") cmpxchg, success seq_cst / failure seq_cst; expected to lower to flat_atomic_cmpswap
2501     i32* %out, i32 %in, i32 %old) {
2502 entry: ; only the GFX10 WGP-mode run expects waits on both sides of the atomic, with buffer_gl0_inv on the line right after the trailing waits
2503   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2504   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst ; result is unused, only the emitted code is checked
2505   ret void
2506 }
2507
2508 ; GCN-LABEL: {{^}}wavefront_monotonic_monotonic:
2509 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2510 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2511 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2512 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2513 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2514 ; GFX8-NOT:  buffer_wbinvl1_vol
2515 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2516 ; GFX10:         .amdhsa_kernel wavefront_monotonic_monotonic
2517 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2518 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2519 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2520 define amdgpu_kernel void @wavefront_monotonic_monotonic( ; syncscope("wavefront") cmpxchg, success monotonic / failure monotonic; expected to lower to flat_atomic_cmpswap
2521     i32* %out, i32 %in, i32 %old) {
2522 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2523   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2524   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic ; result is unused, only the emitted code is checked
2525   ret void
2526 }
2527
2528 ; GCN-LABEL: {{^}}wavefront_acquire_monotonic:
2529 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2530 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2531 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2532 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2533 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2534 ; GFX8-NOT:  buffer_wbinvl1_vol
2535 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2536 ; GFX10:         .amdhsa_kernel wavefront_acquire_monotonic
2537 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2538 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2539 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2540 define amdgpu_kernel void @wavefront_acquire_monotonic( ; syncscope("wavefront") cmpxchg, success acquire / failure monotonic; expected to lower to flat_atomic_cmpswap
2541     i32* %out, i32 %in, i32 %old) {
2542 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2543   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2544   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic ; result is unused, only the emitted code is checked
2545   ret void
2546 }
2547
2548 ; GCN-LABEL: {{^}}wavefront_release_monotonic:
2549 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2550 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2551 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2552 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2553 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2554 ; GFX8-NOT:  buffer_wbinvl1_vol
2555 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2556 ; GFX10:         .amdhsa_kernel wavefront_release_monotonic
2557 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2558 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2559 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2560 define amdgpu_kernel void @wavefront_release_monotonic( ; syncscope("wavefront") cmpxchg, success release / failure monotonic; expected to lower to flat_atomic_cmpswap
2561     i32* %out, i32 %in, i32 %old) {
2562 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2563   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2564   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic ; result is unused, only the emitted code is checked
2565   ret void
2566 }
2567
2568 ; GCN-LABEL: {{^}}wavefront_acq_rel_monotonic:
2569 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2570 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2571 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2572 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2573 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2574 ; GFX8-NOT:  buffer_wbinvl1_vol
2575 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2576 ; GFX10:         .amdhsa_kernel wavefront_acq_rel_monotonic
2577 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2578 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2579 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2580 define amdgpu_kernel void @wavefront_acq_rel_monotonic( ; syncscope("wavefront") cmpxchg, success acq_rel / failure monotonic; expected to lower to flat_atomic_cmpswap
2581     i32* %out, i32 %in, i32 %old) {
2582 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2583   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2584   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic ; result is unused, only the emitted code is checked
2585   ret void
2586 }
2587
2588 ; GCN-LABEL: {{^}}wavefront_seq_cst_monotonic:
2589 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2590 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2591 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2592 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2593 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2594 ; GFX8-NOT:  buffer_wbinvl1_vol
2595 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2596 ; GFX10:         .amdhsa_kernel wavefront_seq_cst_monotonic
2597 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2598 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2599 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2600 define amdgpu_kernel void @wavefront_seq_cst_monotonic( ; syncscope("wavefront") cmpxchg, success seq_cst / failure monotonic; expected to lower to flat_atomic_cmpswap
2601     i32* %out, i32 %in, i32 %old) {
2602 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2603   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2604   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic ; result is unused, only the emitted code is checked
2605   ret void
2606 }
2607
2608 ; GCN-LABEL: {{^}}wavefront_acquire_acquire:
2609 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2610 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2611 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2612 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2613 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2614 ; GFX8-NOT:  buffer_wbinvl1_vol
2615 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2616 ; GFX10:         .amdhsa_kernel wavefront_acquire_acquire
2617 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2618 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2619 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2620 define amdgpu_kernel void @wavefront_acquire_acquire( ; syncscope("wavefront") cmpxchg, success acquire / failure acquire; expected to lower to flat_atomic_cmpswap
2621     i32* %out, i32 %in, i32 %old) {
2622 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2623   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2624   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire ; result is unused, only the emitted code is checked
2625   ret void
2626 }
2627
2628 ; GCN-LABEL: {{^}}wavefront_release_acquire:
2629 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2630 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2631 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2632 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2633 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2634 ; GFX8-NOT:  buffer_wbinvl1_vol
2635 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2636 ; GFX10:         .amdhsa_kernel wavefront_release_acquire
2637 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2638 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2639 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2640 define amdgpu_kernel void @wavefront_release_acquire( ; syncscope("wavefront") cmpxchg, success release / failure acquire; expected to lower to flat_atomic_cmpswap
2641     i32* %out, i32 %in, i32 %old) {
2642 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2643   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2644   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire ; result is unused, only the emitted code is checked
2645   ret void
2646 }
2647
2648 ; GCN-LABEL: {{^}}wavefront_acq_rel_acquire:
2649 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2650 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2651 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2652 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2653 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2654 ; GFX8-NOT:  buffer_wbinvl1_vol
2655 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2656 ; GFX10:         .amdhsa_kernel wavefront_acq_rel_acquire
2657 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2658 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2659 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2660 define amdgpu_kernel void @wavefront_acq_rel_acquire( ; syncscope("wavefront") cmpxchg, success acq_rel / failure acquire; expected to lower to flat_atomic_cmpswap
2661     i32* %out, i32 %in, i32 %old) {
2662 entry: ; the checks above expect no vmcnt(0)/vscnt waits and no cache invalidates around the atomic
2663   %gep = getelementptr i32, i32* %out, i32 4 ; address of out[4]
2664   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire ; result is unused, only the emitted code is checked
2665   ret void
2666 }
2667
; Wavefront-scope cmpxchg with success=seq_cst / failure=acquire; the result
; is unused. The checks expect a bare flat_atomic_cmpswap: no zero-count
; vmcnt/vscnt wait around it and none of buffer_wbinvl1_vol, buffer_gl0_inv
; or buffer_gl1_inv after it.
2668 ; GCN-LABEL: {{^}}wavefront_seq_cst_acquire:
2669 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2670 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2671 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2672 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2673 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2674 ; GFX8-NOT:  buffer_wbinvl1_vol
2675 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2676 ; GFX10:         .amdhsa_kernel wavefront_seq_cst_acquire
2677 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2678 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2679 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2680 define amdgpu_kernel void @wavefront_seq_cst_acquire(
2681     i32* %out, i32 %in, i32 %old) {
2682 entry:
2683   %gep = getelementptr i32, i32* %out, i32 4
2684   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
2685   ret void
2686 }
2687
; Wavefront-scope cmpxchg with success=seq_cst / failure=seq_cst; the result
; is unused. The checks expect a bare flat_atomic_cmpswap: no zero-count
; vmcnt/vscnt wait around it and none of buffer_wbinvl1_vol, buffer_gl0_inv
; or buffer_gl1_inv after it.
2688 ; GCN-LABEL: {{^}}wavefront_seq_cst_seq_cst:
2689 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2690 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2691 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
2692 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
2693 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2694 ; GFX8-NOT:  buffer_wbinvl1_vol
2695 ; GFX10-NOT: buffer_gl{{[01]}}_inv
2696 ; GFX10:         .amdhsa_kernel wavefront_seq_cst_seq_cst
2697 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2698 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2699 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2700 define amdgpu_kernel void @wavefront_seq_cst_seq_cst(
2701     i32* %out, i32 %in, i32 %old) {
2702 entry:
2703   %gep = getelementptr i32, i32* %out, i32 4
2704   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
2705   ret void
2706 }
2707
; Default-scope (no syncscope) cmpxchg with success=acquire /
; failure=monotonic; element 0 of the result is stored to %out.
; The checks expect no zero-count vmcnt/vscnt wait ahead of the glc
; flat_atomic_cmpswap and, directly after it, a "vmcnt(0) lgkmcnt(0)" wait
; followed by buffer_wbinvl1_vol (GFX8) or buffer_gl0_inv then buffer_gl1_inv
; (GFX10).
2708 ; GCN-LABEL: {{^}}system_acquire_monotonic_ret:
2709 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2710 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2711 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2712 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2713 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2714 ; GFX8-NEXT:  buffer_wbinvl1_vol
2715 ; GFX10-NEXT: buffer_gl0_inv
2716 ; GFX10-NEXT: buffer_gl1_inv
2717 ; GFX10:         .amdhsa_kernel system_acquire_monotonic_ret
2718 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2719 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2720 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2721 define amdgpu_kernel void @system_acquire_monotonic_ret(
2722     i32* %out, i32 %in, i32 %old) {
2723 entry:
2724   %gep = getelementptr i32, i32* %out, i32 4
2725   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic
2726   %val0 = extractvalue { i32, i1 } %val, 0
2727   store i32 %val0, i32* %out, align 4
2728   ret void
2729 }
2730
; Default-scope (no syncscope) cmpxchg with success=acq_rel /
; failure=monotonic; element 0 of the result is stored to %out.
; The checks expect a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10) and, directly after it, a "vmcnt(0) lgkmcnt(0)" wait followed by
; buffer_wbinvl1_vol (GFX8) or buffer_gl0_inv then buffer_gl1_inv (GFX10).
2731 ; GCN-LABEL: {{^}}system_acq_rel_monotonic_ret:
; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2732 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2733 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2734 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2735 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2736 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2737 ; GFX8-NEXT:  buffer_wbinvl1_vol
2738 ; GFX10-NEXT: buffer_gl0_inv
2739 ; GFX10-NEXT: buffer_gl1_inv
2740 ; GFX10:         .amdhsa_kernel system_acq_rel_monotonic_ret
2741 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2742 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2743 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2744 define amdgpu_kernel void @system_acq_rel_monotonic_ret(
2745     i32* %out, i32 %in, i32 %old) {
2746 entry:
2747   %gep = getelementptr i32, i32* %out, i32 4
2748   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic
2749   %val0 = extractvalue { i32, i1 } %val, 0
2750   store i32 %val0, i32* %out, align 4
2751   ret void
2752 }
2753
; Default-scope (no syncscope) cmpxchg with success=seq_cst /
; failure=monotonic; element 0 of the result is stored to %out.
; The checks expect a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10) and, directly after it, a "vmcnt(0) lgkmcnt(0)" wait followed by
; buffer_wbinvl1_vol (GFX8) or buffer_gl0_inv then buffer_gl1_inv (GFX10).
2754 ; GCN-LABEL: {{^}}system_seq_cst_monotonic_ret:
; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2755 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2756 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2757 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2758 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2759 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2760 ; GFX8-NEXT:  buffer_wbinvl1_vol
2761 ; GFX10-NEXT: buffer_gl0_inv
2762 ; GFX10-NEXT: buffer_gl1_inv
2763 ; GFX10:         .amdhsa_kernel system_seq_cst_monotonic_ret
2764 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2765 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2766 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2767 define amdgpu_kernel void @system_seq_cst_monotonic_ret(
2768     i32* %out, i32 %in, i32 %old) {
2769 entry:
2770   %gep = getelementptr i32, i32* %out, i32 4
2771   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic
2772   %val0 = extractvalue { i32, i1 } %val, 0
2773   store i32 %val0, i32* %out, align 4
2774   ret void
2775 }
2776
; Default-scope (no syncscope) cmpxchg, success=acquire / failure=acquire,
; whose loaded value (result element 0) is written back to %out.
; Expected: no zero-count vmcnt/vscnt wait before the glc flat_atomic_cmpswap;
; directly after it a "vmcnt(0) lgkmcnt(0)" wait, then buffer_wbinvl1_vol on
; GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2777 ; GCN-LABEL: {{^}}system_acquire_acquire_ret:
2778 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2779 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2780 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2781 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2782 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2783 ; GFX8-NEXT:  buffer_wbinvl1_vol
2784 ; GFX10-NEXT: buffer_gl0_inv
2785 ; GFX10-NEXT: buffer_gl1_inv
2786 ; GFX10:         .amdhsa_kernel system_acquire_acquire_ret
2787 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2788 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2789 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2790 define amdgpu_kernel void @system_acquire_acquire_ret(
2791     i32* %out, i32 %in, i32 %old) {
2792 entry:
2793   %elt = getelementptr i32, i32* %out, i32 4
2794   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in acquire acquire
2795   %prior = extractvalue { i32, i1 } %cas, 0
2796   store i32 %prior, i32* %out, align 4
2797   ret void
2798 }
2799
; Default-scope (no syncscope) cmpxchg, success=release / failure=acquire,
; whose loaded value (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2800 ; GCN-LABEL: {{^}}system_release_acquire_ret:
2801 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2802 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2803 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2804 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2805 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2806 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2807 ; GFX8-NEXT:  buffer_wbinvl1_vol
2808 ; GFX10-NEXT: buffer_gl0_inv
2809 ; GFX10-NEXT: buffer_gl1_inv
2810 ; GFX10:         .amdhsa_kernel system_release_acquire_ret
2811 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2812 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2813 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2814 define amdgpu_kernel void @system_release_acquire_ret(
2815     i32* %out, i32 %in, i32 %old) {
2816 entry:
2817   %elt = getelementptr i32, i32* %out, i32 4
2818   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in release acquire
2819   %prior = extractvalue { i32, i1 } %cas, 0
2820   store i32 %prior, i32* %out, align 4
2821   ret void
2822 }
2823
; Default-scope (no syncscope) cmpxchg, success=acq_rel / failure=acquire,
; whose loaded value (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2824 ; GCN-LABEL: {{^}}system_acq_rel_acquire_ret:
2825 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2826 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2827 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2828 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2829 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2830 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2831 ; GFX8-NEXT:  buffer_wbinvl1_vol
2832 ; GFX10-NEXT: buffer_gl0_inv
2833 ; GFX10-NEXT: buffer_gl1_inv
2834 ; GFX10:         .amdhsa_kernel system_acq_rel_acquire_ret
2835 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2836 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2837 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2838 define amdgpu_kernel void @system_acq_rel_acquire_ret(
2839     i32* %out, i32 %in, i32 %old) {
2840 entry:
2841   %elt = getelementptr i32, i32* %out, i32 4
2842   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in acq_rel acquire
2843   %prior = extractvalue { i32, i1 } %cas, 0
2844   store i32 %prior, i32* %out, align 4
2845   ret void
2846 }
2847
; Default-scope (no syncscope) cmpxchg, success=seq_cst / failure=acquire,
; whose loaded value (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2848 ; GCN-LABEL: {{^}}system_seq_cst_acquire_ret:
2849 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2850 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2851 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2852 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2853 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2854 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2855 ; GFX8-NEXT:  buffer_wbinvl1_vol
2856 ; GFX10-NEXT: buffer_gl0_inv
2857 ; GFX10-NEXT: buffer_gl1_inv
2858 ; GFX10:         .amdhsa_kernel system_seq_cst_acquire_ret
2859 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2860 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2861 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2862 define amdgpu_kernel void @system_seq_cst_acquire_ret(
2863     i32* %out, i32 %in, i32 %old) {
2864 entry:
2865   %elt = getelementptr i32, i32* %out, i32 4
2866   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in seq_cst acquire
2867   %prior = extractvalue { i32, i1 } %cas, 0
2868   store i32 %prior, i32* %out, align 4
2869   ret void
2870 }
2871
; Default-scope (no syncscope) cmpxchg, success=seq_cst / failure=seq_cst,
; whose loaded value (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2872 ; GCN-LABEL: {{^}}system_seq_cst_seq_cst_ret:
2873 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2874 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2875 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2876 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2877 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2878 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2879 ; GFX8-NEXT:  buffer_wbinvl1_vol
2880 ; GFX10-NEXT: buffer_gl0_inv
2881 ; GFX10-NEXT: buffer_gl1_inv
2882 ; GFX10:         .amdhsa_kernel system_seq_cst_seq_cst_ret
2883 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2884 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2885 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2886 define amdgpu_kernel void @system_seq_cst_seq_cst_ret(
2887     i32* %out, i32 %in, i32 %old) {
2888 entry:
2889   %elt = getelementptr i32, i32* %out, i32 4
2890   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in seq_cst seq_cst
2891   %prior = extractvalue { i32, i1 } %cas, 0
2892   store i32 %prior, i32* %out, align 4
2893   ret void
2894 }
2895
; Agent-scope cmpxchg, success=acquire / failure=monotonic, whose loaded
; value (result element 0) is written back to %out.
; Expected: no zero-count vmcnt/vscnt wait before the glc flat_atomic_cmpswap;
; directly after it a "vmcnt(0) lgkmcnt(0)" wait, then buffer_wbinvl1_vol on
; GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2896 ; GCN-LABEL: {{^}}agent_acquire_monotonic_ret:
2897 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2898 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2899 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2900 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2901 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2902 ; GFX8-NEXT:  buffer_wbinvl1_vol
2903 ; GFX10-NEXT: buffer_gl0_inv
2904 ; GFX10-NEXT: buffer_gl1_inv
2905 ; GFX10:         .amdhsa_kernel agent_acquire_monotonic_ret
2906 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2907 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2908 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2909 define amdgpu_kernel void @agent_acquire_monotonic_ret(
2910     i32* %out, i32 %in, i32 %old) {
2911 entry:
2912   %elt = getelementptr i32, i32* %out, i32 4
2913   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") acquire monotonic
2914   %prior = extractvalue { i32, i1 } %cas, 0
2915   store i32 %prior, i32* %out, align 4
2916   ret void
2917 }
2918
; Agent-scope cmpxchg, success=acq_rel / failure=monotonic, whose loaded
; value (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2919 ; GCN-LABEL: {{^}}agent_acq_rel_monotonic_ret:
2920 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2921 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2922 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2923 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2924 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2925 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2926 ; GFX8-NEXT:  buffer_wbinvl1_vol
2927 ; GFX10-NEXT: buffer_gl0_inv
2928 ; GFX10-NEXT: buffer_gl1_inv
2929 ; GFX10:         .amdhsa_kernel agent_acq_rel_monotonic_ret
2930 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2931 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2932 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2933 define amdgpu_kernel void @agent_acq_rel_monotonic_ret(
2934     i32* %out, i32 %in, i32 %old) {
2935 entry:
2936   %elt = getelementptr i32, i32* %out, i32 4
2937   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
2938   %prior = extractvalue { i32, i1 } %cas, 0
2939   store i32 %prior, i32* %out, align 4
2940   ret void
2941 }
2942
; Agent-scope cmpxchg, success=seq_cst / failure=monotonic, whose loaded
; value (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2943 ; GCN-LABEL: {{^}}agent_seq_cst_monotonic_ret:
2944 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2945 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2946 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2947 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2948 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2949 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2950 ; GFX8-NEXT:  buffer_wbinvl1_vol
2951 ; GFX10-NEXT: buffer_gl0_inv
2952 ; GFX10-NEXT: buffer_gl1_inv
2953 ; GFX10:         .amdhsa_kernel agent_seq_cst_monotonic_ret
2954 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2955 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2956 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2957 define amdgpu_kernel void @agent_seq_cst_monotonic_ret(
2958     i32* %out, i32 %in, i32 %old) {
2959 entry:
2960   %elt = getelementptr i32, i32* %out, i32 4
2961   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
2962   %prior = extractvalue { i32, i1 } %cas, 0
2963   store i32 %prior, i32* %out, align 4
2964   ret void
2965 }
2966
; Agent-scope cmpxchg, success=acquire / failure=acquire, whose loaded value
; (result element 0) is written back to %out.
; Expected: no zero-count vmcnt/vscnt wait before the glc flat_atomic_cmpswap;
; directly after it a "vmcnt(0) lgkmcnt(0)" wait, then buffer_wbinvl1_vol on
; GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2967 ; GCN-LABEL: {{^}}agent_acquire_acquire_ret:
2968 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
2969 ; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
2970 ; GCN:        flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2971 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2972 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2973 ; GFX8-NEXT:  buffer_wbinvl1_vol
2974 ; GFX10-NEXT: buffer_gl0_inv
2975 ; GFX10-NEXT: buffer_gl1_inv
2976 ; GFX10:         .amdhsa_kernel agent_acquire_acquire_ret
2977 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
2978 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
2979 ; GFX10-NOT:     .amdhsa_memory_ordered 0
2980 define amdgpu_kernel void @agent_acquire_acquire_ret(
2981     i32* %out, i32 %in, i32 %old) {
2982 entry:
2983   %elt = getelementptr i32, i32* %out, i32 4
2984   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") acquire acquire
2985   %prior = extractvalue { i32, i1 } %cas, 0
2986   store i32 %prior, i32* %out, align 4
2987   ret void
2988 }
2989
; Agent-scope cmpxchg, success=release / failure=acquire, whose loaded value
; (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
2990 ; GCN-LABEL: {{^}}agent_release_acquire_ret:
2991 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2992 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
2993 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
2994 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
2995 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2996 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
2997 ; GFX8-NEXT:  buffer_wbinvl1_vol
2998 ; GFX10-NEXT: buffer_gl0_inv
2999 ; GFX10-NEXT: buffer_gl1_inv
3000 ; GFX10:         .amdhsa_kernel agent_release_acquire_ret
3001 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3002 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3003 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3004 define amdgpu_kernel void @agent_release_acquire_ret(
3005     i32* %out, i32 %in, i32 %old) {
3006 entry:
3007   %elt = getelementptr i32, i32* %out, i32 4
3008   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") release acquire
3009   %prior = extractvalue { i32, i1 } %cas, 0
3010   store i32 %prior, i32* %out, align 4
3011   ret void
3012 }
3013
; Agent-scope cmpxchg, success=acq_rel / failure=acquire, whose loaded value
; (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
3014 ; GCN-LABEL: {{^}}agent_acq_rel_acquire_ret:
3015 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3016 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
3017 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
3018 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3019 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3020 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3021 ; GFX8-NEXT:  buffer_wbinvl1_vol
3022 ; GFX10-NEXT: buffer_gl0_inv
3023 ; GFX10-NEXT: buffer_gl1_inv
3024 ; GFX10:         .amdhsa_kernel agent_acq_rel_acquire_ret
3025 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3026 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3027 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3028 define amdgpu_kernel void @agent_acq_rel_acquire_ret(
3029     i32* %out, i32 %in, i32 %old) {
3030 entry:
3031   %elt = getelementptr i32, i32* %out, i32 4
3032   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") acq_rel acquire
3033   %prior = extractvalue { i32, i1 } %cas, 0
3034   store i32 %prior, i32* %out, align 4
3035   ret void
3036 }
3037
; Agent-scope cmpxchg, success=seq_cst / failure=acquire, whose loaded value
; (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
3038 ; GCN-LABEL: {{^}}agent_seq_cst_acquire_ret:
3039 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3040 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
3041 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
3042 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3043 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3044 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3045 ; GFX8-NEXT:  buffer_wbinvl1_vol
3046 ; GFX10-NEXT: buffer_gl0_inv
3047 ; GFX10-NEXT: buffer_gl1_inv
3048 ; GFX10:         .amdhsa_kernel agent_seq_cst_acquire_ret
3049 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3050 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3051 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3052 define amdgpu_kernel void @agent_seq_cst_acquire_ret(
3053     i32* %out, i32 %in, i32 %old) {
3054 entry:
3055   %elt = getelementptr i32, i32* %out, i32 4
3056   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") seq_cst acquire
3057   %prior = extractvalue { i32, i1 } %cas, 0
3058   store i32 %prior, i32* %out, align 4
3059   ret void
3060 }
3061
; Agent-scope cmpxchg, success=seq_cst / failure=seq_cst, whose loaded value
; (result element 0) is written back to %out.
; Expected: a wait right before the glc flat_atomic_cmpswap
; ("vmcnt(0) lgkmcnt(0)" on GFX8; lgkmcnt(0) and then a vscnt 0 wait on
; GFX10); directly after it a "vmcnt(0) lgkmcnt(0)" wait, then
; buffer_wbinvl1_vol on GFX8 or buffer_gl0_inv + buffer_gl1_inv on GFX10.
3062 ; GCN-LABEL: {{^}}agent_seq_cst_seq_cst_ret:
3063 ; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3064 ; GFX10:      s_waitcnt lgkmcnt(0){{$}}
3065 ; GFX10:      s_waitcnt_vscnt null, 0x0{{$}}
3066 ; GCN-NEXT:   flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3067 ; GFX8-NEXT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3068 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3069 ; GFX8-NEXT:  buffer_wbinvl1_vol
3070 ; GFX10-NEXT: buffer_gl0_inv
3071 ; GFX10-NEXT: buffer_gl1_inv
3072 ; GFX10:         .amdhsa_kernel agent_seq_cst_seq_cst_ret
3073 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3074 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3075 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3076 define amdgpu_kernel void @agent_seq_cst_seq_cst_ret(
3077     i32* %out, i32 %in, i32 %old) {
3078 entry:
3079   %elt = getelementptr i32, i32* %out, i32 4
3080   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
3081   %prior = extractvalue { i32, i1 } %cas, 0
3082   store i32 %prior, i32* %out, align 4
3083   ret void
3084 }
3085
; Workgroup-scope cmpxchg, success=acquire / failure=monotonic, whose loaded
; value (result element 0) is written back to %out.
; Expected: no zero-count vmcnt/vscnt wait before the glc flat_atomic_cmpswap.
; After it: on GFX8 a vmcnt(0) wait and no buffer_wbinvl1_vol; in the GFX10
; run without cumode a "vmcnt(0) lgkmcnt(0)" wait directly followed by
; buffer_gl0_inv; in the GFX10 cumode run a vmcnt(0) wait and no
; buffer_gl0_inv.
3086 ; GCN-LABEL:     {{^}}workgroup_acquire_monotonic_ret:
3087 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
3088 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
3089 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3090 ; GFX8:          s_waitcnt vmcnt(0){{$}}
3091 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3092 ; GFX10WGP-NEXT: buffer_gl0_inv
3093 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3094 ; GFX10CU-NOT:   buffer_gl0_inv
3095 ; GFX8-NOT:      buffer_wbinvl1_vol
3096 ; GFX10:         .amdhsa_kernel workgroup_acquire_monotonic_ret
3097 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3098 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3099 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3100 define amdgpu_kernel void @workgroup_acquire_monotonic_ret(
3101     i32* %out, i32 %in, i32 %old) {
3102 entry:
3103   %elt = getelementptr i32, i32* %out, i32 4
3104   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
3105   %prior = extractvalue { i32, i1 } %cas, 0
3106   store i32 %prior, i32* %out, align 4
3107   ret void
3108 }
3109
; Workgroup-scope cmpxchg, success=acq_rel / failure=monotonic, whose loaded
; value (result element 0) is written back to %out.
; Before the glc flat_atomic_cmpswap: no vmcnt(0) wait on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by a vscnt 0 wait in the GFX10
; run without cumode; neither wait in the GFX10 cumode run.
; After it: a vmcnt(0) wait and no buffer_wbinvl1_vol on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by buffer_gl0_inv without
; cumode; a vmcnt(0) wait and no buffer_gl0_inv with cumode.
3110 ; GCN-LABEL:     {{^}}workgroup_acq_rel_monotonic_ret:
3111 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
3112 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3113 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
3114 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
3115 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
3116 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3117 ; GFX8:          s_waitcnt vmcnt(0){{$}}
3118 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3119 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3120 ; GFX8-NOT:      buffer_wbinvl1_vol
3121 ; GFX10WGP-NEXT: buffer_gl0_inv
3122 ; GFX10CU-NOT:   buffer_gl0_inv
3123 ; GFX10:         .amdhsa_kernel workgroup_acq_rel_monotonic_ret
3124 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3125 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3126 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3127 define amdgpu_kernel void @workgroup_acq_rel_monotonic_ret(
3128     i32* %out, i32 %in, i32 %old) {
3129 entry:
3130   %elt = getelementptr i32, i32* %out, i32 4
3131   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
3132   %prior = extractvalue { i32, i1 } %cas, 0
3133   store i32 %prior, i32* %out, align 4
3134   ret void
3135 }
3136
; Workgroup-scope cmpxchg, success=seq_cst / failure=monotonic, whose loaded
; value (result element 0) is written back to %out.
; Before the glc flat_atomic_cmpswap: no vmcnt(0) wait on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by a vscnt 0 wait in the GFX10
; run without cumode; neither wait in the GFX10 cumode run.
; After it: a vmcnt(0) wait and no buffer_wbinvl1_vol on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by buffer_gl0_inv without
; cumode; a vmcnt(0) wait and no buffer_gl0_inv with cumode.
3137 ; GCN-LABEL:     {{^}}workgroup_seq_cst_monotonic_ret:
3138 ; GFX8-NOT:      s_waitcnt vmcnt(0){{$}}
3139 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3140 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
3141 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
3142 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
3143 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3144 ; GFX8:          s_waitcnt vmcnt(0){{$}}
3145 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3146 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3147 ; GFX8-NOT:      buffer_wbinvl1_vol
3148 ; GFX10WGP-NEXT: buffer_gl0_inv
3149 ; GFX10CU-NOT:   buffer_gl0_inv
3150 ; GFX10:         .amdhsa_kernel workgroup_seq_cst_monotonic_ret
3151 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3152 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3153 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3154 define amdgpu_kernel void @workgroup_seq_cst_monotonic_ret(
3155     i32* %out, i32 %in, i32 %old) {
3156 entry:
3157   %elt = getelementptr i32, i32* %out, i32 4
3158   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
3159   %prior = extractvalue { i32, i1 } %cas, 0
3160   store i32 %prior, i32* %out, align 4
3161   ret void
3162 }
3163
; Workgroup-scope cmpxchg, success=acquire / failure=acquire, whose loaded
; value (result element 0) is written back to %out.
; Expected: no zero-count vmcnt/vscnt wait before the glc flat_atomic_cmpswap.
; After it: on GFX8 a vmcnt(0) wait and no buffer_wbinvl1_vol; in the GFX10
; run without cumode a "vmcnt(0) lgkmcnt(0)" wait directly followed by
; buffer_gl0_inv; in the GFX10 cumode run a vmcnt(0) wait and no
; buffer_gl0_inv.
3164 ; GCN-LABEL:     {{^}}workgroup_acquire_acquire_ret:
3165 ; GCN-NOT:       s_waitcnt vmcnt(0){{$}}
3166 ; GCN-NOT:       s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
3167 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3168 ; GFX8:          s_waitcnt vmcnt(0){{$}}
3169 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3170 ; GFX10WGP-NEXT: buffer_gl0_inv
3171 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3172 ; GFX10CU-NOT:   buffer_gl0_inv
3173 ; GFX8-NOT:      buffer_wbinvl1_vol
3174 ; GFX10:         .amdhsa_kernel workgroup_acquire_acquire_ret
3175 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3176 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3177 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3178 define amdgpu_kernel void @workgroup_acquire_acquire_ret(
3179     i32* %out, i32 %in, i32 %old) {
3180 entry:
3181   %elt = getelementptr i32, i32* %out, i32 4
3182   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("workgroup") acquire acquire
3183   %prior = extractvalue { i32, i1 } %cas, 0
3184   store i32 %prior, i32* %out, align 4
3185   ret void
3186 }
3187
; Workgroup-scope cmpxchg, success=release / failure=acquire, whose loaded
; value (result element 0) is written back to %out.
; Before the glc flat_atomic_cmpswap: an lgkmcnt(0) wait on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by a vscnt 0 wait in the GFX10
; run without cumode; neither of those waits in the GFX10 cumode run.
; After it: a vmcnt(0) wait and no buffer_wbinvl1_vol on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by buffer_gl0_inv without
; cumode; a vmcnt(0) wait and no buffer_gl0_inv with cumode.
3188 ; GCN-LABEL:     {{^}}workgroup_release_acquire_ret:
3189 ; GFX8:          s_waitcnt lgkmcnt(0){{$}}
3190 ; GFX8:          flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3191 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3192 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
3193 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
3194 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
3195 ; GFX10:         flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3196 ; GFX8:          s_waitcnt vmcnt(0){{$}}
3197 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3198 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3199 ; GFX8-NOT:      buffer_wbinvl1_vol
3200 ; GFX10WGP-NEXT: buffer_gl0_inv
3201 ; GFX10CU-NOT:   buffer_gl0_inv
3202 ; GFX10:         .amdhsa_kernel workgroup_release_acquire_ret
3203 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3204 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3205 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3206 define amdgpu_kernel void @workgroup_release_acquire_ret(
3207     i32* %out, i32 %in, i32 %old) {
3208 entry:
3209   %elt = getelementptr i32, i32* %out, i32 4
3210   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("workgroup") release acquire
3211   %prior = extractvalue { i32, i1 } %cas, 0
3212   store i32 %prior, i32* %out, align 4
3213   ret void
3214 }
3215
; Workgroup-scope cmpxchg with success=acq_rel / failure=acquire; element 0
; of the result is stored to %out.
; Before the glc flat_atomic_cmpswap: an lgkmcnt(0) wait on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by a vscnt 0 wait in the GFX10
; run without cumode; neither of those waits in the GFX10 cumode run.
; After it: no buffer_wbinvl1_vol on GFX8; a "vmcnt(0) lgkmcnt(0)" wait
; directly followed by buffer_gl0_inv without cumode; a vmcnt(0) wait and no
; buffer_gl0_inv with cumode.
3216 ; GCN-LABEL:     {{^}}workgroup_acq_rel_acquire_ret:
3217 ; GFX8:          s_waitcnt lgkmcnt(0){{$}}
3218 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3219 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
3220 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
3221 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
3222 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3223 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3224 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3225 ; GFX8-NOT:      buffer_wbinvl1_vol
3226 ; GFX10WGP-NEXT: buffer_gl0_inv
3227 ; GFX10CU-NOT:   buffer_gl0_inv
3228 ; GFX10:         .amdhsa_kernel workgroup_acq_rel_acquire_ret
3229 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3230 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3231 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3232 define amdgpu_kernel void @workgroup_acq_rel_acquire_ret(
3233     i32* %out, i32 %in, i32 %old) {
3234 entry:
3235   %gep = getelementptr i32, i32* %out, i32 4
3236   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
3237   %val0 = extractvalue { i32, i1 } %val, 0
3238   store i32 %val0, i32* %out, align 4
3239   ret void
3240 }
3241
; Workgroup-scope cmpxchg, success=seq_cst / failure=acquire, whose loaded
; value (result element 0) is written back to %out.
; Before the glc flat_atomic_cmpswap: an lgkmcnt(0) wait on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by a vscnt 0 wait in the GFX10
; run without cumode; neither of those waits in the GFX10 cumode run.
; After it: no buffer_wbinvl1_vol on GFX8; a "vmcnt(0) lgkmcnt(0)" wait
; directly followed by buffer_gl0_inv without cumode; a vmcnt(0) wait and no
; buffer_gl0_inv with cumode.
3242 ; GCN-LABEL:     {{^}}workgroup_seq_cst_acquire_ret:
3243 ; GFX8:          s_waitcnt lgkmcnt(0){{$}}
3244 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3245 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
3246 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
3247 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
3248 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3249 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3250 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3251 ; GFX8-NOT:      buffer_wbinvl1_vol
3252 ; GFX10WGP-NEXT: buffer_gl0_inv
3253 ; GFX10CU-NOT:   buffer_gl0_inv
3254 ; GFX10:         .amdhsa_kernel workgroup_seq_cst_acquire_ret
3255 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3256 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3257 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3258 define amdgpu_kernel void @workgroup_seq_cst_acquire_ret(
3259     i32* %out, i32 %in, i32 %old) {
3260 entry:
3261   %elt = getelementptr i32, i32* %out, i32 4
3262   %cas = cmpxchg volatile i32* %elt, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
3263   %prior = extractvalue { i32, i1 } %cas, 0
3264   store i32 %prior, i32* %out, align 4
3265   ret void
3266 }
3267
; Workgroup-scope cmpxchg with success=seq_cst / failure=seq_cst; element 0
; of the result is stored to %out.
; Before the glc flat_atomic_cmpswap: an lgkmcnt(0) wait on GFX8; a
; "vmcnt(0) lgkmcnt(0)" wait directly followed by a vscnt 0 wait in the GFX10
; run without cumode; neither of those waits in the GFX10 cumode run.
; After it: no buffer_wbinvl1_vol on GFX8; a "vmcnt(0) lgkmcnt(0)" wait
; directly followed by buffer_gl0_inv without cumode; a vmcnt(0) wait and no
; buffer_gl0_inv with cumode.
3268 ; GCN-LABEL:     {{^}}workgroup_seq_cst_seq_cst_ret:
3269 ; GFX8:          s_waitcnt lgkmcnt(0){{$}}
3270 ; GFX10WGP:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3271 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
3272 ; GFX10CU-NOT:   s_waitcnt vmcnt(0){{$}}
3273 ; GFX10CU-NOT:   s_waitcnt_vscnt null, 0x0{{$}}
3274 ; GCN:           flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}}
3275 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
3276 ; GFX10CU:       s_waitcnt vmcnt(0){{$}}
3277 ; GFX8-NOT:      buffer_wbinvl1_vol
3278 ; GFX10WGP-NEXT: buffer_gl0_inv
3279 ; GFX10CU-NOT:   buffer_gl0_inv
3280 ; GFX10:         .amdhsa_kernel workgroup_seq_cst_seq_cst_ret
3281 ; GFX10WGP-NOT:  .amdhsa_workgroup_processor_mode 0
3282 ; GFX10CU:       .amdhsa_workgroup_processor_mode 0
3283 ; GFX10-NOT:     .amdhsa_memory_ordered 0
3284 define amdgpu_kernel void @workgroup_seq_cst_seq_cst_ret(
3285     i32* %out, i32 %in, i32 %old) {
3286 entry:
3287   %gep = getelementptr i32, i32* %out, i32 4
3288   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
3289   %val0 = extractvalue { i32, i1 } %val, 0
3290   store i32 %val0, i32* %out, align 4
3291   ret void
3292 }