llvm/test/CodeGen/AMDGPU/wqm.ll

   1 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
   2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
   3
   4 ; Check that WQM isn't triggered by image load/store intrinsics.
     ;
     ; The shader reads a texel with image.load.1d and writes the same value back
     ; with image.store.1d; no s_wqm instruction may appear in the output.
   5 ;
   6 ;CHECK-LABEL: {{^}}test1:
   7 ;CHECK-NOT: s_wqm
   8 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
   9 main_body:
  10   %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  11   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  12   ret <4 x float> %tex
  13 }
  14
  15 ; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
     ;
     ; Both sample coordinates are produced by interp.p1/interp.p2 pairs. The
     ; directives expect the interp instructions between s_wqm_b64 and the
     ; s_and_b64 that re-applies the saved exec mask, no interp after that point,
     ; and no further mention of exec between the image_sample and the .size line.
  16 ;
  17 ;CHECK-LABEL: {{^}}test2:
  18 ;CHECK-NEXT: ; %main_body
  19 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
  20 ;CHECK-NEXT: s_wqm_b64 exec, exec
  21 ;CHECK: interp
  22 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
  23 ;CHECK-NOT: interp
  24 ;CHECK: image_sample
  25 ;CHECK-NOT: exec
  26 ;CHECK: .size test2
  27 define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
  28 main_body:
  29   %inst23 = extractelement <2 x float> %pos, i32 0
  30   %inst24 = extractelement <2 x float> %pos, i32 1
  31   %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  32   %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  33   %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  34   %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  35   %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  36   ret <4 x float> %tex
  37 }
  38
  39 ; ... but disabled for stores (and, in this simple case, not re-enabled) ...
     ;
     ; The sampled vector is written out with struct.buffer.store, and element 0
     ; of the sample result (bitcast to i32) is passed as one of the store
     ; operands. After the store, exec must not be mentioned again before the
     ; .size line.
  40 ;
  41 ;CHECK-LABEL: {{^}}test3:
  42 ;CHECK-NEXT: ; %main_body
  43 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
  44 ;CHECK-NEXT: s_wqm_b64 exec, exec
  45 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
  46 ;CHECK: image_sample
  47 ;CHECK: store
  48 ;CHECK-NOT: exec
  49 ;CHECK: .size test3
  50 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
  51 main_body:
  52   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  53   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  54   %tex.2 = extractelement <4 x i32> %tex.1, i32 0
  55
  56   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
  57
  58   ret <4 x float> %tex
  59 }
  60
  61 ; ... and disabled for export.
     ;
     ; The coordinates are interpolated as in an ordinary pixel shader, the 2D
     ; sample result is split into its four components, and all four are passed
     ; to llvm.amdgcn.exp.f32. After the exp, exec must not be mentioned again
     ; before the .size line.
  62 ;
  63 ;CHECK-LABEL: {{^}}test3x:
  64 ;CHECK-NEXT: ; %main_body
  65 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
  66 ;CHECK-NEXT: s_wqm_b64 exec, exec
  67 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
  68 ;CHECK: image_sample
  69 ;CHECK: exp
  70 ;CHECK-NOT: exec
  71 ;CHECK: .size test3x
  72 define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
  73 main_body:
  74   %inst23 = extractelement <2 x float> %pos, i32 0
  75   %inst24 = extractelement <2 x float> %pos, i32 1
  76   %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  77   %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  78   %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  79   %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  80   %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  81   %tex.0 = extractelement <4 x float> %tex, i32 0
  82   %tex.1 = extractelement <4 x float> %tex, i32 1
  83   %tex.2 = extractelement <4 x float> %tex, i32 2
  84   %tex.3 = extractelement <4 x float> %tex, i32 3
  85   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  86   ret void
  87 }
  88
  89 ; Check that WQM is re-enabled when required.
     ;
     ; The product %c.1 is used twice: as an operand of a buffer store and,
     ; bitcast to float, as the coordinate of the first image sample. Element 0
     ; of that sample is the coordinate of a second sample, whose result is
     ; returned. The directives expect the multiply and the first sample after
     ; s_wqm_b64, and the second sample and the store after the exec restore.
  90 ;
  91 ;CHECK-LABEL: {{^}}test4:
  92 ;CHECK-NEXT: ; %main_body
  93 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
  94 ;CHECK-NEXT: s_wqm_b64 exec, exec
  95 ;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1
  96 ;CHECK: image_sample
  97 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
  98 ;CHECK: image_sample
  99 ;CHECK: store
 100 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 101 main_body:
 102   %c.1 = mul i32 %c, %d
 103
 104   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
 105   %c.1.bc = bitcast i32 %c.1 to float
 106   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 107   %tex0 = extractelement <4 x float> %tex, i32 0
 108   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 109   ret <4 x float> %dtex
 110 }
 111
 112 ; Check that WQM is triggered by the wqm intrinsic.
     ;
     ; The sum of two buffer loads is passed through llvm.amdgcn.wqm.f32 and
     ; returned; s_wqm_b64 is expected before the loads and the add.
 113 ;
 114 ;CHECK-LABEL: {{^}}test5:
 115 ;CHECK: s_wqm_b64 exec, exec
 116 ;CHECK: buffer_load_dword
 117 ;CHECK: buffer_load_dword
 118 ;CHECK: v_add_f32_e32
 119 ; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
 120 ; does not happen - the v_add should write the return reg directly.
 121 ;CHECK-NOT: v_mov_b32_e32
 122 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
 123 main_body:
 124   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 125   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 126   %out = fadd float %src0, %src1
 127   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
 128   ret float %out.0
 129 }
 130
 131 ; Check that the wqm intrinsic works correctly for integers.
     ;
     ; The float sum of two buffer loads is bitcast to i32, passed through
     ; llvm.amdgcn.wqm.i32, and bitcast back to float for the return value.
 132 ;
 133 ;CHECK-LABEL: {{^}}test6:
 134 ;CHECK: s_wqm_b64 exec, exec
 135 ;CHECK: buffer_load_dword
 136 ;CHECK: buffer_load_dword
 137 ;CHECK: v_add_f32_e32
 138 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
 139 main_body:
 140   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 141   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 142   %out = fadd float %src0, %src1
 143   %out.0 = bitcast float %out to i32
 144   %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
 145   %out.2 = bitcast i32 %out.1 to float
 146   ret float %out.2
 147 }
 148
 149 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
 150
 151 ; Check that WWM is triggered by the wwm intrinsic.
     ;
     ; The sum of two buffer loads is passed through llvm.amdgcn.wwm.f32. An
     ; s_or_saveexec_b64 with -1 is expected before the loads and the add.
 152 ;
 153 ;CHECK-LABEL: {{^}}test_wwm1:
 154 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 155 ;CHECK: buffer_load_dword
 156 ;CHECK: buffer_load_dword
 157 ;CHECK: v_add_f32_e32
 158 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 159 main_body:
 160   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 161   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 162   %out = fadd float %src0, %src1
 163   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
 164   ret float %out.0
 165 }
 166
 167 ; Same as above, but with an integer type.
     ;
     ; Both loaded floats are bitcast to i32, added, and the sum is passed through
     ; llvm.amdgcn.wwm.i32. The add pattern accepts either v_add_i32 or v_add_u32.
 168 ;
 169 ;CHECK-LABEL: {{^}}test_wwm2:
 170 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 171 ;CHECK: buffer_load_dword
 172 ;CHECK: buffer_load_dword
 173 ;CHECK: v_add_{{[iu]}}32_e32
 174 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
 175 main_body:
 176   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 177   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 178   %src0.0 = bitcast float %src0 to i32
 179   %src1.0 = bitcast float %src1 to i32
 180   %out = add i32 %src0.0, %src1.0
 181   %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
 182   %out.1 = bitcast i32 %out.0 to float
 183   ret float %out.1
 184 }
 185
 186 ; Check that we don't leave WWM on for computations that don't require WWM,
 187 ; since that will lead to clobbering things that aren't supposed to be clobbered
 188 ; in cases like this.
 189 ; We enforce this by checking that v_add gets emitted in the same block as
 190 ; WWM computations.
     ;
     ; Inside %if, the first v_add (the operand of the wwm intrinsic) is expected
     ; between s_or_saveexec_b64 and the exec restore; the second v_add, which
     ; consumes the wwm result, is expected after the restore and before %endif.
 191 ;
 192 ;CHECK-LABEL: {{^}}test_wwm3:
 193 ;CHECK: %if
 194 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 195 ;CHECK: buffer_load_dword
 196 ;CHECK: v_add_f32_e32
 197 ;CHECK: s_mov_b64 exec, [[ORIG]]
 198 ;CHECK: v_add_f32_e32
 199 ;CHECK: %endif
 200 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
 201 main_body:
 202   ; use mbcnt to make sure the branch is divergent
 203   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 204   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 205   %cc = icmp uge i32 %hi, 32
 206   br i1 %cc, label %endif, label %if
 207
 208 if:
 209   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 210   %out = fadd float %src, %src
 211   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
 212   %out.1 = fadd float %src, %out.0
 213   br label %endif
 214
 215 endif:
 216   %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
 217   ret float %out.2
 218 }
 219
 220 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
 221 ; write could clobber disabled channels in the non-WWM one.
 222 ; We enforce this by checking that v_mov gets emitted in the same block as
 223 ; WWM computations.
     ;
     ; The wwm result flows straight into the phi in %endif. The v_mov is
     ; required on the line immediately after the exec restore.
 224 ;
 225 ;CHECK-LABEL: {{^}}test_wwm4:
 226 ;CHECK: %if
 227 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 228 ;CHECK: buffer_load_dword
 229 ;CHECK: v_add_f32_e32
 230 ;CHECK: s_mov_b64 exec, [[ORIG]]
 231 ;CHECK-NEXT: v_mov_b32_e32
 232 ;CHECK: %endif
 233 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
 234 main_body:
 235   ; use mbcnt to make sure the branch is divergent
 236   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 237   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 238   %cc = icmp uge i32 %hi, 32
 239   br i1 %cc, label %endif, label %if
 240
 241 if:
 242   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 243   %out = fadd float %src, %src
 244   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
 245   br label %endif
 246
 247 endif:
 248   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
 249   ret float %out.1
 250 }
 251
 252 ; Make sure the transition from Exact to WWM then WQM works properly.
     ;
     ; A load/store pair comes first, then a second load whose doubled value is
     ; passed through the wwm intrinsic, and finally the doubled wwm result is
     ; passed through the wqm intrinsic. The directives expect the exec
     ; save/restore around the second load and add, followed by s_wqm_b64.
 253 ;
 254 ;CHECK-LABEL: {{^}}test_wwm5:
 255 ;CHECK: buffer_load_dword
 256 ;CHECK: buffer_store_dword
 257 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 258 ;CHECK: buffer_load_dword
 259 ;CHECK: v_add_f32_e32
 260 ;CHECK: s_mov_b64 exec, [[ORIG]]
 261 ;CHECK: s_wqm_b64 exec, exec
 262 define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
 263 main_body:
 264   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 265   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 266   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 267   %temp = fadd float %src1, %src1
 268   %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
 269   %out = fadd float %temp.0, %temp.0
 270   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
 271   ret float %out.0
 272 }
 273
 274 ; Check that WWM is turned on correctly across basic block boundaries.
 275 ; if..then..endif version
     ;
     ; %src0 is loaded in %main_body but only consumed by the add in %if, so an
     ; exec save/restore pair is expected in each of the two blocks.
     ;
     ; NOTE(review): the SI-/VI-prefixed load directives below only take effect
     ; if FileCheck is invoked with those prefixes; the two llc invocations at
     ; the top of this file appear to pass only the default prefix - confirm
     ; whether these load checks are actually being exercised.
 276 ;
 277 ;CHECK-LABEL: {{^}}test_wwm6_then:
 278 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 279 ;SI-CHECK: buffer_load_dword
 280 ;VI-CHECK: flat_load_dword
 281 ;CHECK: s_mov_b64 exec, [[ORIG]]
 282 ;CHECK: %if
 283 ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
 284 ;SI-CHECK: buffer_load_dword
 285 ;VI-CHECK: flat_load_dword
 286 ;CHECK: v_add_f32_e32
 287 ;CHECK: s_mov_b64 exec, [[ORIG2]]
 288 ;CHECK: %endif
 289 define amdgpu_ps float @test_wwm6_then() {
 290 main_body:
 291   %src0 = load volatile float, float addrspace(1)* undef
 292   ; use mbcnt to make sure the branch is divergent
 293   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 294   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 295   %cc = icmp uge i32 %hi, 32
 296   br i1 %cc, label %endif, label %if
 297
 298 if:
 299   %src1 = load volatile float, float addrspace(1)* undef
 300   %out = fadd float %src0, %src1
 301   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
 302   br label %endif
 303
 304 endif:
 305   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
 306   ret float %out.1
 307 }
 308
 309 ; Check that WWM is turned on correctly across basic block boundaries.
 310 ; loop version
     ;
     ; %src0 is loaded before the loop and consumed by the add inside it; the
     ; loop counter starts at %lo and counts down to zero. An exec save/restore
     ; pair is expected both before %loop and inside it.
     ;
     ; NOTE(review): the SI-/VI-prefixed load directives below only take effect
     ; if FileCheck is invoked with those prefixes; the two llc invocations at
     ; the top of this file appear to pass only the default prefix - confirm
     ; whether these load checks are actually being exercised.
 311 ;
 312 ;CHECK-LABEL: {{^}}test_wwm6_loop:
 313 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 314 ;SI-CHECK: buffer_load_dword
 315 ;VI-CHECK: flat_load_dword
 316 ;CHECK: s_mov_b64 exec, [[ORIG]]
 317 ;CHECK: %loop
 318 ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
 319 ;SI-CHECK: buffer_load_dword
 320 ;VI-CHECK: flat_load_dword
 321 ;CHECK: s_mov_b64 exec, [[ORIG2]]
 322 ;CHECK: %endloop
 323 define amdgpu_ps float @test_wwm6_loop() {
 324 main_body:
 325   %src0 = load volatile float, float addrspace(1)* undef
 326   ; use mbcnt to make sure the branch is divergent
 327   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 328   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 329   br label %loop
 330
 331 loop:
 332   %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
 333   %src1 = load volatile float, float addrspace(1)* undef
 334   %out = fadd float %src0, %src1
 335   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
 336   %counter.1 = sub i32 %counter, 1
 337   %cc = icmp ne i32 %counter.1, 0
 338   br i1 %cc, label %loop, label %endloop
 339
 340 endloop:
 341   ret float %out.0
 342 }
 343
 344 ; Check that @llvm.amdgcn.set.inactive disables WWM.
     ;
     ; The loaded value goes through set.inactive (with 0 as the second operand)
     ; before being doubled and passed to the wwm intrinsic. The directives
     ; expect a v_mov bracketed by two "s_not_b64 exec, exec" instructions, and
     ; only afterwards the s_or_saveexec_b64 and the integer add.
 345 ;
 346 ;CHECK-LABEL: {{^}}test_wwm_set_inactive1:
 347 ;CHECK: buffer_load_dword
 348 ;CHECK: s_not_b64 exec, exec
 349 ;CHECK: v_mov_b32_e32
 350 ;CHECK: s_not_b64 exec, exec
 351 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 352 ;CHECK: v_add_{{[iu]}}32_e32
 353 define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
 354 main_body:
 355   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 356   %src.0 = bitcast float %src to i32
 357   %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
 358   %out = add i32 %src.1, %src.1
 359   %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
 360   %out.1 = bitcast i32 %out.0 to float
 361   call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 362   ret void
 363 }
 364
 365 ; Check that Strict WQM is triggered by the strict_wqm intrinsic.
     ;
     ; The sum of two buffer loads is passed through llvm.amdgcn.strict.wqm.f32.
     ; A copy of exec into an SGPR pair followed by s_wqm_b64 is expected before
     ; the loads and the add.
 366 ;
 367 ;CHECK-LABEL: {{^}}test_strict_wqm1:
 368 ;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 369 ;CHECK: s_wqm_b64 exec, exec
 370 ;CHECK: buffer_load_dword
 371 ;CHECK: buffer_load_dword
 372 ;CHECK: v_add_f32_e32
 373 define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
 374 main_body:
 375   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 376   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 377   %out = fadd float %src0, %src1
 378   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
 379   ret float %out.0
 380 }
 381
 382 ; Same as above, but with an integer type.
     ;
     ; Both loaded floats are bitcast to i32, added, and the sum is passed through
     ; llvm.amdgcn.strict.wqm.i32. The add pattern accepts either v_add_i32 or
     ; v_add_u32.
 383 ;
 384 ;CHECK-LABEL: {{^}}test_strict_wqm2:
 385 ;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 386 ;CHECK: s_wqm_b64 exec, exec
 387 ;CHECK: buffer_load_dword
 388 ;CHECK: buffer_load_dword
 389 ;CHECK: v_add_{{[iu]}}32_e32
 390 define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
 391 main_body:
 392   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 393   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 394   %src0.0 = bitcast float %src0 to i32
 395   %src1.0 = bitcast float %src1 to i32
 396   %out = add i32 %src0.0, %src1.0
 397   %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
 398   %out.1 = bitcast i32 %out.0 to float
 399   ret float %out.1
 400 }
 401
 402 ; Check that we don't leave Strict WQM on for computations that don't require it,
 403 ; since that will lead to clobbering things that aren't supposed to be clobbered
 404 ; in cases like this.
 405 ; We enforce this by checking that v_add gets emitted in the same block as
 406 ; Strict WQM computations.
     ;
     ; Inside %if, the first v_add (the operand of the strict.wqm intrinsic) is
     ; expected between s_wqm_b64 and the exec restore; the second v_add, which
     ; consumes the intrinsic's result, is expected after the restore.
 407 ;
 408 ;CHECK-LABEL: {{^}}test_strict_wqm3:
 409 ;CHECK: %if
 410 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 411 ;CHECK: s_wqm_b64 exec, exec
 412 ;CHECK: buffer_load_dword
 413 ;CHECK: v_add_f32_e32
 414 ;CHECK: s_mov_b64 exec, [[ORIG]]
 415 ;CHECK: v_add_f32_e32
 416 ;CHECK: %endif
 417 define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
 418 main_body:
 419   ; use mbcnt to make sure the branch is divergent
 420   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 421   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 422   %cc = icmp uge i32 %hi, 32
 423   br i1 %cc, label %endif, label %if
 424
 425 if:
 426   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 427   %out = fadd float %src, %src
 428   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
 429   %out.1 = fadd float %src, %out.0
 430   br label %endif
 431
 432 endif:
 433   %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
 434   ret float %out.2
 435 }
 436
 437 ; Check that Strict WQM writes aren't coalesced with non-strict writes, since
 438 ; the Strict WQM write could clobber disabled channels in the non-strict one.
 439 ; We enforce this by checking that v_mov gets emitted in the same block as
 440 ; Strict WQM computations.
     ;
     ; The saved exec register pair is captured as ORIG on the s_mov_b64 line so
     ; that the restore line matches the register saved in this function rather
     ; than a value captured by an earlier test in the file.
 441 ;
 442 ;CHECK-LABEL: {{^}}test_strict_wqm4:
 443 ;CHECK: %if
 444 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 445 ;CHECK: s_wqm_b64 exec, exec
 446 ;CHECK: buffer_load_dword
 447 ;CHECK: v_add_f32_e32
 448 ;CHECK: s_mov_b64 exec, [[ORIG]]
 449 ;CHECK-NEXT: v_mov_b32_e32
 450 ;CHECK: %endif
 451 define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
 452 main_body:
 453   ; use mbcnt to make sure the branch is divergent
 454   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 455   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 456   %cc = icmp uge i32 %hi, 32
 457   br i1 %cc, label %endif, label %if
 458
 459 if:
 460   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 461   %out = fadd float %src, %src
 462   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
 463   br label %endif
 464
 465 endif:
 466   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
 467   ret float %out.1
 468 }
 469
 470 ; Make sure the transition from Exact to Strict WQM then WQM works properly.
     ;
     ; A load/store pair comes first, then a second load whose doubled value is
     ; passed through the strict.wqm intrinsic, and finally the doubled result is
     ; passed through the wqm intrinsic. Two s_wqm_b64 instructions are expected,
     ; one before and one after the restore of the saved exec mask.
 471 ;
 472 ;CHECK-LABEL: {{^}}test_strict_wqm5:
 473 ;CHECK: buffer_load_dword
 474 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 475 ;CHECK: buffer_store_dword
 476 ;CHECK: s_wqm_b64 exec, exec
 477 ;CHECK: buffer_load_dword
 478 ;CHECK: v_add_f32_e32
 479 ;CHECK: s_mov_b64 exec, [[ORIG]]
 480 ;CHECK: s_wqm_b64 exec, exec
 481 define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
 482 main_body:
 483   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 484   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 485   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 486   %temp = fadd float %src1, %src1
 487   %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
 488   %out = fadd float %temp.0, %temp.0
 489   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
 490   ret float %out.0
 491 }
 492
 493 ; Check that Strict WQM is turned on correctly across basic block boundaries.
 494 ; if..then..endif version
     ;
     ; %src0 is loaded in %main_body but only consumed by the add in %if, so an
     ; exec save/restore pair is expected in each of the two blocks. The two
     ; saved register pairs are captured separately (ORIG, then ORIG2) so that
     ; each restore is matched against the save in its own block.
     ;
     ; NOTE(review): the SI-/VI-prefixed load directives below only take effect
     ; if FileCheck is invoked with those prefixes; the two llc invocations at
     ; the top of this file appear to pass only the default prefix - confirm
     ; whether these load checks are actually being exercised.
 495 ;
 496 ;CHECK-LABEL: {{^}}test_strict_wqm6_then:
 497 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 498 ;CHECK: s_wqm_b64 exec, exec
 499 ;SI-CHECK: buffer_load_dword
 500 ;VI-CHECK: flat_load_dword
 501 ;CHECK: s_mov_b64 exec, [[ORIG]]
 502 ;CHECK: %if
 503 ;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
 504 ;CHECK: s_wqm_b64 exec, exec
 505 ;SI-CHECK: buffer_load_dword
 506 ;VI-CHECK: flat_load_dword
 507 ;CHECK: v_add_f32_e32
 508 ;CHECK: s_mov_b64 exec, [[ORIG2]]
 509 ;CHECK: %endif
 510 define amdgpu_ps float @test_strict_wqm6_then() {
 511 main_body:
 512   %src0 = load volatile float, float addrspace(1)* undef
 513   ; use mbcnt to make sure the branch is divergent
 514   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 515   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 516   %cc = icmp uge i32 %hi, 32
 517   br i1 %cc, label %endif, label %if
 518
 519 if:
 520   %src1 = load volatile float, float addrspace(1)* undef
 521   %out = fadd float %src0, %src1
 522   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
 523   br label %endif
 524
 525 endif:
 526   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
 527   ret float %out.1
 528 }
 529
 530 ; Check that Strict WQM is turned on correctly across basic block boundaries.
 531 ; loop version
     ;
     ; %src0 is loaded before the loop and consumed by the add inside it; the
     ; loop counter starts at %lo and counts down to zero. An exec save/restore
     ; pair is expected both before %loop and inside it.
     ;
     ; NOTE(review): the SI-/VI-prefixed load directives below only take effect
     ; if FileCheck is invoked with those prefixes; the two llc invocations at
     ; the top of this file appear to pass only the default prefix - confirm
     ; whether these load checks are actually being exercised.
 532 ;
 533 ;CHECK-LABEL: {{^}}test_strict_wqm6_loop:
 534 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 535 ;CHECK: s_wqm_b64 exec, exec
 536 ;SI-CHECK: buffer_load_dword
 537 ;VI-CHECK: flat_load_dword
 538 ;CHECK: s_mov_b64 exec, [[ORIG]]
 539 ;CHECK: %loop
 540 ;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
 541 ;CHECK: s_wqm_b64 exec, exec
 542 ;SI-CHECK: buffer_load_dword
 543 ;VI-CHECK: flat_load_dword
 544 ;CHECK: s_mov_b64 exec, [[ORIG2]]
 545 ;CHECK: %endloop
 546 define amdgpu_ps float @test_strict_wqm6_loop() {
 547 main_body:
 548   %src0 = load volatile float, float addrspace(1)* undef
 549   ; use mbcnt to make sure the branch is divergent
 550   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 551   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 552   br label %loop
 553
 554 loop:
 555   %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
 556   %src1 = load volatile float, float addrspace(1)* undef
 557   %out = fadd float %src0, %src1
 558   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
 559   %counter.1 = sub i32 %counter, 1
 560   %cc = icmp ne i32 %counter.1, 0
 561   br i1 %cc, label %loop, label %endloop
 562
 563 endloop:
 564   ret float %out.0
 565 }
 566
 567 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
     ;
     ; %src1 goes through set.inactive (with an undef second operand) while
     ; %src0 goes through the wqm intrinsic; their integer sum is stored.
     ; s_wqm_b64 is expected ahead of both buffer loads.
 568 ;
 569 ;CHECK-LABEL: {{^}}test_set_inactive2:
 570 ;CHECK: s_wqm_b64 exec, exec
 571 ;CHECK: buffer_load_dword
 572 ;CHECK: buffer_load_dword
 573 define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
 574 main_body:
 575   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 576   %src1.0 = bitcast float %src1 to i32
 577   %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
 578   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 579   %src0.0 = bitcast float %src0 to i32
 580   %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
 581   %out = add i32 %src0.1, %src1.1
 582   %out.0 = bitcast i32 %out to float
 583   call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 584   ret void
 585 }
 586
 587 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 588 ; exact.
 589 ;
 590 ; Note: In this particular case, the save-and-restore could be avoided if the
 591 ; analysis understood that the two branches of the if-else are mutually
 592 ; exclusive.
     ;
     ; %IF performs two dependent image samples; %ELSE performs a buffer store.
     ; The directives expect %ELSE to be emitted first, with the store bracketed
     ; by s_and_saveexec_b64 (against the original exec mask) and a restore.
 593 ;
 594 ;CHECK-LABEL: {{^}}test_control_flow_0:
 595 ;CHECK-NEXT: ; %main_body
 596 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 597 ;CHECK-NEXT: s_wqm_b64 exec, exec
 598 ;CHECK: %ELSE
 599 ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
 600 ;CHECK: store
 601 ;CHECK: s_mov_b64 exec, [[SAVED]]
 602 ;CHECK: %IF
 603 ;CHECK: image_sample
 604 ;CHECK: image_sample
 605 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
 606 main_body:
 607   %cmp = icmp eq i32 %z, 0
 608   br i1 %cmp, label %IF, label %ELSE
 609
 610 IF:
 611   %c.bc = bitcast i32 %c to float
 612   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 613   %tex0 = extractelement <4 x float> %tex, i32 0
 614   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 615   %data.if = extractelement <4 x float> %dtex, i32 0
 616   br label %END
 617
 618 ELSE:
 619   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
 620   br label %END
 621
 622 END:
 623   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
 624   ret float %r
 625 }
 626
 627 ; Reverse branch order compared to the previous test.
     ;
     ; The branch targets are swapped: a true condition now goes to %ELSE (the
     ; buffer store) and a false one to %IF (the two image samples). The
     ; directives expect %IF first, then a %Flow block in which the original
     ; exec mask is re-applied before branching around %ELSE.
 628 ;
 629 ;CHECK-LABEL: {{^}}test_control_flow_1:
 630 ;CHECK-NEXT: ; %main_body
 631 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 632 ;CHECK-NEXT: s_wqm_b64 exec, exec
 633 ;CHECK: %IF
 634 ;CHECK: image_sample
 635 ;CHECK: image_sample
 636 ;CHECK: %Flow
 637 ;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
 638 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
 639 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
 640 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
 641 ;CHECK-NEXT: s_cbranch_execz [[END_BB:BB[0-9]+_[0-9]+]]
 642 ;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE
 643 ;CHECK: store_dword
 644 ;CHECK: [[END_BB]]: ; %END
 645 ;CHECK: s_or_b64 exec, exec,
 646 ;CHECK: v_mov_b32_e32 v0
 647 ;CHECK: ; return
 648 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
 649 main_body:
 650   %cmp = icmp eq i32 %z, 0
 651   br i1 %cmp, label %ELSE, label %IF
 652
 653 IF:
 654   %c.bc = bitcast i32 %c to float
 655   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 656   %tex0 = extractelement <4 x float> %tex, i32 0
 657   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 658   %data.if = extractelement <4 x float> %dtex, i32 0
 659   br label %END
 660
 661 ELSE:
 662   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
 663   br label %END
 664
 665 END:
 666   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
 667   ret float %r
 668 }
 669
 670 ; Check that branch conditions are properly marked as needing WQM...
     ;
     ; The branch condition is computed from a buffer load that sits between two
     ; buffer stores, and the branch selects the coordinate of the final image
     ; sample. The directives expect exec to alternate: restored for each store,
     ; and s_wqm_b64 again before the load and before the compare.
 671 ;
 672 ;CHECK-LABEL: {{^}}test_control_flow_2:
 673 ;CHECK-NEXT: ; %main_body
 674 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 675 ;CHECK-NEXT: s_wqm_b64 exec, exec
 676 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 677 ;CHECK: store
 678 ;CHECK: s_wqm_b64 exec, exec
 679 ;CHECK: load
 680 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 681 ;CHECK: store
 682 ;CHECK: s_wqm_b64 exec, exec
 683 ;CHECK: v_cmp
 684 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
 685 main_body:
 686   %idx.1 = extractelement <3 x i32> %idx, i32 0
 687   %data.1 = extractelement <2 x float> %data, i32 0
 688   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
 689
 690   ; The load that determines the branch (and should therefore be WQM) is
 691   ; surrounded by stores that require disabled WQM.
 692   %idx.2 = extractelement <3 x i32> %idx, i32 1
 693   %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)
 694
 695   %idx.3 = extractelement <3 x i32> %idx, i32 2
 696   %data.3 = extractelement <2 x float> %data, i32 1
 697   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)
 698
 699   %cc = fcmp ogt float %z, 0.0
 700   br i1 %cc, label %IF, label %ELSE
 701
 702 IF:
 703   %coord.IF = mul i32 %coord, 3
 704   br label %END
 705
 706 ELSE:
 707   %coord.ELSE = mul i32 %coord, 4
 708   br label %END
 709
 710 END:
 711   %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
 712   %coord.END.bc = bitcast i32 %coord.END to float
 713   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 714   ret <4 x float> %tex
 715 }
 716
 717 ; ... but only if they really do need it.
     ;
     ; Here the branch only selects between two multiplies of the sampled value
     ; and no image sample follows it. The directives expect the exec restore
     ; between the two samples, with the compare and the store (in either order)
     ; after the second sample.
 718 ;
 719 ;CHECK-LABEL: {{^}}test_control_flow_3:
 720 ;CHECK-NEXT: ; %main_body
 721 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 722 ;CHECK-NEXT: s_wqm_b64 exec, exec
 723 ;CHECK: image_sample
 724 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 725 ;CHECK: image_sample
 726 ;CHECK-DAG: v_cmp
 727 ;CHECK-DAG: store
 728 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
 729 main_body:
 730   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 731   %tex0 = extractelement <4 x float> %tex, i32 0
 732   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 733   %dtex.1 = extractelement <4 x float> %dtex, i32 0
 734   call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
 735
 736   %cc = fcmp ogt float %dtex.1, 0.0
 737   br i1 %cc, label %IF, label %ELSE
 738
 739 IF:
 740   %tex.IF = fmul float %dtex.1, 3.0
 741   br label %END
 742
 743 ELSE:
 744   %tex.ELSE = fmul float %dtex.1, 4.0
 745   br label %END
 746
 747 END:
 748   %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
 749   ret float %tex.END
 750 }
 751
 752 ; Another test that failed at some point because of terminator handling.
     ;
     ; %IF contains only a buffer load and a buffer store; the two dependent
     ; image samples are in %END, which is reached on both paths. The directives
     ; expect the load and store bracketed by s_and_saveexec_b64 (against the
     ; original exec mask) and a restore, with both samples after %END.
 753 ;
 754 ;CHECK-LABEL: {{^}}test_control_flow_4:
 755 ;CHECK-NEXT: ; %main_body
 756 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 757 ;CHECK-NEXT: s_wqm_b64 exec, exec
 758 ;CHECK: %IF
 759 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]],  [[ORIG]]
 760 ;CHECK: load
 761 ;CHECK: store
 762 ;CHECK: s_mov_b64 exec, [[SAVE]]
 763 ;CHECK: %END
 764 ;CHECK: image_sample
 765 ;CHECK: image_sample
 766 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
 767 main_body:
 768   %cond = icmp eq i32 %y, 0
 769   br i1 %cond, label %IF, label %END
 770
 771 IF:
 772   %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
 773   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
 774   br label %END
 775
 776 END:
 777   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 778   %tex0 = extractelement <4 x float> %tex, i32 0
 779   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 780   ret <4 x float> %dtex
 781 }
 782
 783 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
 784 ; The IR has a buffer store on each side of the kill, one sample before it and two after it.
 785 ;CHECK-LABEL: {{^}}test_kill_0:
 786 ;CHECK-NEXT: ; %main_body
 787 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 788 ;CHECK-NEXT: s_wqm_b64 exec, exec
 789 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 790 ;CHECK: image_sample
 791 ;CHECK: buffer_store_dword
 792 ;CHECK: s_wqm_b64 exec, exec
 793 ;CHECK: v_cmp_
 794 ;CHECK: image_sample
 795 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 796 ;CHECK: image_sample
 797 ;CHECK: buffer_store_dword
 798 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
 799 main_body:
 800   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 801   %idx.0 = extractelement <2 x i32> %idx, i32 0
 802   %data.0 = extractelement <2 x float> %data, i32 0
 803   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0) ; store issued before the kill
 804
 805   %z.cmp = fcmp olt float %z, 0.0
 806   call void @llvm.amdgcn.kill(i1 %z.cmp) ; kill condition is the compare of %z against 0.0
 807
 808   %idx.1 = extractelement <2 x i32> %idx, i32 1
 809   %data.1 = extractelement <2 x float> %data, i32 1
 810   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0) ; store issued after the kill
 811   %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 812   %tex2.0 = extractelement <4 x float> %tex2, i32 0
 813   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of %tex2
 814   %out = fadd <4 x float> %tex, %dtex ; combines the sample taken before the kill with the last one
 815
 816   ret <4 x float> %out
 817 }
 818
 819 ; ... but only if WQM is necessary.
 820 ; Both samples precede the store and the kill; no "wqm" may be printed after the second sample.
 821 ; CHECK-LABEL: {{^}}test_kill_1:
 822 ; CHECK-NEXT: ; %main_body
 823 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 824 ; CHECK: s_wqm_b64 exec, exec
 825 ; CHECK: image_sample
 826 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 827 ; CHECK: image_sample
 828 ; CHECK-NOT: wqm
 829 ; CHECK-DAG: buffer_store_dword
 830 ; CHECK-DAG: v_cmp_
 831 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
 832 main_body:
 833   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 834   %tex0 = extractelement <4 x float> %tex, i32 0
 835   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of the first sample
 836
 837   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) ; stores an input argument, not a sampled value
 838
 839   %z.cmp = fcmp olt float %z, 0.0
 840   call void @llvm.amdgcn.kill(i1 %z.cmp) ; the kill is the last operation before the return
 841
 842   ret <4 x float> %dtex
 843 }
 844
 845 ; Check prolog shaders.
 846 ; The function uses attribute group #5; the add is expected between s_wqm_b64 and the exec restore.
 847 ; CHECK-LABEL: {{^}}test_prolog_1:
 848 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 849 ; CHECK: s_wqm_b64 exec, exec
 850 ; CHECK: v_add_f32_e32 v0,
 851 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 852 define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
 853 main_body:
 854   %s = fadd float %a, %b ; the only computation; there is no sample or store in this function
 855   ret float %s
 856 }
 857
 858 ; CHECK-LABEL: {{^}}test_loop_vcc:
 859 ; CHECK-NEXT: ; %entry
 860 ; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 861 ; CHECK: s_wqm_b64 exec, exec
 862 ; CHECK: v_mov
 863 ; CHECK: v_mov
 864 ; CHECK: v_mov
 865 ; CHECK: v_mov
 866 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 867 ; CHECK: image_store
 868 ; CHECK: s_wqm_b64 exec, exec
 869 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
 870 ; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
 871
 872 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 873 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
 874 ; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
 875 ; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
 876 ; CHECK: s_cbranch_vccz [[LOOPHDR]]
 877
 878 ; CHECK: ; %break
 879 ; CHECK: ; return
 880 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 881 entry:
 882   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0) ; image store placed ahead of the loop
 883   br label %loop
 884
 885 loop:
 886   %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] ; float counter: starts at 0.0, advanced in %body
 887   %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] ; carries the latest sample result (or %in on entry)
 888   %cc = fcmp ogt float %ctr.iv, 7.0 ; 7.0 is the 0x40e00000 constant captured as SEVEN above
 889   br i1 %cc, label %break, label %body
 890
 891 body:
 892   %c.iv0 = extractelement <4 x float> %c.iv, i32 0
 893   %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; each iteration samples at component 0 of the previous result
 894   %ctr.next = fadd float %ctr.iv, 2.0 ; matches the v_add_f32_e32 ..., 2.0 pattern above
 895   br label %loop
 896
 897 break:
 898   ret <4 x float> %c.iv
 899 }
 900
 901 ; Only intrinsic stores need exact execution -- other stores do not have
 902 ; externally visible effects and may require WQM for correctness.
 903 ; The patterns alternate: each intrinsic store follows a restore of exec, each alloca access follows s_wqm_b64.
 904 ; CHECK-LABEL: {{^}}test_alloca:
 905 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 906 ; CHECK: s_wqm_b64 exec, exec
 907
 908 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 909 ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
 910 ; CHECK: s_wqm_b64 exec, exec
 911 ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 912 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 913 ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
 914 ; CHECK: s_wqm_b64 exec, exec
 915 ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 916
 917 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 918 ; CHECK: image_sample
 919 ; CHECK: buffer_store_dwordx4
 920 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
 921 entry:
 922   %array = alloca [32 x i32], align 4, addrspace(5)
 923
 924   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) ; intrinsic store #1 (raw form)
 925
 926   %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
 927   store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4 ; ordinary IR store into element 0 of the alloca
 928
 929   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) ; intrinsic store #2 (struct form)
 930
 931   %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
 932   %c = load i32, i32 addrspace(5)* %c.gep, align 4 ; ordinary IR load from the alloca at a runtime index
 933   %c.bc = bitcast i32 %c to float
 934   %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; the loaded value is the sample coordinate
 935   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0) ; intrinsic store #3 writes the sample result
 936
 937   ret void
 938 }
 939
 940 ; Must return to exact at the end of a non-void returning shader,
 941 ; otherwise the EXEC mask exported by the epilog will be wrong. This is true
 942 ; even if the shader has no kills, because a kill could have happened in a
 943 ; previous shader fragment.
 944 ; The function below contains only two dependent samples and the return.
 945 ; CHECK-LABEL: {{^}}test_nonvoid_return:
 946 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 947 ; CHECK: s_wqm_b64 exec, exec
 948 ;
 949 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 950 ; CHECK-NOT: exec
 951 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
 952   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
 953   %tex0 = extractelement <4 x float> %tex, i32 0
 954   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of the first sample
 955   ret <4 x float> %dtex ; non-void return: no further exec reference is allowed after the restore matched above
 956 }
 957
 958 ; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
 959 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 960 ; CHECK: s_wqm_b64 exec, exec
 961 ;
 962 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 963 ; CHECK-NOT: exec
 964 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
 965 entry:
 966   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
 967   %tex0 = extractelement <4 x float> %tex, i32 0
 968   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of the first sample
 969   %cc = icmp sgt i32 %c, 0 ; condition on an inreg argument
 970   br i1 %cc, label %if, label %else
 971
 972 if:
 973   store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
 974   unreachable ; this path ends without a return
 975
 976 else:
 977   ret <4 x float> %dtex ; the only path that returns a value
 978 }
 979
 980 ; Test awareness that s_wqm_b64 clobbers SCC.
 981 ; The s_cmp_ must come after s_wqm_b64 and be directly followed by the s_cbranch_scc that consumes it.
 982 ; CHECK-LABEL: {{^}}test_scc:
 983 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 984 ; CHECK: s_wqm_b64 exec, exec
 985 ; CHECK: s_cmp_
 986 ; CHECK-NEXT: s_cbranch_scc
 987 ; CHECK: ; %else
 988 ; CHECK: image_sample
 989 ; CHECK: ; %if
 990 ; CHECK: image_sample
 991 ; CHECK: ; %end
 992 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 993 define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 994 main_body:
 995   %cc = icmp sgt i32 %sel, 0 ; compare on an inreg argument; this is the branch condition
 996   br i1 %cc, label %if, label %else
 997
 998 if:
 999   %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; 1d sample on this path
1000   br label %end
1001
1002 else:
1003   %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; 2d sample on this path
1004   br label %end
1005
1006 end:
1007   %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
1008   call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) ; store in the join block, after both sample paths
1009   ret <4 x float> %r
1010 }
1011
1012 ; Check a case of a block being entirely WQM except for a bit of WWM.
1013 ; There was a bug where it forgot to enter and leave WWM.
1014 ; This variant marks the WWM value with the @llvm.amdgcn.wwm intrinsic.
1015 ;CHECK-LABEL: {{^}}test_wwm_within_wqm:
1016 ;CHECK: %IF
1017 ;CHECK: s_or_saveexec_b64 {{.*}}, -1
1018 ;CHECK: ds_swizzle
1019 ;
1020 define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1021 main_body:
1022   %c.bc = bitcast i32 %c to float
1023   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1024   %tex0 = extractelement <4 x float> %tex, i32 0
1025   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of the first sample
1026   %cmp = icmp eq i32 %z, 0
1027   br i1 %cmp, label %IF, label %ENDIF
1028
1029 IF:
1030   %dataf = extractelement <4 x float> %dtex, i32 0
1031   %data1 = fptosi float %dataf to i32
1032   %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) ; second operand 0 is the value given for inactive lanes
1033   %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) ; the ds_swizzle expected after s_or_saveexec_b64 ..., -1
1034   %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3) ; wraps the swizzle result in the wwm intrinsic
1035   %data4f = sitofp i32 %data4 to float
1036   br label %ENDIF
1037
1038 ENDIF:
1039   %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
1040   ret float %r
1041 }
1042
1043 ; Check that WWM is triggered by the strict_wwm intrinsic.
1044 ; Both loads and the float add that feeds the intrinsic are expected after s_or_saveexec_b64 ..., -1.
1045 ;CHECK-LABEL: {{^}}test_strict_wwm1:
1046 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
1047 ;CHECK: buffer_load_dword
1048 ;CHECK: buffer_load_dword
1049 ;CHECK: v_add_f32_e32
1050 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
1051 main_body:
1052   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
1053   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
1054   %out = fadd float %src0, %src1 ; sum of the two loaded values
1055   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) ; the sum is passed through strict.wwm
1056   ret float %out.0
1057 }
1058
1059 ; Same as above, but with an integer type.
1060 ; The loaded floats are bitcast to i32, added as integers, and the sum goes through strict.wwm.i32.
1061 ;CHECK-LABEL: {{^}}test_strict_wwm2:
1062 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
1063 ;CHECK: buffer_load_dword
1064 ;CHECK: buffer_load_dword
1065 ;CHECK: v_add_{{[iu]}}32_e32
1066 define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
1067 main_body:
1068   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
1069   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
1070   %src0.0 = bitcast float %src0 to i32
1071   %src1.0 = bitcast float %src1 to i32
1072   %out = add i32 %src0.0, %src1.0 ; integer add of the reinterpreted loads
1073   %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
1074   %out.1 = bitcast i32 %out.0 to float ; back to float only to fit the return type
1075   ret float %out.1
1076 }
1077
1078 ; Check that we don't leave WWM on for computations that don't require WWM,
1079 ; since that will lead to clobbering things that aren't supposed to be clobbered
1080 ; in cases like this.
1081 ; We enforce this by checking that v_add gets emitted in the same block as
1082 ; WWM computations.
1083 ; The second v_add_f32_e32 is expected only after exec has been restored from the saved mask.
1084 ;CHECK-LABEL: {{^}}test_strict_wwm3:
1085 ;CHECK: %if
1086 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
1087 ;CHECK: buffer_load_dword
1088 ;CHECK: v_add_f32_e32
1089 ;CHECK: s_mov_b64 exec, [[ORIG]]
1090 ;CHECK: v_add_f32_e32
1091 ;CHECK: %endif
1092 define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
1093 main_body:
1094   ; use mbcnt to make sure the branch is divergent
1095   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1096   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1097   %cc = icmp uge i32 %hi, 32
1098   br i1 %cc, label %endif, label %if
1099
1100 if:
1101   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
1102   %out = fadd float %src, %src ; this add feeds strict.wwm
1103   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
1104   %out.1 = fadd float %src, %out.0 ; this add consumes the strict.wwm result instead of feeding it
1105   br label %endif
1106
1107 endif:
1108   %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1109   ret float %out.2
1110 }
1111
1112 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
1113 ; write could clobber disabled channels in the non-WWM one.
1114 ; We enforce this by checking that v_mov gets emitted in the same block as
1115 ; WWM computations.
1116 ; The v_mov_b32_e32 must be the instruction directly after the exec restore.
1117 ;CHECK-LABEL: {{^}}test_strict_wwm4:
1118 ;CHECK: %if
1119 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
1120 ;CHECK: buffer_load_dword
1121 ;CHECK: v_add_f32_e32
1122 ;CHECK: s_mov_b64 exec, [[ORIG]]
1123 ;CHECK-NEXT: v_mov_b32_e32
1124 ;CHECK: %endif
1125 define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
1126 main_body:
1127   ; use mbcnt to make sure the branch is divergent
1128   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1129   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1130   %cc = icmp uge i32 %hi, 32
1131   br i1 %cc, label %endif, label %if
1132
1133 if:
1134   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
1135   %out = fadd float %src, %src ; this add feeds strict.wwm
1136   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
1137   br label %endif
1138
1139 endif:
1140   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] ; merges the strict.wwm result with the constant from the other path
1141   ret float %out.1
1142 }
1143
1144 ; Make sure the transition from Exact to WWM then WQM works properly.
1145 ; Expected order: a load and store, then the s_or_saveexec_b64/s_mov_b64 pair, then s_wqm_b64.
1146 ;CHECK-LABEL: {{^}}test_strict_wwm5:
1147 ;CHECK: buffer_load_dword
1148 ;CHECK: buffer_store_dword
1149 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
1150 ;CHECK: buffer_load_dword
1151 ;CHECK: v_add_f32_e32
1152 ;CHECK: s_mov_b64 exec, [[ORIG]]
1153 ;CHECK: s_wqm_b64 exec, exec
1154 define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
1155 main_body:
1156   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
1157   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; stores the first load back at the same index
1158   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
1159   %temp = fadd float %src1, %src1 ; this add feeds strict.wwm
1160   %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
1161   %out = fadd float %temp.0, %temp.0 ; this add feeds the wqm intrinsic
1162   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
1163   ret float %out.0
1164 }
1165
1166 ; Check that WWM is turned on correctly across basic block boundaries.
1167 ; if..then..endif version
1168 ; The volatile global loads are matched with one pattern covering both run targets.
1169 ;CHECK-LABEL: {{^}}test_strict_wwm6_then:
1170 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
1171 ;CHECK: {{buffer|flat}}_load_dword
1172 ; (the SI run, verde, is expected to print buffer_load_dword; the VI run, tonga, flat_load_dword)
1173 ;CHECK: s_mov_b64 exec, [[ORIG]]
1174 ;CHECK: %if
1175 ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
1176 ;CHECK: {{buffer|flat}}_load_dword
1177 ; (same target split as for the first load)
1178 ;CHECK: v_add_f32_e32
1179 ;CHECK: s_mov_b64 exec, [[ORIG2]]
1180 ;CHECK: %endif
1181 define amdgpu_ps float @test_strict_wwm6_then() {
1182 main_body:
1183   %src0 = load volatile float, float addrspace(1)* undef ; loaded in the entry block, consumed by the add in %if
1184   ; use mbcnt to make sure the branch is divergent
1185   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1186   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1187   %cc = icmp uge i32 %hi, 32
1188   br i1 %cc, label %endif, label %if
1189
1190 if:
1191   %src1 = load volatile float, float addrspace(1)* undef
1192   %out = fadd float %src0, %src1 ; combines a value from each block, then feeds strict.wwm
1193   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
1194   br label %endif
1195
1196 endif:
1197   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1198   ret float %out.1
1199 }
1200
1201 ; Check that WWM is turned on correctly across basic block boundaries.
1202 ; loop version
1203 ; The volatile global loads are matched with one pattern covering both run targets.
1204 ;CHECK-LABEL: {{^}}test_strict_wwm6_loop:
1205 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
1206 ;CHECK: {{buffer|flat}}_load_dword
1207 ; (the SI run, verde, is expected to print buffer_load_dword; the VI run, tonga, flat_load_dword)
1208 ;CHECK: s_mov_b64 exec, [[ORIG]]
1209 ;CHECK: %loop
1210 ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
1211 ;CHECK: {{buffer|flat}}_load_dword
1212 ; (same target split as for the first load)
1213 ;CHECK: s_mov_b64 exec, [[ORIG2]]
1214 ;CHECK: %endloop
1215 define amdgpu_ps float @test_strict_wwm6_loop() {
1216 main_body:
1217   %src0 = load volatile float, float addrspace(1)* undef ; loaded before the loop, consumed by the add inside it
1218   ; use mbcnt to make sure the branch is divergent
1219   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1220   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1221   br label %loop
1222
1223 loop:
1224   %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] ; starts from the mbcnt.lo result and counts down
1225   %src1 = load volatile float, float addrspace(1)* undef
1226   %out = fadd float %src0, %src1 ; combines the pre-loop value with the per-iteration load
1227   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
1228   %counter.1 = sub i32 %counter, 1
1229   %cc = icmp ne i32 %counter.1, 0 ; keep looping until the counter reaches zero
1230   br i1 %cc, label %loop, label %endloop
1231
1232 endloop:
1233   ret float %out.0
1234 }
1235
1236 ; Check that @llvm.amdgcn.set.inactive disables WWM.
1237 ; A v_mov_b32_e32 is expected between two s_not_b64 exec, ahead of the s_or_saveexec_b64 ..., -1 and the add.
1238 ;CHECK-LABEL: {{^}}test_strict_wwm_set_inactive1:
1239 ;CHECK: buffer_load_dword
1240 ;CHECK: s_not_b64 exec, exec
1241 ;CHECK: v_mov_b32_e32
1242 ;CHECK: s_not_b64 exec, exec
1243 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
1244 ;CHECK: v_add_{{[iu]}}32_e32
1245 define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
1246 main_body:
1247   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
1248   %src.0 = bitcast float %src to i32
1249   %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) ; second operand 0 is the value given for inactive lanes
1250   %out = add i32 %src.1, %src.1 ; integer add that feeds strict.wwm
1251   %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
1252   %out.1 = bitcast i32 %out.0 to float
1253   call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) ; result is written back at the index it was loaded from
1254   ret void
1255 }
1256
1257 ; Check a case of a block being entirely WQM except for a bit of WWM.
1258 ; There was a bug where it forgot to enter and leave WWM.
1259 ; This variant marks the WWM value with the @llvm.amdgcn.strict.wwm intrinsic.
1260 ;CHECK-LABEL: {{^}}test_strict_wwm_within_wqm:
1261 ;CHECK: %IF
1262 ;CHECK: s_or_saveexec_b64 {{.*}}, -1
1263 ;CHECK: ds_swizzle
1264 ;
1265 define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1266 main_body:
1267   %c.bc = bitcast i32 %c to float
1268   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1269   %tex0 = extractelement <4 x float> %tex, i32 0
1270   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of the first sample
1271   %cmp = icmp eq i32 %z, 0
1272   br i1 %cmp, label %IF, label %ENDIF
1273
1274 IF:
1275   %dataf = extractelement <4 x float> %dtex, i32 0
1276   %data1 = fptosi float %dataf to i32
1277   %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) ; second operand 0 is the value given for inactive lanes
1278   %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) ; the ds_swizzle expected after s_or_saveexec_b64 ..., -1
1279   %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3) ; wraps the swizzle result in the strict.wwm intrinsic
1280   %data4f = sitofp i32 %data4 to float
1281   br label %ENDIF
1282
1283 ENDIF:
1284   %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
1285   ret float %r
1286 }
1287
1288 ; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
1289 ; Inside %IF an exec save plus s_wqm_b64 is expected ahead of the ds_swizzle.
1290 ;CHECK-LABEL: {{^}}test_strict_wqm_within_wqm:
1291 ;CHECK: %IF
1292 ;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
1293 ;CHECK: s_wqm_b64 exec, exec
1294 ;CHECK: ds_swizzle
1295 ;
1296 define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1297 main_body:
1298   %c.bc = bitcast i32 %c to float
1299   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1300   %tex0 = extractelement <4 x float> %tex, i32 0
1301   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is component 0 of the first sample
1302   %cmp = icmp eq i32 %z, 0
1303   br i1 %cmp, label %IF, label %ENDIF
1304
1305 IF:
1306   %dataf = extractelement <4 x float> %dtex, i32 0
1307   %data1 = fptosi float %dataf to i32
1308   %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079) ; no set.inactive in front of the swizzle in this test
1309   %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2) ; wraps the swizzle result in the strict.wqm intrinsic
1310   %data3f = sitofp i32 %data3 to float
1311   br label %ENDIF
1312
1313 ENDIF:
1314   %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
1315   ret float %r
1316 }
1317
1318 ;CHECK-LABEL: {{^}}test_strict_wqm_strict_wwm_wqm:
1319 ;CHECK: buffer_store_dword
1320
1321 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
1322 ;CHECK: s_wqm_b64 exec, exec
1323 ;CHECK: buffer_load_dword
1324 ;CHECK: s_mov_b64 exec, [[ORIG]]
1325
1326 ;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
1327 ;CHECK: buffer_load_dword
1328 ;CHECK: s_mov_b64 exec, [[ORIG2]]
1329
1330 ;CHECK: s_mov_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], exec
1331 ;CHECK: s_wqm_b64 exec, exec
1332 ;CHECK: v_add
1333 ;CHECK: s_mov_b64 exec, [[ORIG3]]
1334
1335 ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
1336 ;CHECK: s_wqm_b64 exec, exec
1337 ;CHECK: image_sample
1338
1339 define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
1340 main_body:
1341   call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) ; first operation: a store of the input
1342   %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) ; reads back the location just stored
1343   %temp = fadd float %reload, %reload
1344   %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) ; strict.wqm comes first in this test
1345   %temp3 = fadd float %temp2, %temp2
1346   %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
1347   %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm) ; strict.wwm comes second
1348   %temp5 = fadd float %temp3, %temp4 ; merges the strict.wqm-derived and strict.wwm values
1349   %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0) ; the merged value is the sample coordinate
1350   call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
1351   %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
1352   ret float %out
1353 }
1354
1355 ;CHECK-LABEL: {{^}}test_strict_wwm_strict_wqm_wqm:
1356 ;CHECK: buffer_store_dword
1357
1358 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
1359 ;CHECK: buffer_load_dword
1360 ;CHECK: s_mov_b64 exec, [[ORIG]]
1361
1362 ;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
1363 ;CHECK: s_wqm_b64 exec, exec
1364 ;CHECK: buffer_load_dword
1365 ;CHECK: s_mov_b64 exec, [[ORIG2]]
1366
1367 ;CHECK: s_or_saveexec_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], -1
1368 ;CHECK: v_add
1369 ;CHECK: s_mov_b64 exec, [[ORIG3]]
1370
1371 ;CHECK: s_wqm_b64 exec, exec
1372 ;CHECK: image_sample
1373 define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
1374 main_body:
1375   call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) ; first operation: a store of the input
1376   %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
1377   %temp = fadd float %reload, %reload
1378   %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) ; strict.wwm comes first in this test
1379   %temp3 = fadd float %temp2, %temp2
1380   %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) ; despite its name, this value feeds strict.wqm on the next line
1381   %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) ; strict.wqm comes second
1382   %temp5 = fadd float %temp3, %temp4 ; merges the strict.wwm-derived and strict.wqm values
1383   %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) ; the merged value is the sample coordinate
1384   call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
1385   %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
1386   ret float %out
1387 }
1388
1389 ;CHECK-LABEL: {{^}}test_wqm_strict_wqm_wqm:
1390 ;CHECK: buffer_store_dword
1391
1392 ;CHECK: s_wqm_b64 exec, exec
1393
1394 ;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
1395 ;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec
1396 ;CHECK: s_wqm_b64 exec, exec
1397 ;CHECK: buffer_load_dword
1398 ;CHECK: s_mov_b64 exec, [[ORIG2]]
1399
1400 ;CHECK: image_sample
1401
1402 define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
1403 main_body:
1404   call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) ; first operation: a store of the input
1405   %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
1406   %temp = fadd float %reload, %reload
1407   %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) ; first sample, before the strict.wqm part
1408   %temp2 = fadd float %tex, %tex
1409   %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) ; despite its name, this value feeds strict.wqm on the next line
1410   %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
1411   %temp4 = fadd float %temp2, %temp3 ; merges the first sample's result with the strict.wqm value
1412   %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) ; second sample, after the strict.wqm part
1413   call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
1414   %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
1415   ret float %out
1416 }
1417
1418 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
1419 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
1420 ; Buffer store and load intrinsics, in struct and raw forms.
1421 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2 ; NOTE(review): #2 is defined below as nounwind readonly, yet this and the next three declarations are stores -- confirm
1422 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
1423 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
1424 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
1425 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
1426 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
1427 ; Image load/sample, kill, wqm/wwm and the remaining intrinsics used by the tests.
1428 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
1429 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
1430 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
1431 declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
1432 declare void @llvm.amdgcn.kill(i1) #1
1433 declare float @llvm.amdgcn.wqm.f32(float) #3
1434 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
1435 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
1436 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
1437 declare float @llvm.amdgcn.wwm.f32(float) #3
1438 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
1439 declare float @llvm.amdgcn.strict.wqm.f32(float) #3
1440 declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
1441 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
1442 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
1443 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
1444 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
1445 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
1446 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
1447 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
1448 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) ; the only declaration here without an attribute group
1449
1450 attributes #1 = { nounwind } ; NOTE(review): call sites in this file also reference #0, which this list does not define -- confirm that is intended
1451 attributes #2 = { nounwind readonly }
1452 attributes #3 = { nounwind readnone }
1453 attributes #4 = { nounwind readnone convergent } ; used by the set.inactive declaration
1454 attributes #5 = { "amdgpu-ps-wqm-outputs" } ; used by test_prolog_1
1455 attributes #6 = { nounwind "InitialPSInputAddr"="2" } ; used by test2