llvm/test/CodeGen/AMDGPU/wqm.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
   4
   5 ; Check that WQM isn't triggered by image load/store intrinsics.
   6 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
   7 ; GFX9-W64-LABEL: test1:
   8 ; GFX9-W64:       ; %bb.0: ; %main_body
   9 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
  10 ; GFX9-W64-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm
  11 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  12 ; GFX9-W64-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
  13 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  14 ; GFX9-W64-NEXT:    ; return to shader part epilog
  15 ;
  16 ; GFX10-W32-LABEL: test1:
  17 ; GFX10-W32:       ; %bb.0: ; %main_body
  18 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
  19 ; GFX10-W32-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
  20 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
  21 ; GFX10-W32-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
  22 ; GFX10-W32-NEXT:    ; return to shader part epilog
  23 main_body:  ; Load a texel and store it straight back; no s_wqm appears in the expected output above.
  24   %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  25   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)  ; same coordinate and resource as the load
  26   ret <4 x float> %tex
  27 }
  28
  29 ; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
  30 define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
  31 ; GFX9-W64-LABEL: test2:
  32 ; GFX9-W64:       ; %bb.0: ; %main_body
  33 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
  34 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
  35 ; GFX9-W64-NEXT:    s_mov_b32 m0, s3
  36 ; GFX9-W64-NEXT:    s_nop 0
  37 ; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
  38 ; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
  39 ; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
  40 ; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
  41 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
  42 ; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
  43 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  44 ; GFX9-W64-NEXT:    ; return to shader part epilog
  45 ;
  46 ; GFX10-W32-LABEL: test2:
  47 ; GFX10-W32:       ; %bb.0: ; %main_body
  48 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
  49 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
  50 ; GFX10-W32-NEXT:    s_mov_b32 m0, s3
  51 ; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
  52 ; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
  53 ; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
  54 ; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
  55 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
  56 ; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
  57 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
  58 ; GFX10-W32-NEXT:    ; return to shader part epilog
  59 main_body:  ; Interpolate both sample coordinates, then sample; the expected output brackets only the v_interp ops with s_wqm / s_and exec.
  60   %inst23 = extractelement <2 x float> %pos, i32 0
  61   %inst24 = extractelement <2 x float> %pos, i32 1
  62   %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  63   %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  64   %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  65   %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  66   %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  67   ret <4 x float> %tex
  68 }
  69
  70 ; ... but disabled for stores (and, in this simple case, not re-enabled) ...
  71 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
  72 ; GFX9-W64-LABEL: test3:
  73 ; GFX9-W64:       ; %bb.0: ; %main_body
  74 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
  75 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
  76 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
  77 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
  78 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  79 ; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
  80 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
  81 ; GFX9-W64-NEXT:    ; return to shader part epilog
  82 ;
  83 ; GFX10-W32-LABEL: test3:
  84 ; GFX10-W32:       ; %bb.0: ; %main_body
  85 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
  86 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
  87 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
  88 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
  89 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
  90 ; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
  91 ; GFX10-W32-NEXT:    ; return to shader part epilog
  92 main_body:  ; Sample, then buffer-store the result; the expected output restores exec right after s_wqm, before the sample.
  93   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  94   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  95   %tex.2 = extractelement <4 x i32> %tex.1, i32 0  ; first result channel, reused as the store's index operand
  96
  97   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
  98
  99   ret <4 x float> %tex
 100 }
 101
 102 define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
 103 ; GFX9-W64-LABEL: test3_ptr_buf:
 104 ; GFX9-W64:       ; %bb.0: ; %main_body
 105 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
 106 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 107 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
 108 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 109 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 110 ; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
 111 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 112 ; GFX9-W64-NEXT:    ; return to shader part epilog
 113 ;
 114 ; GFX10-W32-LABEL: test3_ptr_buf:
 115 ; GFX10-W32:       ; %bb.0: ; %main_body
 116 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 117 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 118 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 119 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 120 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 121 ; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
 122 ; GFX10-W32-NEXT:    ; return to shader part epilog
 123 main_body:  ; Sample, then store through the ptr addrspace(8) buffer intrinsic; exec is restored right after s_wqm in the expected output.
 124   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 125   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
 126   %tex.2 = extractelement <4 x i32> %tex.1, i32 0  ; first result channel, reused as the store's index operand
 127
 128   call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0)
 129
 130   ret <4 x float> %tex
 131 }
 132
 133 ; ... and disabled for export.
 134 define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
 135 ; GFX9-W64-LABEL: test3x:
 136 ; GFX9-W64:       ; %bb.0: ; %main_body
 137 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
 138 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 139 ; GFX9-W64-NEXT:    s_mov_b32 m0, s3
 140 ; GFX9-W64-NEXT:    s_nop 0
 141 ; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
 142 ; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
 143 ; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
 144 ; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
 145 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 146 ; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
 147 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 148 ; GFX9-W64-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
 149 ; GFX9-W64-NEXT:    s_endpgm
 150 ;
 151 ; GFX10-W32-LABEL: test3x:
 152 ; GFX10-W32:       ; %bb.0: ; %main_body
 153 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 154 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 155 ; GFX10-W32-NEXT:    s_mov_b32 m0, s3
 156 ; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
 157 ; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
 158 ; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
 159 ; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
 160 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 161 ; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
 162 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 163 ; GFX10-W32-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
 164 ; GFX10-W32-NEXT:    s_endpgm
 165 main_body:  ; Interpolate, sample, then export all four channels; exec is restored before image_sample and exp in the expected output.
 166   %inst23 = extractelement <2 x float> %pos, i32 0
 167   %inst24 = extractelement <2 x float> %pos, i32 1
 168   %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
 169   %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
 170   %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
 171   %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
 172   %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
 173   %tex.0 = extractelement <4 x float> %tex, i32 0
 174   %tex.1 = extractelement <4 x float> %tex, i32 1
 175   %tex.2 = extractelement <4 x float> %tex, i32 2
 176   %tex.3 = extractelement <4 x float> %tex, i32 3
 177   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
 178   ret void
 179 }
 180
 181 ; Check that WQM is re-enabled when required.
 182 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
 183 ; GFX9-W64-LABEL: test4:
 184 ; GFX9-W64:       ; %bb.0: ; %main_body
 185 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
 186 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 187 ; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
 188 ; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
 189 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
 190 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 191 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 192 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 193 ; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 194 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 195 ; GFX9-W64-NEXT:    ; return to shader part epilog
 196 ;
 197 ; GFX10-W32-LABEL: test4:
 198 ; GFX10-W32:       ; %bb.0: ; %main_body
 199 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 200 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 201 ; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
 202 ; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 203 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 204 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 205 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 206 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 207 ; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 208 ; GFX10-W32-NEXT:    ; return to shader part epilog
 209 main_body:  ; %ptr and %data are not referenced in this body.
 210   %c.1 = mul i32 %c, %d
 211
 212   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)  ; indexed by %c.1; emitted after both samples in the expected output
 213   %c.1.bc = bitcast i32 %c.1 to float
 214   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0  ; first sample: issued before s_and exec in the expected output
 215   %tex0 = extractelement <4 x float> %tex, i32 0
 216   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0  ; second sample takes channel 0 of the first; issued after exec is restored
 217   ret <4 x float> %dtex
 218 }
 219
 220 define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
 221 ; GFX9-W64-LABEL: test4_ptr_buf:
 222 ; GFX9-W64:       ; %bb.0: ; %main_body
 223 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
 224 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 225 ; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
 226 ; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
 227 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
 228 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 229 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 230 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 231 ; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 232 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 233 ; GFX9-W64-NEXT:    ; return to shader part epilog
 234 ;
 235 ; GFX10-W32-LABEL: test4_ptr_buf:
 236 ; GFX10-W32:       ; %bb.0: ; %main_body
 237 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
 238 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 239 ; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
 240 ; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 241 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 242 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 243 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 244 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 245 ; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 246 ; GFX10-W32-NEXT:    ; return to shader part epilog
 247 main_body:  ; %ptr and %data are not referenced in this body.
 248   %c.1 = mul i32 %c, %d
 249
 250   call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0)  ; indexed by %c.1; emitted after both samples in the expected output
 251   %c.1.bc = bitcast i32 %c.1 to float
 252   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0  ; first sample: issued before s_and exec in the expected output
 253   %tex0 = extractelement <4 x float> %tex, i32 0
 254   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0  ; second sample takes channel 0 of the first; issued after exec is restored
 255   ret <4 x float> %dtex
 256 }
 257
 258 ; Check that WQM is triggered by the wqm intrinsic.
 259 ; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
 260 ; does not happen - the v_add should write the return reg directly.
 261 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
 262 ; GFX9-W64-LABEL: test5:
 263 ; GFX9-W64:       ; %bb.0: ; %main_body
 264 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 265 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 266 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
 267 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
 268 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 269 ; GFX9-W64-NEXT:    s_nop 0
 270 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 271 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 272 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
 273 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 274 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 275 ; GFX9-W64-NEXT:    ; return to shader part epilog
 276 ;
 277 ; GFX10-W32-LABEL: test5:
 278 ; GFX10-W32:       ; %bb.0: ; %main_body
 279 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 280 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 281 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
 282 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
 283 ; GFX10-W32-NEXT:    s_clause 0x1
 284 ; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 285 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 286 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 287 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
 288 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 289 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 290 ; GFX10-W32-NEXT:    ; return to shader part epilog
 291 main_body:  ; Two indexed loads feed an fadd whose result goes through llvm.amdgcn.wqm; loads and v_add sit between s_wqm and s_and exec in the expected output.
 292   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 293   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 294   %out = fadd float %src0, %src1
 295   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)  ; v_add writes v0 directly in the expected output (no trailing v_mov)
 296   ret float %out.0
 297 }
 298
 299 define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
 300 ; GFX9-W64-LABEL: test5_ptr_buf:
 301 ; GFX9-W64:       ; %bb.0: ; %main_body
 302 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 303 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 304 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
 305 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
 306 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 307 ; GFX9-W64-NEXT:    s_nop 0
 308 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 309 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 310 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
 311 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 312 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 313 ; GFX9-W64-NEXT:    ; return to shader part epilog
 314 ;
 315 ; GFX10-W32-LABEL: test5_ptr_buf:
 316 ; GFX10-W32:       ; %bb.0: ; %main_body
 317 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 318 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 319 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
 320 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
 321 ; GFX10-W32-NEXT:    s_clause 0x1
 322 ; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 323 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 324 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 325 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
 326 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 327 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 328 ; GFX10-W32-NEXT:    ; return to shader part epilog
 329 main_body:  ; Two ptr addrspace(8) buffer loads feed an fadd whose result goes through llvm.amdgcn.wqm; loads and v_add sit between s_wqm and s_and exec in the expected output.
 330   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 331   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 332   %out = fadd float %src0, %src1
 333   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)  ; v_add writes v0 directly in the expected output (no trailing v_mov)
 334   ret float %out.0
 335 }
 336
 337 ; Check that the wqm intrinsic works correctly for integers.
 338 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
 339 ; GFX9-W64-LABEL: test6:
 340 ; GFX9-W64:       ; %bb.0: ; %main_body
 341 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 342 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 343 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
 344 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
 345 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 346 ; GFX9-W64-NEXT:    s_nop 0
 347 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 348 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 349 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
 350 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 351 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 352 ; GFX9-W64-NEXT:    ; return to shader part epilog
 353 ;
 354 ; GFX10-W32-LABEL: test6:
 355 ; GFX10-W32:       ; %bb.0: ; %main_body
 356 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 357 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 358 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
 359 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
 360 ; GFX10-W32-NEXT:    s_clause 0x1
 361 ; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 362 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 363 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 364 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
 365 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 366 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 367 ; GFX10-W32-NEXT:    ; return to shader part epilog
 368 main_body:  ; Two indexed loads feed an fadd; the sum passes through the i32 form of llvm.amdgcn.wqm via bitcasts.
 369   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 370   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 371   %out = fadd float %src0, %src1
 372   %out.0 = bitcast float %out to i32  ; the bitcasts emit no instructions in the expected output
 373   %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
 374   %out.2 = bitcast i32 %out.1 to float
 375   ret float %out.2
 376 }
 377
 378 define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
 379 ; GFX9-W64-LABEL: test6_ptr_buf:
 380 ; GFX9-W64:       ; %bb.0: ; %main_body
 381 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 382 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 383 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
 384 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
 385 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 386 ; GFX9-W64-NEXT:    s_nop 0
 387 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 388 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 389 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
 390 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 391 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 392 ; GFX9-W64-NEXT:    ; return to shader part epilog
 393 ;
 394 ; GFX10-W32-LABEL: test6_ptr_buf:
 395 ; GFX10-W32:       ; %bb.0: ; %main_body
 396 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 397 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 398 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
 399 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
 400 ; GFX10-W32-NEXT:    s_clause 0x1
 401 ; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 402 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 403 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 404 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
 405 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 406 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 407 ; GFX10-W32-NEXT:    ; return to shader part epilog
 408 main_body:  ; Two ptr addrspace(8) buffer loads feed an fadd; the sum passes through the i32 form of llvm.amdgcn.wqm via bitcasts.
 409   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 410   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 411   %out = fadd float %src0, %src1
 412   %out.0 = bitcast float %out to i32  ; the bitcasts emit no instructions in the expected output
 413   %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
 414   %out.2 = bitcast i32 %out.1 to float
 415   ret float %out.2
 416 }
 417
 418 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
 419
 420 ; Check that WWM is triggered by the wwm intrinsic.
 421 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 422 ; GFX9-W64-LABEL: test_wwm1:
 423 ; GFX9-W64:       ; %bb.0: ; %main_body
 424 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
 425 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 426 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
 427 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 428 ; GFX9-W64-NEXT:    s_nop 0
 429 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 430 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 431 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
 432 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 433 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 434 ; GFX9-W64-NEXT:    ; return to shader part epilog
 435 ;
 436 ; GFX10-W32-LABEL: test_wwm1:
 437 ; GFX10-W32:       ; %bb.0: ; %main_body
 438 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
 439 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 440 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
 441 ; GFX10-W32-NEXT:    s_clause 0x1
 442 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 443 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 444 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 445 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
 446 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
 447 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 448 ; GFX10-W32-NEXT:    ; return to shader part epilog
 449 main_body:  ; Loads and fadd feed llvm.amdgcn.wwm; the expected output runs them between s_or_saveexec -1 and the exec restore.
 450   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 451   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 452   %out = fadd float %src0, %src1
 453   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)  ; result is copied into v0 only after exec is restored
 454   ret float %out.0
 455 }
 456
 457 ; Same as above, but with an integer type.
 458 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
 459 ; GFX9-W64-LABEL: test_wwm2:
 460 ; GFX9-W64:       ; %bb.0: ; %main_body
 461 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
 462 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 463 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
 464 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 465 ; GFX9-W64-NEXT:    s_nop 0
 466 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 467 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 468 ; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
 469 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 470 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 471 ; GFX9-W64-NEXT:    ; return to shader part epilog
 472 ;
 473 ; GFX10-W32-LABEL: test_wwm2:
 474 ; GFX10-W32:       ; %bb.0: ; %main_body
 475 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
 476 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 477 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
 478 ; GFX10-W32-NEXT:    s_clause 0x1
 479 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 480 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 481 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 482 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 483 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
 484 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 485 ; GFX10-W32-NEXT:    ; return to shader part epilog
 486 main_body:  ; Loads are bitcast to i32 and added; the sum goes through the i32 form of llvm.amdgcn.wwm.
 487   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 488   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 489   %src0.0 = bitcast float %src0 to i32
 490   %src1.0 = bitcast float %src1 to i32
 491   %out = add i32 %src0.0, %src1.0  ; integer add: v_add_u32 / v_add_nc_u32 in the expected output, before exec is restored
 492   %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
 493   %out.1 = bitcast i32 %out.0 to float
 494   ret float %out.1
 495 }
 496
 497 ; Check that we don't leave WWM on for computations that don't require WWM,
 498 ; since that will lead to clobbering things that aren't supposed to be clobbered
 499 ; in cases like this.
 500 ; We enforce this by checking that v_add gets emitted in the same block as
 501 ; WWM computations.
 502 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
 503 ; GFX9-W64-LABEL: test_wwm3:
 504 ; GFX9-W64:       ; %bb.0: ; %main_body
 505 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 506 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 507 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 508 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 509 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 510 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB13_2
 511 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
 512 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
 513 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 514 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 515 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 516 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
 517 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
 518 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 519 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
 520 ; GFX9-W64-NEXT:  .LBB13_2: ; %endif
 521 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 522 ; GFX9-W64-NEXT:    ; return to shader part epilog
 523 ;
 524 ; GFX10-W32-LABEL: test_wwm3:
 525 ; GFX10-W32:       ; %bb.0: ; %main_body
 526 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 527 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 528 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 529 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 530 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 531 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB13_2
 532 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
 533 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
 534 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 535 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 536 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 537 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
 538 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
 539 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
 540 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
 541 ; GFX10-W32-NEXT:  .LBB13_2: ; %endif
 542 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 543 ; GFX10-W32-NEXT:    ; return to shader part epilog
 544 main_body:
 545   ; use mbcnt to make sure the branch is divergent
 546   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 547   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 548   %cc = icmp uge i32 %hi, 16
 549   br i1 %cc, label %endif, label %if  ; lanes with %hi >= 16 skip straight to %endif
 550
 551 if:
 552   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
 553   %out = fadd float %src, %src
 554   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
 555   %out.1 = fadd float %src, %out.0  ; consumes the WWM result outside WWM: its v_add follows the exec restore in the expected output
 556   br label %endif
 557
 558 endif:
 559   %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]  ; 0.0 for lanes that bypassed %if
 560   ret float %out.2
 561 }
 562
 563 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
 564 ; write could clobber disabled channels in the non-WWM one.
 565 ; We enforce this by checking that v_mov gets emitted in the same block as
 566 ; WWM computations.
 567 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
 568 ; GFX9-W64-LABEL: test_wwm4:
 569 ; GFX9-W64:       ; %bb.0: ; %main_body
 570 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 571 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 572 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 573 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 574 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 575 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB14_2
 576 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
 577 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
 578 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 579 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 580 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 581 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
 582 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
 583 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 584 ; GFX9-W64-NEXT:  .LBB14_2: ; %endif
 585 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 586 ; GFX9-W64-NEXT:    ; return to shader part epilog
 587 ;
 588 ; GFX10-W32-LABEL: test_wwm4:
 589 ; GFX10-W32:       ; %bb.0: ; %main_body
 590 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 591 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 592 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 593 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 594 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 595 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB14_2
 596 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
 597 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
 598 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 599 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 600 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 601 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
 602 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
 603 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 604 ; GFX10-W32-NEXT:  .LBB14_2: ; %endif
 605 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 606 ; GFX10-W32-NEXT:    ; return to shader part epilog
 607 main_body:
 608   ; use mbcnt to make sure the branch is divergent
 609   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 610   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 611   %cc = icmp uge i32 %hi, 16
 612   br i1 %cc, label %endif, label %if  ; lanes with %hi >= 16 skip straight to %endif
 613
 614 if:
 615   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
 616   %out = fadd float %src, %src
 617   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)  ; WWM result feeds the phi directly
 618   br label %endif
 619
 620 endif:
 621   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]  ; expected output copies v1 into v0 after exec is restored, still inside the %if block
 622   ret float %out.1
 623 }
 624
 625 ; Make sure the transition from Exact to WWM then WQM works properly.
 626 define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
 627 ; GFX9-W64-LABEL: test_wwm5:
 628 ; GFX9-W64:       ; %bb.0: ; %main_body
 629 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 630 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
 631 ; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
 632 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 633 ; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
 634 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
 635 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
 636 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 637 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 638 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
 639 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
 640 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 641 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 642 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
 643 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 644 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 645 ; GFX9-W64-NEXT:    ; return to shader part epilog
 646 ;
 647 ; GFX10-W32-LABEL: test_wwm5:
 648 ; GFX10-W32:       ; %bb.0: ; %main_body
 649 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
 650 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 651 ; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
 652 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
 653 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
 654 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 655 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 656 ; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
 657 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
 658 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 659 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 660 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
 661 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 662 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 663 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 664 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
 665 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 666 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 667 ; GFX10-W32-NEXT:    ; return to shader part epilog
 668 main_body:
 669   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 670   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)  ; checks expect this store while exec still holds the incoming mask
 671   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 672   %temp = fadd float %src1, %src1
 673   %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)  ; checks expect the %src1 load and the %temp add between s_or_saveexec ..., -1 and the exec restore
 674   %out = fadd float %temp.0, %temp.0
 675   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)  ; checks expect s_wqm on exec before the %out add, then s_and with the exec saved at entry
 676   ret float %out.0
 677 }
 678
 679 ; Check that WWM is turned on correctly across basic block boundaries.
 680 ; if..then..endif version. NOTE(review): the SI-CHECK/VI-CHECK lines below use prefixes that the RUN lines at the top of the file do not enable; presumably stale - confirm.
 681 ;SI-CHECK: buffer_load_dword
 682 ;VI-CHECK: flat_load_dword
 683 ;SI-CHECK: buffer_load_dword
 684 ;VI-CHECK: flat_load_dword
 685 define amdgpu_ps float @test_wwm6_then() {
 686 ; GFX9-W64-LABEL: test_wwm6_then:
 687 ; GFX9-W64:       ; %bb.0: ; %main_body
 688 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 689 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
 690 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 691 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 692 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 693 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 694 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 695 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 696 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 697 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB16_2
 698 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
 699 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
 700 ; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
 701 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 702 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
 703 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 704 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 705 ; GFX9-W64-NEXT:  .LBB16_2: ; %endif
 706 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
 707 ; GFX9-W64-NEXT:    ; return to shader part epilog
 708 ;
 709 ; GFX10-W32-LABEL: test_wwm6_then:
 710 ; GFX10-W32:       ; %bb.0: ; %main_body
 711 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
 712 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
 713 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 714 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 715 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 716 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 717 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 718 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 719 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 720 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB16_2
 721 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
 722 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
 723 ; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
 724 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 725 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
 726 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
 727 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 728 ; GFX10-W32-NEXT:  .LBB16_2: ; %endif
 729 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 730 ; GFX10-W32-NEXT:    ; return to shader part epilog
 731 main_body:
 732   %src0 = load volatile float, ptr addrspace(1) undef  ; feeds the WWM fadd in %if; checks expect this entry-block load inside its own s_or_saveexec ..., -1 region
 733   ; use mbcnt to make sure the branch is divergent
 734   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 735   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 736   %cc = icmp uge i32 %hi, 16
 737   br i1 %cc, label %endif, label %if
 738
 739 if:
 740   %src1 = load volatile float, ptr addrspace(1) undef
 741   %out = fadd float %src0, %src1
 742   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)  ; checks expect the %src1 load and the add between s_or_saveexec ..., -1 and the exec restore in %if
 743   br label %endif
 744
 745 endif:
 746   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
 747   ret float %out.1
 748 }
 749
 750 ; Check that WWM is turned on correctly across basic block boundaries.
 751 ; loop version. NOTE(review): the SI-CHECK/VI-CHECK lines below use prefixes that the RUN lines at the top of the file do not enable; presumably stale - confirm.
 752 ;SI-CHECK: buffer_load_dword
 753 ;VI-CHECK: flat_load_dword
 754 ;SI-CHECK: buffer_load_dword
 755 ;VI-CHECK: flat_load_dword
 756 define amdgpu_ps float @test_wwm6_loop() {
 757 ; GFX9-W64-LABEL: test_wwm6_loop:
 758 ; GFX9-W64:       ; %bb.0: ; %main_body
 759 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 760 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
 761 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 762 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 763 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 764 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 765 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
 766 ; GFX9-W64-NEXT:  .LBB17_1: ; %loop
 767 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 768 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
 769 ; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
 770 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 771 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 772 ; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
 773 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 774 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
 775 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
 776 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 777 ; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 778 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 779 ; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 780 ; GFX9-W64-NEXT:    s_cbranch_execnz .LBB17_1
 781 ; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
 782 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
 783 ; GFX9-W64-NEXT:    ; return to shader part epilog
 784 ;
 785 ; GFX10-W32-LABEL: test_wwm6_loop:
 786 ; GFX10-W32:       ; %bb.0: ; %main_body
 787 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
 788 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
 789 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 790 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 791 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 792 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
 793 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
 794 ; GFX10-W32-NEXT:  .LBB17_1: ; %loop
 795 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 796 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
 797 ; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
 798 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 799 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
 800 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
 801 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
 802 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
 803 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
 804 ; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 805 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
 806 ; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
 807 ; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 808 ; GFX10-W32-NEXT:    s_cbranch_execnz .LBB17_1
 809 ; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
 810 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 811 ; GFX10-W32-NEXT:    ; return to shader part epilog
 812 main_body:
 813   %src0 = load volatile float, ptr addrspace(1) undef  ; feeds the WWM fadd in %loop; checks expect this entry-block load inside its own s_or_saveexec ..., -1 region
 814   ; use mbcnt to make sure the branch is divergent
 815   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
 816   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 817   br label %loop
 818
 819 loop:
 820   %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
 821   %src1 = load volatile float, ptr addrspace(1) undef
 822   %out = fadd float %src0, %src1
 823   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)  ; checks expect the %src1 load and the add each wrapped in s_or_saveexec ..., -1 / exec restore
 824   %counter.1 = sub i32 %counter, 1  ; checks expect the counter update after an exec restore, i.e. outside the WWM regions
 825   %cc = icmp ne i32 %counter.1, 0
 826   br i1 %cc, label %loop, label %endloop
 827
 828 endloop:
 829   ret float %out.0
 830 }
 831
 832 ; Check that @llvm.amdgcn.set.inactive disables WWM.
 833 define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
 834 ; GFX9-W64-LABEL: test_wwm_set_inactive1:
 835 ; GFX9-W64:       ; %bb.0: ; %main_body
 836 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 837 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
 838 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 839 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 840 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
 841 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 842 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
 843 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 844 ; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
 845 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 846 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
 847 ; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
 848 ; GFX9-W64-NEXT:    s_endpgm
 849 ;
 850 ; GFX10-W32-LABEL: test_wwm_set_inactive1:
 851 ; GFX10-W32:       ; %bb.0: ; %main_body
 852 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 853 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
 854 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 855 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
 856 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
 857 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 858 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
 859 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
 860 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
 861 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 862 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
 863 ; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
 864 ; GFX10-W32-NEXT:    s_endpgm
 865 main_body:
 866   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
 867   %src.0 = bitcast float %src to i32
 868   %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)  ; checks expect a copy of the loaded value, then 0 written while exec is inverted (s_not / v_mov 0 / s_not), before the s_or_saveexec
 869   %out = add i32 %src.1, %src.1
 870   %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)  ; checks expect only the integer add between s_or_saveexec ..., -1 and the exec restore
 871   %out.1 = bitcast i32 %out.0 to float
 872   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
 873   ret void
 874 }
 875
 876 ; Check that Strict WQM is triggered by the strict_wqm intrinsic.
 877 define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
 878 ; GFX9-W64-LABEL: test_strict_wqm1:
 879 ; GFX9-W64:       ; %bb.0: ; %main_body
 880 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 881 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 882 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 883 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
 884 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 885 ; GFX9-W64-NEXT:    s_nop 0
 886 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 887 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 888 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
 889 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 890 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 891 ; GFX9-W64-NEXT:    ; return to shader part epilog
 892 ;
 893 ; GFX10-W32-LABEL: test_strict_wqm1:
 894 ; GFX10-W32:       ; %bb.0: ; %main_body
 895 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 896 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 897 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 898 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
 899 ; GFX10-W32-NEXT:    s_clause 0x1
 900 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 901 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 902 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 903 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
 904 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
 905 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 906 ; GFX10-W32-NEXT:    ; return to shader part epilog
 907 main_body:
 908   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 909   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 910   %out = fadd float %src0, %src1
 911   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)  ; checks expect both loads and the fadd between s_wqm on exec and the restore of the saved exec
 912   ret float %out.0
 913 }
 914
 915 ; Same as above, but with an integer type.
 916 define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
 917 ; GFX9-W64-LABEL: test_strict_wqm2:
 918 ; GFX9-W64:       ; %bb.0: ; %main_body
 919 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
 920 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 921 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 922 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
 923 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 924 ; GFX9-W64-NEXT:    s_nop 0
 925 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 926 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 927 ; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
 928 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
 929 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
 930 ; GFX9-W64-NEXT:    ; return to shader part epilog
 931 ;
 932 ; GFX10-W32-LABEL: test_strict_wqm2:
 933 ; GFX10-W32:       ; %bb.0: ; %main_body
 934 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 935 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 936 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 937 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
 938 ; GFX10-W32-NEXT:    s_clause 0x1
 939 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 940 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 941 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 942 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 943 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
 944 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 945 ; GFX10-W32-NEXT:    ; return to shader part epilog
 946 main_body:
 947   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
 948   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
 949   %src0.0 = bitcast float %src0 to i32
 950   %src1.0 = bitcast float %src1 to i32
 951   %out = add i32 %src0.0, %src1.0
 952   %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)  ; i32 variant of the intrinsic; checks expect both loads and the integer add between s_wqm on exec and the restore of the saved exec
 953   %out.1 = bitcast i32 %out.0 to float
 954   ret float %out.1
 955 }
 956
 957 ; Check that we don't leave Strict WQM on for computations that don't require it,
 958 ; since that will lead to clobbering things that aren't supposed to be clobbered
 959 ; in cases like this.
 960 ; We enforce this by checking that v_add gets emitted in the same block as
 961 ; Strict WQM computations.
 962 define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
 963 ; GFX9-W64-LABEL: test_strict_wqm3:
 964 ; GFX9-W64:       ; %bb.0: ; %main_body
 965 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 966 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 967 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
 968 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 969 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 970 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB21_2
 971 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
 972 ; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
 973 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 974 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 975 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 976 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 977 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
 978 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
 979 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
 980 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
 981 ; GFX9-W64-NEXT:  .LBB21_2: ; %endif
 982 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 983 ; GFX9-W64-NEXT:    ; return to shader part epilog
 984 ;
 985 ; GFX10-W32-LABEL: test_strict_wqm3:
 986 ; GFX10-W32:       ; %bb.0: ; %main_body
 987 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 988 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
 989 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
 990 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 991 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 992 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB21_2
 993 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
 994 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
 995 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 996 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 997 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 998 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 999 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
1000 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
1001 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1002 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
1003 ; GFX10-W32-NEXT:  .LBB21_2: ; %endif
1004 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1005 ; GFX10-W32-NEXT:    ; return to shader part epilog
1006 main_body:
1007   ; use mbcnt to make sure the branch is divergent
1008   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1009   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1010   %cc = icmp uge i32 %hi, 16
1011   br i1 %cc, label %endif, label %if
1012
1013 if:
1014   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1015   %out = fadd float %src, %src
1016   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)  ; checks expect the load and the first fadd between s_wqm on exec and the exec restore
1017   %out.1 = fadd float %src, %out.0  ; not an input to strict.wqm: checks expect this second v_add after exec is restored, still inside %if
1018   br label %endif
1019
1020 endif:
1021   %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1022   ret float %out.2
1023 }
1024
1025 ; Check that Strict WQM writes aren't coalesced with non-strict writes, since
1026 ; the Strict WQM write could clobber disabled channels in the non-strict one.
1027 ; We enforce this by checking that v_mov gets emitted in the same block as
1028 ; Strict WQM computations.
1029 define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
1030 ; GFX9-W64-LABEL: test_strict_wqm4:
1031 ; GFX9-W64:       ; %bb.0: ; %main_body
1032 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1033 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1034 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
1035 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
1036 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1037 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB22_2
1038 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
1039 ; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
1040 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1041 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
1042 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1043 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1044 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
1045 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
1046 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1047 ; GFX9-W64-NEXT:  .LBB22_2: ; %endif
1048 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
1049 ; GFX9-W64-NEXT:    ; return to shader part epilog
1050 ;
1051 ; GFX10-W32-LABEL: test_strict_wqm4:
1052 ; GFX10-W32:       ; %bb.0: ; %main_body
1053 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1054 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1055 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
1056 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
1057 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1058 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB22_2
1059 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
1060 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1061 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1062 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
1063 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1064 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1065 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
1066 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
1067 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1068 ; GFX10-W32-NEXT:  .LBB22_2: ; %endif
1069 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1070 ; GFX10-W32-NEXT:    ; return to shader part epilog
1071 main_body:
1072   ; use mbcnt to make sure the branch is divergent
1073   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1074   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1075   %cc = icmp uge i32 %hi, 16
1076   br i1 %cc, label %endif, label %if
1077
1078 if:
1079   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1080   %out = fadd float %src, %src
1081   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)  ; checks expect the result in v1 under s_wqm, then a separate v_mov into v0 after exec is restored
1082   br label %endif
1083
1084 endif:
1085   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1086   ret float %out.1
1087 }
1088
1089 ; Make sure the transition from Exact to Strict WQM then WQM works properly.
1090 define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
1091 ; GFX9-W64-LABEL: test_strict_wqm5:
1092 ; GFX9-W64:       ; %bb.0: ; %main_body
1093 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1094 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
1095 ; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1096 ; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
1097 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1098 ; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1099 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1100 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
1101 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1102 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1103 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
1104 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
1105 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1106 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1107 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
1108 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1109 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
1110 ; GFX9-W64-NEXT:    ; return to shader part epilog
1111 ;
1112 ; GFX10-W32-LABEL: test_strict_wqm5:
1113 ; GFX10-W32:       ; %bb.0: ; %main_body
1114 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
1115 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1116 ; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1117 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1118 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1119 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
1120 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1121 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1122 ; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1123 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1124 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1125 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1126 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1127 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
1128 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1129 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1130 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1131 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
1132 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1133 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
1134 ; GFX10-W32-NEXT:    ; return to shader part epilog
1135 main_body:
1136   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1137   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)  ; checks expect this store while exec still holds the incoming mask
1138   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1139   %temp = fadd float %src1, %src1
1140   %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)  ; checks expect the %src1 load and the %temp add between s_wqm on exec and the restore of the exec saved just before it
1141   %out = fadd float %temp.0, %temp.0
1142   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)  ; checks expect a second s_wqm on exec before the %out add, then s_and with the exec saved at entry
1143   ret float %out.0
1144 }
1145
1146 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1147 ; if..then..endif version. NOTE(review): the SI-CHECK/VI-CHECK lines below use prefixes that the RUN lines at the top of the file do not enable; presumably stale - confirm.
1148 ;SI-CHECK: buffer_load_dword
1149 ;VI-CHECK: flat_load_dword
1150 ;SI-CHECK: buffer_load_dword
1151 ;VI-CHECK: flat_load_dword
1152 define amdgpu_ps float @test_strict_wqm6_then() {
1153 ; GFX9-W64-LABEL: test_strict_wqm6_then:
1154 ; GFX9-W64:       ; %bb.0: ; %main_body
1155 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1156 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1157 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1158 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1159 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1160 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1161 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1162 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
1163 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
1164 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1165 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_2
1166 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
1167 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1168 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1169 ; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1170 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1171 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
1172 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1173 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1174 ; GFX9-W64-NEXT:  .LBB24_2: ; %endif
1175 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1176 ; GFX9-W64-NEXT:    ; return to shader part epilog
1177 ;
1178 ; GFX10-W32-LABEL: test_strict_wqm6_then:
1179 ; GFX10-W32:       ; %bb.0: ; %main_body
1180 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1181 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1182 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1183 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1184 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1185 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1186 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1187 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
1188 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
1189 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1190 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
1191 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
1192 ; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1193 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1194 ; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1195 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1196 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
1197 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1198 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1199 ; GFX10-W32-NEXT:  .LBB24_2: ; %endif
1200 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1201 ; GFX10-W32-NEXT:    ; return to shader part epilog
1202 main_body:
1203   %src0 = load volatile float, ptr addrspace(1) undef  ; feeds the strict.wqm fadd in %if; checks expect this entry-block load inside its own s_wqm / exec-restore region
1204   ; use mbcnt to make sure the branch is divergent
1205   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1206   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1207   %cc = icmp uge i32 %hi, 16
1208   br i1 %cc, label %endif, label %if
1209
1210 if:
1211   %src1 = load volatile float, ptr addrspace(1) undef
1212   %out = fadd float %src0, %src1
1213   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)  ; checks expect the %src1 load and the add between s_wqm on exec and the exec restore in %if
1214   br label %endif
1215
1216 endif:
1217   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1218   ret float %out.1
1219 }
1220
1221 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1222 ; loop version. NOTE(review): the SI-CHECK/VI-CHECK lines below use prefixes that the RUN lines at the top of the file do not enable; presumably stale - confirm.
1223 ;SI-CHECK: buffer_load_dword
1224 ;VI-CHECK: flat_load_dword
1225 ;SI-CHECK: buffer_load_dword
1226 ;VI-CHECK: flat_load_dword
1227 define amdgpu_ps float @test_strict_wqm6_loop() {
1228 ; GFX9-W64-LABEL: test_strict_wqm6_loop:
1229 ; GFX9-W64:       ; %bb.0: ; %main_body
1230 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1231 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1232 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1233 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1234 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1235 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1236 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
1237 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
1238 ; GFX9-W64-NEXT:  .LBB25_1: ; %loop
1239 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1240 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1241 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1242 ; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1243 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1244 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1245 ; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
1246 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1247 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1248 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1249 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
1250 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1251 ; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1252 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1253 ; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1254 ; GFX9-W64-NEXT:    s_cbranch_execnz .LBB25_1
1255 ; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
1256 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1257 ; GFX9-W64-NEXT:    ; return to shader part epilog
1258 ;
1259 ; GFX10-W32-LABEL: test_strict_wqm6_loop:
1260 ; GFX10-W32:       ; %bb.0: ; %main_body
1261 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1262 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1263 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1264 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1265 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1266 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1267 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
1268 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
1269 ; GFX10-W32-NEXT:  .LBB25_1: ; %loop
1270 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1271 ; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1272 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1273 ; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1274 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1275 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1276 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
1277 ; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1278 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1279 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
1280 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1281 ; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1282 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1283 ; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
1284 ; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
1285 ; GFX10-W32-NEXT:    s_cbranch_execnz .LBB25_1
1286 ; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
1287 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1288 ; GFX10-W32-NEXT:    ; return to shader part epilog
1289 main_body:
1290   %src0 = load volatile float, ptr addrspace(1) undef  ; feeds the strict.wqm fadd in %loop; checks expect this entry-block load inside its own s_wqm / exec-restore region
1291   ; use mbcnt to make sure the branch is divergent
1292   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1293   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1294   br label %loop
1295
1296 loop:
1297   %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
1298   %src1 = load volatile float, ptr addrspace(1) undef
1299   %out = fadd float %src0, %src1
1300   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)  ; checks expect the %src1 load and the add each wrapped in s_wqm on exec / exec restore
1301   %counter.1 = sub i32 %counter, 1  ; checks expect the counter update after an exec restore, i.e. outside the s_wqm regions
1302   %cc = icmp ne i32 %counter.1, 0
1303   br i1 %cc, label %loop, label %endloop
1304
1305 endloop:
1306   ret float %out.0
1307 }
1308
1309 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1310 define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1311 ; GFX9-W64-LABEL: test_set_inactive2:
1312 ; GFX9-W64:       ; %bb.0: ; %main_body
1313 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1314 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1315 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
1316 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s0
1317 ; GFX9-W64-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
1318 ; GFX9-W64-NEXT:    s_nop 0
1319 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
1320 ; GFX9-W64-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
1321 ; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
1322 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
1323 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1324 ; GFX9-W64-NEXT:    v_add_u32_e32 v1, v2, v1
1325 ; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1326 ; GFX9-W64-NEXT:    s_endpgm
1327 ;
1328 ; GFX10-W32-LABEL: test_set_inactive2:
1329 ; GFX10-W32:       ; %bb.0: ; %main_body
1330 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1331 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1332 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s1
1333 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
1334 ; GFX10-W32-NEXT:    s_clause 0x1
1335 ; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1336 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1337 ; GFX10-W32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
1338 ; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec
1339 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
1340 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1341 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1342 ; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1343 ; GFX10-W32-NEXT:    s_endpgm
1344 main_body:
  ; First load (indexed by %idx1). Its bits are passed through set.inactive,
  ; with undef as the value for the inactive lanes.
1345   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1346   %src1.0 = bitcast float %src1 to i32
1347   %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  ; Second load (indexed by %idx0). This is the only value explicitly wrapped
  ; in the wqm intrinsic; the expected output nevertheless issues both loads
  ; between the s_wqm and the s_and that restores the saved exec mask.
1348   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1349   %src0.0 = bitcast float %src0 to i32
1350   %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  ; Combine both values and store the sum at index %idx1; in the expected
  ; output the add and the store come after exec has been and-ed with the
  ; saved mask.
1351   %out = add i32 %src0.1, %src1.1
1352   %out.0 = bitcast i32 %out to float
1353   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1354   ret void
1355 }
1356
1357 ; Check a case of one branch of an if-else requiring WQM, the other requiring
1358 ; exact.
1359 ; Note: In this particular case, the save-and-restore could be avoided if the
1360 ; analysis understood that the two branches of the if-else are mutually
1361 ; exclusive.
1362 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1363 ; GFX9-W64-LABEL: test_control_flow_0:
1364 ; GFX9-W64:       ; %bb.0: ; %main_body
1365 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1366 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1367 ; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1368 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1369 ; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1370 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB27_2
1371 ; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1372 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1373 ; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1374 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1375 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1376 ; GFX9-W64-NEXT:  .LBB27_2: ; %Flow
1377 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
1378 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB27_4
1379 ; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1380 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1381 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1382 ; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1383 ; GFX9-W64-NEXT:  .LBB27_4: ; %END
1384 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1385 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1386 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1387 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1388 ; GFX9-W64-NEXT:    ; return to shader part epilog
1389 ;
1390 ; GFX10-W32-LABEL: test_control_flow_0:
1391 ; GFX10-W32:       ; %bb.0: ; %main_body
1392 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1393 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1394 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1395 ; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
1396 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1397 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_2
1398 ; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1399 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1400 ; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1401 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1402 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1403 ; GFX10-W32-NEXT:  .LBB27_2: ; %Flow
1404 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
1405 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_4
1406 ; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1407 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1408 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1409 ; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1410 ; GFX10-W32-NEXT:  .LBB27_4: ; %END
1411 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1412 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1413 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1414 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1415 ; GFX10-W32-NEXT:    ; return to shader part epilog
1416 main_body:
  ; %z == 0 selects the sampling branch (IF); anything else takes the store
  ; branch (ELSE).
1417   %cmp = icmp eq i32 %z, 0
1418   br i1 %cmp, label %IF, label %ELSE
1419
1420 IF:
  ; Two chained samples: component 0 of the first result is the coordinate
  ; of the second sample.
1421   %c.bc = bitcast i32 %c to float
1422   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1423   %tex0 = extractelement <4 x float> %tex, i32 0
1424   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1425   %data.if = extractelement <4 x float> %dtex, i32 0
1426   br label %END
1427
1428 ELSE:
  ; Buffer store of %data at index %c. The expected output brackets this
  ; store with s_and_saveexec against the exec mask saved at entry and then
  ; restores exec, which is the save-and-restore the note above refers to.
1429   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1430   br label %END
1431
1432 END:
  ; Result is the sampled value from IF or the unmodified %data from ELSE.
1433   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1434   ret float %r
1435 }
1436
1437 ; Reverse branch order compared to the previous test.
1438 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1439 ; GFX9-W64-LABEL: test_control_flow_1:
1440 ; GFX9-W64:       ; %bb.0: ; %main_body
1441 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1442 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1443 ; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1444 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1445 ; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1446 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB28_2
1447 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1448 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1449 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1450 ; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1451 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1452 ; GFX9-W64-NEXT:  .LBB28_2: ; %Flow
1453 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], s[14:15]
1454 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1455 ; GFX9-W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
1456 ; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
1457 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB28_4
1458 ; GFX9-W64-NEXT:  ; %bb.3: ; %ELSE
1459 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1460 ; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1461 ; GFX9-W64-NEXT:  .LBB28_4: ; %END
1462 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1463 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1464 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1465 ; GFX9-W64-NEXT:    ; return to shader part epilog
1466 ;
1467 ; GFX10-W32-LABEL: test_control_flow_1:
1468 ; GFX10-W32:       ; %bb.0: ; %main_body
1469 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1470 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1471 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1472 ; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
1473 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1474 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB28_2
1475 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1476 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1477 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1478 ; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1479 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1480 ; GFX10-W32-NEXT:  .LBB28_2: ; %Flow
1481 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, s13
1482 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1483 ; GFX10-W32-NEXT:    s_and_b32 s0, exec_lo, s0
1484 ; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1485 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB28_4
1486 ; GFX10-W32-NEXT:  ; %bb.3: ; %ELSE
1487 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1488 ; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1489 ; GFX10-W32-NEXT:  .LBB28_4: ; %END
1490 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1491 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1492 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1493 ; GFX10-W32-NEXT:    ; return to shader part epilog
1494 main_body:
  ; Here %z == 0 selects the store branch (ELSE) and any other value the
  ; sampling branch (IF). In the expected output the sampling block is
  ; emitted first and exec is and-ed with the saved mask in the Flow block,
  ; before the store block.
1495   %cmp = icmp eq i32 %z, 0
1496   br i1 %cmp, label %ELSE, label %IF
1497
1498 IF:
  ; Two chained samples: component 0 of the first result is the coordinate
  ; of the second sample.
1499   %c.bc = bitcast i32 %c to float
1500   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1501   %tex0 = extractelement <4 x float> %tex, i32 0
1502   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1503   %data.if = extractelement <4 x float> %dtex, i32 0
1504   br label %END
1505
1506 ELSE:
  ; Buffer store of %data at index %c.
1507   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1508   br label %END
1509
1510 END:
  ; Result is the sampled value from IF or the unmodified %data from ELSE.
1511   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1512   ret float %r
1513 }
1514
1515 ; Check that branch conditions are properly marked as needing WQM...
1516 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1517 ; GFX9-W64-LABEL: test_control_flow_2:
1518 ; GFX9-W64:       ; %bb.0: ; %main_body
1519 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1520 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1521 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1522 ; GFX9-W64-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1523 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1524 ; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1525 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1526 ; GFX9-W64-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1527 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1528 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
1529 ; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
1530 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1531 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1532 ; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1533 ; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1534 ; GFX9-W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1535 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr5
1536 ; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
1537 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
1538 ; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1539 ; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
1540 ; GFX9-W64-NEXT:  ; %bb.4: ; %END
1541 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1542 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1543 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1544 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1545 ; GFX9-W64-NEXT:    ; return to shader part epilog
1546 ;
1547 ; GFX10-W32-LABEL: test_control_flow_2:
1548 ; GFX10-W32:       ; %bb.0: ; %main_body
1549 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1550 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1551 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1552 ; GFX10-W32-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1553 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1554 ; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1555 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1556 ; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1557 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1558 ; GFX10-W32-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1559 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1560 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1561 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1562 ; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1563 ; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1564 ; GFX10-W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1565 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr5
1566 ; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
1567 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
1568 ; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1569 ; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
1570 ; GFX10-W32-NEXT:  ; %bb.4: ; %END
1571 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1572 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1573 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1574 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1575 ; GFX10-W32-NEXT:    ; return to shader part epilog
1576 main_body:
  ; First store: element 0 of %data written at element 0 of %idx.
1577   %idx.1 = extractelement <3 x i32> %idx, i32 0
1578   %data.1 = extractelement <2 x float> %data, i32 0
1579   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1580
1581   ; The load that determines the branch (and should therefore be WQM) is
1582   ; surrounded by stores that require disabled WQM.
1583   %idx.2 = extractelement <3 x i32> %idx, i32 1
1584   %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0)
1585
  ; Second store: element 1 of %data written at element 2 of %idx.
1586   %idx.3 = extractelement <3 x i32> %idx, i32 2
1587   %data.3 = extractelement <2 x float> %data, i32 1
1588   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0)
1589
  ; Branch on the loaded value. Both successors compute the coordinate that
  ; the image sample in END consumes, so the sample depends on this branch.
1590   %cc = fcmp ogt float %z, 0.0
1591   br i1 %cc, label %IF, label %ELSE
1592
1593 IF:
1594   %coord.IF = mul i32 %coord, 3
1595   br label %END
1596
1597 ELSE:
1598   %coord.ELSE = mul i32 %coord, 4
1599   br label %END
1600
1601 END:
  ; Sample with the coordinate selected by the branch above.
1602   %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1603   %coord.END.bc = bitcast i32 %coord.END to float
1604   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1605   ret <4 x float> %tex
1606 }
1607
1608 ; ... but only if they really do need it.
1609 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1610 ; GFX9-W64-LABEL: test_control_flow_3:
1611 ; GFX9-W64:       ; %bb.0: ; %main_body
1612 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1613 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1614 ; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1615 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1616 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1617 ; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1618 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1619 ; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v1
1620 ; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1621 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1622 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1623 ; GFX9-W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1624 ; GFX9-W64-NEXT:    s_cbranch_execnz .LBB30_3
1625 ; GFX9-W64-NEXT:  ; %bb.1: ; %Flow
1626 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1627 ; GFX9-W64-NEXT:    s_cbranch_execnz .LBB30_4
1628 ; GFX9-W64-NEXT:  .LBB30_2: ; %END
1629 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1630 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1631 ; GFX9-W64-NEXT:    s_branch .LBB30_5
1632 ; GFX9-W64-NEXT:  .LBB30_3: ; %ELSE
1633 ; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1634 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr1
1635 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1636 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB30_2
1637 ; GFX9-W64-NEXT:  .LBB30_4: ; %IF
1638 ; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1639 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1640 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1641 ; GFX9-W64-NEXT:    s_branch .LBB30_5
1642 ; GFX9-W64-NEXT:  .LBB30_5:
1643 ;
1644 ; GFX10-W32-LABEL: test_control_flow_3:
1645 ; GFX10-W32:       ; %bb.0: ; %main_body
1646 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1647 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1648 ; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1649 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1650 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1651 ; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1652 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1653 ; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1654 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1655 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1656 ; GFX10-W32-NEXT:    v_cmpx_nlt_f32_e32 0, v1
1657 ; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
1658 ; GFX10-W32-NEXT:    s_cbranch_execnz .LBB30_3
1659 ; GFX10-W32-NEXT:  ; %bb.1: ; %Flow
1660 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
1661 ; GFX10-W32-NEXT:    s_cbranch_execnz .LBB30_4
1662 ; GFX10-W32-NEXT:  .LBB30_2: ; %END
1663 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1664 ; GFX10-W32-NEXT:    s_branch .LBB30_5
1665 ; GFX10-W32-NEXT:  .LBB30_3: ; %ELSE
1666 ; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1667 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr1
1668 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
1669 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB30_2
1670 ; GFX10-W32-NEXT:  .LBB30_4: ; %IF
1671 ; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1672 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1673 ; GFX10-W32-NEXT:    s_branch .LBB30_5
1674 ; GFX10-W32-NEXT:  .LBB30_5:
1675 main_body:
  ; Two chained samples, then a store of the second sample's component 0 at
  ; index %idx.
1676   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1677   %tex0 = extractelement <4 x float> %tex, i32 0
1678   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1679   %dtex.1 = extractelement <4 x float> %dtex, i32 0
1680   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1681
  ; The branch only selects between two multiplies of the sampled value; no
  ; image sample follows it. In the expected output the compare and the
  ; branch are emitted after exec has been and-ed with the saved mask.
1682   %cc = fcmp ogt float %dtex.1, 0.0
1683   br i1 %cc, label %IF, label %ELSE
1684
1685 IF:
1686   %tex.IF = fmul float %dtex.1, 3.0
1687   br label %END
1688
1689 ELSE:
1690   %tex.ELSE = fmul float %dtex.1, 4.0
1691   br label %END
1692
1693 END:
1694   %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1695   ret float %tex.END
1696 }
1697
1698 ; Another test that failed at some point because of terminator handling.
1699 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1700 ; GFX9-W64-LABEL: test_control_flow_4:
1701 ; GFX9-W64:       ; %bb.0: ; %main_body
1702 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1703 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1704 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1705 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1706 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_2
1707 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1708 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1709 ; GFX9-W64-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1710 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 1
1711 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1712 ; GFX9-W64-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1713 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1714 ; GFX9-W64-NEXT:  .LBB31_2: ; %END
1715 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1716 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1717 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1718 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1719 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1720 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1721 ; GFX9-W64-NEXT:    ; return to shader part epilog
1722 ;
1723 ; GFX10-W32-LABEL: test_control_flow_4:
1724 ; GFX10-W32:       ; %bb.0: ; %main_body
1725 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1726 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1727 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1728 ; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
1729 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_2
1730 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1731 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1732 ; GFX10-W32-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1733 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 1
1734 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1735 ; GFX10-W32-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1736 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1737 ; GFX10-W32-NEXT:  .LBB31_2: ; %END
1738 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1739 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1740 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1741 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1742 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1743 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1744 ; GFX10-W32-NEXT:    ; return to shader part epilog
1745 main_body:
  ; Conditionally run the load/store block; both paths rejoin at END.
1746   %cond = icmp eq i32 %y, 0
1747   br i1 %cond, label %IF, label %END
1748
1749 IF:
  ; Copy one dword: raw buffer load at offset 0, struct buffer store at index
  ; 1. The expected output wraps this block in s_and_saveexec against the exec
  ; mask saved at entry and restores exec before END.
1750   %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1751   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
1752   br label %END
1753
1754 END:
  ; Two chained samples: component 0 of the first result is the coordinate
  ; of the second, whose full result is returned.
1755   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1756   %tex0 = extractelement <4 x float> %tex, i32 0
1757   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1758   ret <4 x float> %dtex
1759 }
1760
1761 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
1762 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1763 ; GFX9-W64-LABEL: test_kill_0:
1764 ; GFX9-W64:       ; %bb.0: ; %main_body
1765 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1766 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1767 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1768 ; GFX9-W64-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1769 ; GFX9-W64-NEXT:    s_nop 0
1770 ; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1771 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1772 ; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v6
1773 ; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1774 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB32_2
1775 ; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1776 ; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1777 ; GFX9-W64-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1778 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1779 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1780 ; GFX9-W64-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1781 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1782 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v7, v11
1783 ; GFX9-W64-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1784 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v8, v12
1785 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v9, v13
1786 ; GFX9-W64-NEXT:    v_add_f32_e32 v3, v10, v14
1787 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1788 ; GFX9-W64-NEXT:    s_branch .LBB32_3
1789 ; GFX9-W64-NEXT:  .LBB32_2:
1790 ; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1791 ; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1792 ; GFX9-W64-NEXT:    s_endpgm
1793 ; GFX9-W64-NEXT:  .LBB32_3:
1794 ;
1795 ; GFX10-W32-LABEL: test_kill_0:
1796 ; GFX10-W32:       ; %bb.0: ; %main_body
1797 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1798 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1799 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1800 ; GFX10-W32-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1801 ; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1802 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1803 ; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1804 ; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1805 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB32_2
1806 ; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1807 ; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1808 ; GFX10-W32-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1809 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1810 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1811 ; GFX10-W32-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1812 ; GFX10-W32-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1813 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1814 ; GFX10-W32-NEXT:    v_add_f32_e32 v4, v8, v12
1815 ; GFX10-W32-NEXT:    v_add_f32_e32 v5, v10, v14
1816 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v7, v11
1817 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v9, v13
1818 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v4
1819 ; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v5
1820 ; GFX10-W32-NEXT:    s_branch .LBB32_3
1821 ; GFX10-W32-NEXT:  .LBB32_2:
1822 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1823 ; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1824 ; GFX10-W32-NEXT:    s_endpgm
1825 ; GFX10-W32-NEXT:  .LBB32_3:
1826 main_body:
  ; Before the kill: one sample whose result is used only by the final add,
  ; and a store of element 0 of %data at element 0 of %idx.
1827   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1828   %idx.0 = extractelement <2 x i32> %idx, i32 0
1829   %data.0 = extractelement <2 x float> %data, i32 0
1830   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0)
1831
  ; Kill driven by %z < 0.0. In the expected output the compare is emitted
  ; after a second s_wqm, and its result is and-not-ed into both the saved
  ; mask and exec.
1832   %z.cmp = fcmp olt float %z, 0.0
1833   call void @llvm.amdgcn.kill(i1 %z.cmp)
1834
  ; After the kill: a second store, then two chained samples starting from
  ; %coord2. The first of these samples is issued before exec is and-ed with
  ; the updated saved mask in the expected output.
1835   %idx.1 = extractelement <2 x i32> %idx, i32 1
1836   %data.1 = extractelement <2 x float> %data, i32 1
1837   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1838   %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1839   %tex2.0 = extractelement <4 x float> %tex2, i32 0
1840   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1841   %out = fadd <4 x float> %tex, %dtex
1842
1843   ret <4 x float> %out
1844 }
1845
1846 ; ... but only if WQM is necessary.
1847 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1848 ; GFX9-W64-LABEL: test_kill_1:
1849 ; GFX9-W64:       ; %bb.0: ; %main_body
1850 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1851 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v2
1852 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1853 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v0
1854 ; GFX9-W64-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1855 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1856 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1857 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1858 ; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v4
1859 ; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1860 ; GFX9-W64-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1861 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB33_2
1862 ; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1863 ; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1864 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1865 ; GFX9-W64-NEXT:    s_branch .LBB33_3
1866 ; GFX9-W64-NEXT:  .LBB33_2:
1867 ; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1868 ; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1869 ; GFX9-W64-NEXT:    s_endpgm
1870 ; GFX9-W64-NEXT:  .LBB33_3:
1871 ;
1872 ; GFX10-W32-LABEL: test_kill_1:
1873 ; GFX10-W32:       ; %bb.0: ; %main_body
1874 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1875 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v2
1876 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1877 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v0
1878 ; GFX10-W32-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1879 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1880 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1881 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1882 ; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1883 ; GFX10-W32-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1884 ; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1885 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB33_2
1886 ; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1887 ; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1888 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1889 ; GFX10-W32-NEXT:    s_branch .LBB33_3
1890 ; GFX10-W32-NEXT:  .LBB33_2:
1891 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1892 ; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1893 ; GFX10-W32-NEXT:    s_endpgm
1894 ; GFX10-W32-NEXT:  .LBB33_3:
1895 main_body:
  ; Both samples come before the kill: component 0 of the first result is the
  ; coordinate of the second, whose full result is returned.
1896   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1897   %tex0 = extractelement <4 x float> %tex, i32 0
1898   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1899
  ; Raw buffer store of %data at offset 0.
1900   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1901
  ; Kill driven by %z < 0.0. No sample follows it, and the expected output
  ; emits the compare without entering WQM again.
1902   %z.cmp = fcmp olt float %z, 0.0
1903   call void @llvm.amdgcn.kill(i1 %z.cmp)
1904
1905   ret <4 x float> %dtex
1906 }
1907
1908 ; Check prolog shaders.
1909 define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1910 ; GFX9-W64-LABEL: test_prolog_1:
1911 ; GFX9-W64:       ; %bb.0: ; %main_body
1912 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1913 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1914 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
1915 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1916 ; GFX9-W64-NEXT:    ; return to shader part epilog
1917 ;
1918 ; GFX10-W32-LABEL: test_prolog_1:
1919 ; GFX10-W32:       ; %bb.0: ; %main_body
1920 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1921 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1922 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
1923 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1924 ; GFX10-W32-NEXT:    ; return to shader part epilog
1925 main_body:
  ; The body is a single fadd with no sample or wqm intrinsic, yet the
  ; expected output runs the add between s_wqm and the s_and that restores
  ; the saved exec mask.
  ; NOTE(review): presumably attribute group #5 (defined outside this part of
  ; the file) is what requests this - confirm against its definition.
1926   %s = fadd float %a, %b
1927   ret float %s
1928 }
1929
; test_loop_vcc: an image store of %in followed by a counted loop whose body
; samples an image using element 0 of the loop-carried vector. The expected
; code restores the saved exec mask (s_and) only around the image_store and
; again at %break; the loop itself is emitted after a second s_wqm. The loop
; exit is driven by s_cbranch_vccz on a v_cmp against 0x40e00000 (the 7.0 used
; by the fcmp in %loop).
1930 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1931 ; GFX9-W64-LABEL: test_loop_vcc:
1932 ; GFX9-W64:       ; %bb.0: ; %entry
1933 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1934 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1935 ; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
1936 ; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
1937 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
1938 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
1939 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1940 ; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1941 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1942 ; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
1943 ; GFX9-W64-NEXT:    s_mov_b32 s4, 0x40e00000
1944 ; GFX9-W64-NEXT:    s_branch .LBB35_2
1945 ; GFX9-W64-NEXT:  .LBB35_1: ; %body
1946 ; GFX9-W64-NEXT:    ; in Loop: Header=BB35_2 Depth=1
1947 ; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1948 ; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
1949 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_4
1950 ; GFX9-W64-NEXT:  .LBB35_2: ; %loop
1951 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1952 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1953 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
1954 ; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v8
1955 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
1956 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
1957 ; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
1958 ; GFX9-W64-NEXT:    s_cbranch_vccz .LBB35_1
1959 ; GFX9-W64-NEXT:  ; %bb.3:
1960 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1961 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
1962 ; GFX9-W64-NEXT:  .LBB35_4: ; %break
1963 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1964 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1965 ; GFX9-W64-NEXT:    ; return to shader part epilog
1966 ;
1967 ; GFX10-W32-LABEL: test_loop_vcc:
1968 ; GFX10-W32:       ; %bb.0: ; %entry
1969 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1970 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1971 ; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
1972 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1973 ; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1974 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1975 ; GFX10-W32-NEXT:    s_branch .LBB35_2
1976 ; GFX10-W32-NEXT:    .p2align 6
1977 ; GFX10-W32-NEXT:  .LBB35_1: ; %body
1978 ; GFX10-W32-NEXT:    ; in Loop: Header=BB35_2 Depth=1
1979 ; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1980 ; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
1981 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_4
1982 ; GFX10-W32-NEXT:  .LBB35_2: ; %loop
1983 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1984 ; GFX10-W32-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1985 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1986 ; GFX10-W32-NEXT:    v_mov_b32_e32 v7, v3
1987 ; GFX10-W32-NEXT:    v_mov_b32_e32 v6, v2
1988 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v1
1989 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
1990 ; GFX10-W32-NEXT:    s_cbranch_vccz .LBB35_1
1991 ; GFX10-W32-NEXT:  ; %bb.3:
1992 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1993 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
1994 ; GFX10-W32-NEXT:  .LBB35_4: ; %break
1995 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1996 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1997 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
1998 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
1999 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v6
2000 ; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v7
2001 ; GFX10-W32-NEXT:    ; return to shader part epilog
2002 entry:
2003   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
2004   br label %loop
2005
2006 loop:
2007   %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
2008   %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
2009   %cc = fcmp ogt float %ctr.iv, 7.0
2010   br i1 %cc, label %break, label %body
2011
2012 body:
2013   %c.iv0 = extractelement <4 x float> %c.iv, i32 0
2014   %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2015   %ctr.next = fadd float %ctr.iv, 2.0
2016   br label %loop
2017
2018 break:
2019   ret <4 x float> %c.iv
2020 }
2021
2022 ; Only intrinsic stores need exact execution -- other stores do not have
2023 ; externally visible effects and may require WQM for correctness.
; test_alloca: interleaves buffer-store intrinsics with a volatile store to, and
; an indexed load from, an addrspace(5) alloca; the loaded value is then used as
; the coordinate of an image sample whose result is stored with another
; intrinsic. In the expected code the first intrinsic store (through s[0:3]) is
; bracketed by s_and / s_wqm, the scratch store and load (through s[8:11]) are
; issued after that s_wqm, and exec is ANDed with the saved mask again before
; the image_sample and the remaining intrinsic stores.
2024 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
2025 ; GFX9-W64-LABEL: test_alloca:
2026 ; GFX9-W64:       ; %bb.0: ; %entry
2027 ; GFX9-W64-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2028 ; GFX9-W64-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2029 ; GFX9-W64-NEXT:    s_mov_b32 s10, -1
2030 ; GFX9-W64-NEXT:    s_mov_b32 s11, 0xe00000
2031 ; GFX9-W64-NEXT:    s_add_u32 s8, s8, s0
2032 ; GFX9-W64-NEXT:    s_addc_u32 s9, s9, 0
2033 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2034 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2035 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2036 ; GFX9-W64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2037 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2038 ; GFX9-W64-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
2039 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2040 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 4
2041 ; GFX9-W64-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
2042 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[8:11], 0 offen
2043 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2044 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2045 ; GFX9-W64-NEXT:    image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf
2046 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1
2047 ; GFX9-W64-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 idxen
2048 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
2049 ; GFX9-W64-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2050 ; GFX9-W64-NEXT:    s_endpgm
2051 ;
2052 ; GFX10-W32-LABEL: test_alloca:
2053 ; GFX10-W32:       ; %bb.0: ; %entry
2054 ; GFX10-W32-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2055 ; GFX10-W32-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2056 ; GFX10-W32-NEXT:    s_mov_b32 s10, -1
2057 ; GFX10-W32-NEXT:    s_mov_b32 s11, 0x31c16000
2058 ; GFX10-W32-NEXT:    s_add_u32 s8, s8, s0
2059 ; GFX10-W32-NEXT:    s_addc_u32 s9, s9, 0
2060 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2061 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2062 ; GFX10-W32-NEXT:    v_lshl_add_u32 v2, v2, 2, 4
2063 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2064 ; GFX10-W32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2065 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2066 ; GFX10-W32-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
2067 ; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2068 ; GFX10-W32-NEXT:    buffer_load_dword v1, v2, s[8:11], 0 offen
2069 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2070 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2071 ; GFX10-W32-NEXT:    image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2072 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1
2073 ; GFX10-W32-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 idxen
2074 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2075 ; GFX10-W32-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2076 ; GFX10-W32-NEXT:    s_endpgm
2077 entry:
2078   %array = alloca [32 x i32], align 4, addrspace(5)
2079
2080   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2081
2082   store volatile i32 %a, ptr addrspace(5) %array, align 4
2083
2084   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
2085
2086   %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
2087   %c = load i32, ptr addrspace(5) %c.gep, align 4
2088   %c.bc = bitcast i32 %c to float
2089   %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2090   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2091
2092   ret void
2093 }
2094
2095 ; Must return to exact at the end of a non-void returning shader,
2096 ; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2097 ; even if the shader has no kills, because a kill could have happened in a
2098 ; previous shader fragment.
; test_nonvoid_return: two chained image samples (the second takes element 0 of
; the first as its coordinate); the second result is returned. The expected
; code places only the first image_sample between s_wqm and the s_and that
; restores the saved exec mask; the second sample and the return follow it.
2099 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2100 ; GFX9-W64-LABEL: test_nonvoid_return:
2101 ; GFX9-W64:       ; %bb.0:
2102 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2103 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2104 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2105 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2106 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2107 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2108 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2109 ; GFX9-W64-NEXT:    ; return to shader part epilog
2110 ;
2111 ; GFX10-W32-LABEL: test_nonvoid_return:
2112 ; GFX10-W32:       ; %bb.0:
2113 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2114 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2115 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2116 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2117 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2118 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2119 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2120 ; GFX10-W32-NEXT:    ; return to shader part epilog
2121   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2122   %tex0 = extractelement <4 x float> %tex, i32 0
2123   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2124   ret <4 x float> %dtex
2125 }
2126
; test_nonvoid_return_unreachable: two chained image samples followed by a
; branch on the SGPR argument %c. %if performs a volatile global store and ends
; in 'unreachable'; %else returns the second sample. The expected code has no
; saved exec copy: after the first image_sample exec is ANDed with itself
; (s_and exec, exec, exec), and that happens before the s_cmp / s_cbranch_scc0
; that selects between %if and %else.
2127 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2128 ; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2129 ; GFX9-W64:       ; %bb.0: ; %entry
2130 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2131 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2132 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, exec
2133 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2134 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2135 ; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2136 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB38_2
2137 ; GFX9-W64-NEXT:  ; %bb.1: ; %else
2138 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2139 ; GFX9-W64-NEXT:    s_branch .LBB38_3
2140 ; GFX9-W64-NEXT:  .LBB38_2: ; %if
2141 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2142 ; GFX9-W64-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2143 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2144 ; GFX9-W64-NEXT:  .LBB38_3:
2145 ;
2146 ; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2147 ; GFX10-W32:       ; %bb.0: ; %entry
2148 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2149 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2150 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, exec_lo
2151 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2152 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2153 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2154 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB38_2
2155 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
2156 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2157 ; GFX10-W32-NEXT:    s_branch .LBB38_3
2158 ; GFX10-W32-NEXT:  .LBB38_2: ; %if
2159 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2160 ; GFX10-W32-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2161 ; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2162 ; GFX10-W32-NEXT:  .LBB38_3:
2163 entry:
2164   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2165   %tex0 = extractelement <4 x float> %tex, i32 0
2166   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2167   %cc = icmp sgt i32 %c, 0
2168   br i1 %cc, label %if, label %else
2169
2170 if:
2171   store volatile <4 x float> %dtex, ptr addrspace(1) undef
2172   unreachable
2173
2174 else:
2175   ret <4 x float> %dtex
2176 }
2177
2178 ; Test awareness that s_wqm_b64 clobbers SCC.
; test_scc: branches on the SGPR argument %sel between a 1D sample (%if) and a
; 2D sample (%else), then performs a struct buffer store in %end and returns
; the sampled value. In the expected code the s_cmp_lt_i32 that feeds
; s_cbranch_scc0 is emitted after the s_wqm, and exec is ANDed with the saved
; mask at %end before the buffer_store.
; NOTE(review): attribute group #1 is defined outside this chunk.
2179 define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2180 ; GFX9-W64-LABEL: test_scc:
2181 ; GFX9-W64:       ; %bb.0: ; %main_body
2182 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2183 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
2184 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2185 ; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2186 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB39_2
2187 ; GFX9-W64-NEXT:  ; %bb.1: ; %else
2188 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2189 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
2190 ; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
2191 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB39_3
2192 ; GFX9-W64-NEXT:    s_branch .LBB39_4
2193 ; GFX9-W64-NEXT:  .LBB39_2:
2194 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2195 ; GFX9-W64-NEXT:  .LBB39_3: ; %if
2196 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2197 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2198 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2199 ; GFX9-W64-NEXT:  .LBB39_4: ; %end
2200 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2201 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
2202 ; GFX9-W64-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2203 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2204 ; GFX9-W64-NEXT:    ; return to shader part epilog
2205 ;
2206 ; GFX10-W32-LABEL: test_scc:
2207 ; GFX10-W32:       ; %bb.0: ; %main_body
2208 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
2209 ; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
2210 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2211 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2212 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB39_2
2213 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
2214 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2215 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
2216 ; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2217 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_3
2218 ; GFX10-W32-NEXT:    s_branch .LBB39_4
2219 ; GFX10-W32-NEXT:  .LBB39_2:
2220 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2221 ; GFX10-W32-NEXT:  .LBB39_3: ; %if
2222 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2223 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2224 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2225 ; GFX10-W32-NEXT:  .LBB39_4: ; %end
2226 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
2227 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
2228 ; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2229 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2230 ; GFX10-W32-NEXT:    ; return to shader part epilog
2231 main_body:
2232   %cc = icmp sgt i32 %sel, 0
2233   br i1 %cc, label %if, label %else
2234
2235 if:
2236   %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2237   br label %end
2238
2239 else:
2240   %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2241   br label %end
2242
2243 end:
2244   %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
2245   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2246   ret <4 x float> %r
2247 }
2248
2249 ; Check a case of a block being entirely WQM except for a bit of WWM.
2250 ; There was a bug where it forgot to enter and leave WWM.
; test_wwm_within_wqm: inside %IF two chained image samples feed the sequence
; set.inactive -> ds.swizzle -> wwm, whose result is converted back to float
; and merged with 0.0 at %ENDIF. The expected code writes the inactive-lane
; value (v_mov v2, 0) between a pair of s_not on exec, and wraps the ds_swizzle
; in s_or_saveexec -1 / s_mov exec. All of this sits between the s_wqm at
; function entry and the final s_and with the saved mask at %ENDIF.
2251 define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2252 ; GFX9-W64-LABEL: test_wwm_within_wqm:
2253 ; GFX9-W64:       ; %bb.0: ; %main_body
2254 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2255 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2256 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2257 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2258 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2259 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB40_2
2260 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2261 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2262 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2263 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2264 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2265 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
2266 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2267 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
2268 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
2269 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
2270 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2271 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2272 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2273 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2274 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2275 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2276 ; GFX9-W64-NEXT:  .LBB40_2: ; %ENDIF
2277 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2278 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2279 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2280 ; GFX9-W64-NEXT:    ; return to shader part epilog
2281 ;
2282 ; GFX10-W32-LABEL: test_wwm_within_wqm:
2283 ; GFX10-W32:       ; %bb.0: ; %main_body
2284 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2285 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2286 ; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2287 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2288 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2289 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB40_2
2290 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2291 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2292 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2293 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2294 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2295 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
2296 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2297 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2298 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
2299 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2300 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2301 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2302 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2303 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2304 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2305 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2306 ; GFX10-W32-NEXT:  .LBB40_2: ; %ENDIF
2307 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2308 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2309 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2310 ; GFX10-W32-NEXT:    ; return to shader part epilog
2311 main_body:
2312   %cmp = icmp eq i32 %z, 0
2313   br i1 %cmp, label %IF, label %ENDIF
2314
2315 IF:
2316   %c.bc = bitcast i32 %c to float
2317   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2318   %tex0 = extractelement <4 x float> %tex, i32 0
2319   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2320   %dataf = extractelement <4 x float> %dtex, i32 0
2321   %data1 = fptosi float %dataf to i32
2322   %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2323   %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2324   %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2325   %data4f = sitofp i32 %data4 to float
2326   br label %ENDIF
2327
2328 ENDIF:
2329   %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2330   ret float %r
2331 }
2332
2333 ; Check that WWM is triggered by the strict_wwm intrinsic.
; test_strict_wwm1: adds the results of two struct buffer loads and passes the
; sum through llvm.amdgcn.strict.wwm.f32. The expected code places both loads
; and the v_add_f32 between s_or_saveexec -1 and the s_mov that restores exec;
; only the final copy into v0 comes after the restore.
2334 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2335 ; GFX9-W64-LABEL: test_strict_wwm1:
2336 ; GFX9-W64:       ; %bb.0: ; %main_body
2337 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2338 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2339 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2340 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2341 ; GFX9-W64-NEXT:    s_nop 0
2342 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2343 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2344 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2345 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2346 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2347 ; GFX9-W64-NEXT:    ; return to shader part epilog
2348 ;
2349 ; GFX10-W32-LABEL: test_strict_wwm1:
2350 ; GFX10-W32:       ; %bb.0: ; %main_body
2351 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2352 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2353 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2354 ; GFX10-W32-NEXT:    s_clause 0x1
2355 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2356 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2357 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2358 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2359 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2360 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2361 ; GFX10-W32-NEXT:    ; return to shader part epilog
2362 main_body:
2363   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2364   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2365   %out = fadd float %src0, %src1
2366   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2367   ret float %out.0
2368 }
2369
2370 ; Same as above, but with an integer type.
; test_strict_wwm2: two struct buffer loads are bitcast to i32, added, passed
; through llvm.amdgcn.strict.wwm.i32 and bitcast back to float for the return.
; The expected code keeps the loads and the integer add (v_add_u32 on GFX9,
; v_add_nc_u32 on GFX10) between s_or_saveexec -1 and the s_mov restoring exec.
2371 define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2372 ; GFX9-W64-LABEL: test_strict_wwm2:
2373 ; GFX9-W64:       ; %bb.0: ; %main_body
2374 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2375 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2376 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2377 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2378 ; GFX9-W64-NEXT:    s_nop 0
2379 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2380 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2381 ; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
2382 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2383 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2384 ; GFX9-W64-NEXT:    ; return to shader part epilog
2385 ;
2386 ; GFX10-W32-LABEL: test_strict_wwm2:
2387 ; GFX10-W32:       ; %bb.0: ; %main_body
2388 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2389 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2390 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2391 ; GFX10-W32-NEXT:    s_clause 0x1
2392 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2393 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2394 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2395 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2396 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2397 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2398 ; GFX10-W32-NEXT:    ; return to shader part epilog
2399 main_body:
2400   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2401   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2402   %src0.0 = bitcast float %src0 to i32
2403   %src1.0 = bitcast float %src1 to i32
2404   %out = add i32 %src0.0, %src1.0
2405   %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2406   %out.1 = bitcast i32 %out.0 to float
2407   ret float %out.1
2408 }
2409
2410 ; Check that we don't leave WWM on for computations that don't require WWM,
2411 ; since that will lead to clobbering things that aren't supposed to be clobbered
2412 ; in cases like this.
2413 ; We enforce this by checking that v_add gets emitted in the same block as
2414 ; WWM computations.
; test_strict_wwm3: under a branch on the mbcnt lane index, %if computes
; %src + %src, passes it through strict.wwm.f32 and then adds %src to the
; result. The expected code emits the first v_add_f32 (into v2) before the
; s_mov that restores exec from the s_or_saveexec copy, and the second
; v_add_f32 (into v0) after it, still before the %endif label.
2415 define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2416 ; GFX9-W64-LABEL: test_strict_wwm3:
2417 ; GFX9-W64:       ; %bb.0: ; %main_body
2418 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2419 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2420 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2421 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2422 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2423 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB43_2
2424 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
2425 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2426 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2427 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2428 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2429 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
2430 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2431 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2432 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
2433 ; GFX9-W64-NEXT:  .LBB43_2: ; %endif
2434 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2435 ; GFX9-W64-NEXT:    ; return to shader part epilog
2436 ;
2437 ; GFX10-W32-LABEL: test_strict_wwm3:
2438 ; GFX10-W32:       ; %bb.0: ; %main_body
2439 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2440 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2441 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2442 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2443 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2444 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB43_2
2445 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
2446 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2447 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2448 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2449 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2450 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
2451 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2452 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2453 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
2454 ; GFX10-W32-NEXT:  .LBB43_2: ; %endif
2455 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2456 ; GFX10-W32-NEXT:    ; return to shader part epilog
2457 main_body:
2458   ; use mbcnt to make sure the branch is divergent
2459   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2460   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2461   %cc = icmp uge i32 %hi, 16
2462   br i1 %cc, label %endif, label %if
2463
2464 if:
2465   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2466   %out = fadd float %src, %src
2467   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2468   %out.1 = fadd float %src, %out.0
2469   br label %endif
2470
2471 endif:
2472   %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2473   ret float %out.2
2474 }
2475
2476 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2477 ; write could clobber disabled channels in the non-WWM one.
2478 ; We enforce this by checking that v_mov gets emitted in the same block as
2479 ; WWM computations.
; test_strict_wwm4: under a branch on the mbcnt lane index, %if computes
; %src + %src and passes it through strict.wwm.f32; the result goes straight
; into the phi at %endif (the other incoming value is 0.0). The expected code
; has the v_add_f32 write v1 before the s_mov that restores exec, followed by
; a separate v_mov_b32 v0, v1 after the restore.
2480 define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2481 ; GFX9-W64-LABEL: test_strict_wwm4:
2482 ; GFX9-W64:       ; %bb.0: ; %main_body
2483 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2484 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2485 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2486 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2487 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2488 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB44_2
2489 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
2490 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2491 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2492 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2493 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2494 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2495 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2496 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2497 ; GFX9-W64-NEXT:  .LBB44_2: ; %endif
2498 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2499 ; GFX9-W64-NEXT:    ; return to shader part epilog
2500 ;
2501 ; GFX10-W32-LABEL: test_strict_wwm4:
2502 ; GFX10-W32:       ; %bb.0: ; %main_body
2503 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2504 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2505 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2506 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2507 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2508 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB44_2
2509 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
2510 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2511 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2512 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2513 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2514 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2515 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2516 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2517 ; GFX10-W32-NEXT:  .LBB44_2: ; %endif
2518 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2519 ; GFX10-W32-NEXT:    ; return to shader part epilog
2520 main_body:
2521   ; use mbcnt to make sure the branch is divergent
2522   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2523   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2524   %cc = icmp uge i32 %hi, 16
2525   br i1 %cc, label %endif, label %if
2526
2527 if:
2528   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2529   %out = fadd float %src, %src
2530   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2531   br label %endif
2532
2533 endif:
2534   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2535   ret float %out.1
2536 }
2537
2538 ; Make sure the transition from Exact to WWM then WQM works properly.
; test_strict_wwm5: a struct buffer load whose result is stored straight back,
; then a second load plus fadd passed through strict.wwm.f32, then another fadd
; passed through wqm.f32 and returned. The expected code wraps the second load
; and its v_add_f32 in s_or_saveexec -1 / s_mov exec, immediately follows the
; restore with s_wqm for the last v_add_f32, and finally ANDs exec with the
; mask saved at the top of the function (s[2:3] on GFX9, s2 on GFX10).
2539 define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2540 ; GFX9-W64-LABEL: test_strict_wwm5:
2541 ; GFX9-W64:       ; %bb.0: ; %main_body
2542 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2543 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
2544 ; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2545 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2546 ; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2547 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2548 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
2549 ; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2550 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2551 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2552 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2553 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2554 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2555 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
2556 ; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2557 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2558 ; GFX9-W64-NEXT:    ; return to shader part epilog
2559 ;
2560 ; GFX10-W32-LABEL: test_strict_wwm5:
2561 ; GFX10-W32:       ; %bb.0: ; %main_body
2562 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
2563 ; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
2564 ; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2565 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2566 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
2567 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2568 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2569 ; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2570 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2571 ; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2572 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2573 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2574 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2575 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2576 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2577 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
2578 ; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2579 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
2580 ; GFX10-W32-NEXT:    ; return to shader part epilog
2581 main_body:
2582   %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2583   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2584   %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2585   %temp = fadd float %src1, %src1
2586   %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
2587   %out = fadd float %temp.0, %temp.0
2588   %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2589   ret float %out.0
2590 }
2591
2592 ; Check that WWM is turned on correctly across basic block boundaries.
2593 ; if..then..endif version
2594 ; %src0 is loaded in the entry block and %src1 in %if; both feed the
2595 ; strict.wwm value, so each load is expected inside its own
2596 ; s_or_saveexec ... -1 / exec-restore region (see the generated checks
2597 ; below).
2598 define amdgpu_ps float @test_strict_wwm6_then() {
2599 ; GFX9-W64-LABEL: test_strict_wwm6_then:
2600 ; GFX9-W64:       ; %bb.0: ; %main_body
2601 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2602 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2603 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2604 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2605 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2606 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2607 ; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2608 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2609 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2610 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
2611 ; GFX9-W64-NEXT:  ; %bb.1: ; %if
2612 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2613 ; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2614 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2615 ; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2616 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2617 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2618 ; GFX9-W64-NEXT:  .LBB46_2: ; %endif
2619 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2620 ; GFX9-W64-NEXT:    ; return to shader part epilog
2621 ;
2622 ; GFX10-W32-LABEL: test_strict_wwm6_then:
2623 ; GFX10-W32:       ; %bb.0: ; %main_body
2624 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2625 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2626 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2627 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2628 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2629 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2630 ; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2631 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2632 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2633 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
2634 ; GFX10-W32-NEXT:  ; %bb.1: ; %if
2635 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2636 ; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2637 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2638 ; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2639 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2640 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2641 ; GFX10-W32-NEXT:  .LBB46_2: ; %endif
2642 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2643 ; GFX10-W32-NEXT:    ; return to shader part epilog
2644 main_body:
2645   %src0 = load volatile float, ptr addrspace(1) undef ; defined here but only used by the strict.wwm input in %if; expected between s_or_saveexec -1 and the exec restore
2646   ; use mbcnt to make sure the branch is divergent
2647   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2648   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2649   %cc = icmp uge i32 %hi, 16 ; true -> branch straight to %endif
2650   br i1 %cc, label %endif, label %if
2651
2652 if:
2653   %src1 = load volatile float, ptr addrspace(1) undef ; second volatile load, this time in the conditional block
2654   %out = fadd float %src0, %src1 ; strict.wwm input combining values loaded in two different blocks
2655   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) ; expected output keeps the %src1 load and the fadd in one s_or_saveexec -1 region
2656   br label %endif
2657
2658 endif:
2659   %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] ; 0.0 when %if was skipped
2660   ret float %out.1
2661 }
2662
2663 ; Check that WWM is turned on correctly across basic block boundaries.
2664 ; loop version
2665 define amdgpu_ps float @test_strict_wwm6_loop() {
2666 ; GFX9-W64-LABEL: test_strict_wwm6_loop:
2667 ; GFX9-W64:       ; %bb.0: ; %main_body
2668 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2669 ; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2670 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2671 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2672 ; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2673 ; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
2674 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
2675 ; GFX9-W64-NEXT:  .LBB47_1: ; %loop
2676 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2677 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2678 ; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2679 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2680 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2681 ; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
2682 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2683 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2684 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
2685 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2686 ; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2687 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2688 ; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2689 ; GFX9-W64-NEXT:    s_cbranch_execnz .LBB47_1
2690 ; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
2691 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2692 ; GFX9-W64-NEXT:    ; return to shader part epilog
2693 ;
2694 ; GFX10-W32-LABEL: test_strict_wwm6_loop:
2695 ; GFX10-W32:       ; %bb.0: ; %main_body
2696 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2697 ; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2698 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2699 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2700 ; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2701 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2702 ; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
2703 ; GFX10-W32-NEXT:  .LBB47_1: ; %loop
2704 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2705 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2706 ; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2707 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2708 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2709 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
2710 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2711 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
2712 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2713 ; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2714 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2715 ; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
2716 ; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
2717 ; GFX10-W32-NEXT:    s_cbranch_execnz .LBB47_1
2718 ; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
2719 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2720 ; GFX10-W32-NEXT:    ; return to shader part epilog
2721 main_body:
2722   %src0 = load volatile float, ptr addrspace(1) undef ; loaded once before the loop but only used by the strict.wwm input inside it; expected between s_or_saveexec -1 and the exec restore
2723   ; use mbcnt to make sure the branch is divergent
2724   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2725   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2726   br label %loop
2727
2728 loop:
2729   %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] ; starts at the mbcnt result and counts down
2730   %src1 = load volatile float, ptr addrspace(1) undef ; reloaded on every iteration
2731   %out = fadd float %src0, %src1 ; strict.wwm input combining the pre-loop and in-loop loads
2732   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) ; expected output puts the load and the fadd in two separate s_or_saveexec -1 regions with the counter decrement between them
2733   %counter.1 = sub i32 %counter, 1
2734   %cc = icmp ne i32 %counter.1, 0 ; keep looping until the counter reaches zero
2735   br i1 %cc, label %loop, label %endloop
2736
2737 endloop:
2738   ret float %out.0 ; strict.wwm result from the final iteration
2739 }
2740
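2741 ; Check that @llvm.amdgcn.set.inactive is lowered outside the WWM region: the inactive-lane write is bracketed by an s_not exec pair, and all lanes are only enabled afterwards for the strict.wwm input.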
2742 define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2743 ; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2744 ; GFX9-W64:       ; %bb.0: ; %main_body
2745 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2746 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
2747 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2748 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2749 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
2750 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2751 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
2752 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2753 ; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
2754 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2755 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2756 ; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2757 ; GFX9-W64-NEXT:    s_endpgm
2758 ;
2759 ; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2760 ; GFX10-W32:       ; %bb.0: ; %main_body
2761 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2762 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
2763 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2764 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2765 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2766 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2767 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2768 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2769 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
2770 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2771 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2772 ; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2773 ; GFX10-W32-NEXT:    s_endpgm
2774 main_body:
2775   %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) ; expected before any exec manipulation
2776   %src.0 = bitcast float %src to i32 ; set.inactive and strict.wwm are used in their i32 form here
2777   %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) ; expected output copies %src.0, then writes the 0 operand between two s_not exec instructions
2778   %out = add i32 %src.1, %src.1 ; strict.wwm input; expected inside the s_or_saveexec -1 region that follows the s_not pair
2779   %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2780   %out.1 = bitcast i32 %out.0 to float
2781   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) ; expected after exec has been restored
2782   ret void
2783 }
2784
2785 ; Check a case of a block being entirely WQM except for a bit of WWM.
2786 ; There was a bug where it forgot to enter and leave WWM.
2787 define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2788 ; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2789 ; GFX9-W64:       ; %bb.0: ; %main_body
2790 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2791 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2792 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2793 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2794 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2795 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB49_2
2796 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2797 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2798 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2799 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2800 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2801 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
2802 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2803 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
2804 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
2805 ; GFX9-W64-NEXT:    s_not_b64 exec, exec
2806 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2807 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2808 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2809 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2810 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2811 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2812 ; GFX9-W64-NEXT:  .LBB49_2: ; %ENDIF
2813 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2814 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2815 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2816 ; GFX9-W64-NEXT:    ; return to shader part epilog
2817 ;
2818 ; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2819 ; GFX10-W32:       ; %bb.0: ; %main_body
2820 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2821 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2822 ; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2823 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2824 ; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2825 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB49_2
2826 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2827 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2828 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2829 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2830 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2831 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
2832 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2833 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2834 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
2835 ; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2836 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2837 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2838 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2839 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2840 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2841 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2842 ; GFX10-W32-NEXT:  .LBB49_2: ; %ENDIF
2843 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2844 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2845 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2846 ; GFX10-W32-NEXT:    ; return to shader part epilog
2847 main_body:
2848   %cmp = icmp eq i32 %z, 0 ; expected output enters WQM (s_wqm) before this compare
2849   br i1 %cmp, label %IF, label %ENDIF
2850
2851 IF:
2852   %c.bc = bitcast i32 %c to float
2853   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2854   %tex0 = extractelement <4 x float> %tex, i32 0
2855   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; dependent sample: its coordinate is the first sample's result
2856   %dataf = extractelement <4 x float> %dtex, i32 0
2857   %data1 = fptosi float %dataf to i32
2858   %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) ; expected output writes the 0 operand between two s_not exec instructions, before the swizzle
2859   %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) ; 2079 is printed as swizzle(SWAP,2) in the expected output
2860   %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3) ; expected output wraps only the ds_swizzle in s_or_saveexec -1 / exec restore
2861   %data4f = sitofp i32 %data4 to float
2862   br label %ENDIF
2863
2864 ENDIF:
2865   %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ] ; 0.0 when %IF was skipped
2866   ret float %r
2867 }
2868
2869 ; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
2870 define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2871 ; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2872 ; GFX9-W64:       ; %bb.0: ; %main_body
2873 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2874 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2875 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2876 ; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2877 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2878 ; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2879 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB50_2
2880 ; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2881 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2882 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2883 ; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2884 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2885 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
2886 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2887 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2888 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2889 ; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
2890 ; GFX9-W64-NEXT:  .LBB50_2: ; %ENDIF
2891 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2892 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2893 ; GFX9-W64-NEXT:    ; return to shader part epilog
2894 ;
2895 ; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2896 ; GFX10-W32:       ; %bb.0: ; %main_body
2897 ; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2898 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2899 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2900 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2901 ; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2902 ; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
2903 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB50_2
2904 ; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2905 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2906 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2907 ; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2908 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2909 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
2910 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2911 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2912 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2913 ; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
2914 ; GFX10-W32-NEXT:  .LBB50_2: ; %ENDIF
2915 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2916 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2917 ; GFX10-W32-NEXT:    ; return to shader part epilog
2918 main_body:
2919   %cmp = icmp eq i32 %z, 0 ; expected output enters WQM (s_wqm) before this compare
2920   br i1 %cmp, label %IF, label %ENDIF
2921
2922 IF:
2923   %c.bc = bitcast i32 %c to float
2924   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2925   %tex0 = extractelement <4 x float> %tex, i32 0
2926   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; dependent sample: its coordinate is the first sample's result
2927   %dataf = extractelement <4 x float> %dtex, i32 0
2928   %data1 = fptosi float %dataf to i32
2929   %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079) ; swizzles %data1 directly; there is no set.inactive in this variant
2930   %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2) ; expected output has no extra exec save/restore around the ds_swizzle: s_wqm was already applied at function entry
2931   %data3f = sitofp i32 %data3 to float
2932   br label %ENDIF
2933
2934 ENDIF:
2935   %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ] ; 0.0 when %IF was skipped
2936   ret float %r
2937 }
2938
2939 ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
2940 define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
2941 ; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
2942 ; GFX9-W64:       ; %bb.0: ; %main_body
2943 ; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
2944 ; GFX9-W64-NEXT:    s_mov_b32 s19, s17
2945 ; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
2946 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2947 ; GFX9-W64-NEXT:    s_mov_b32 s23, s5
2948 ; GFX9-W64-NEXT:    s_mov_b32 s22, s4
2949 ; GFX9-W64-NEXT:    s_mov_b32 s21, s3
2950 ; GFX9-W64-NEXT:    s_mov_b32 s20, s2
2951 ; GFX9-W64-NEXT:    s_mov_b32 s27, s9
2952 ; GFX9-W64-NEXT:    s_mov_b32 s26, s8
2953 ; GFX9-W64-NEXT:    s_mov_b32 s25, s7
2954 ; GFX9-W64-NEXT:    s_mov_b32 s24, s6
2955 ; GFX9-W64-NEXT:    s_mov_b32 s18, s16
2956 ; GFX9-W64-NEXT:    s_mov_b32 s17, s15
2957 ; GFX9-W64-NEXT:    s_mov_b32 s16, s14
2958 ; GFX9-W64-NEXT:    s_mov_b32 s15, s13
2959 ; GFX9-W64-NEXT:    s_mov_b32 s14, s12
2960 ; GFX9-W64-NEXT:    s_mov_b32 s13, s11
2961 ; GFX9-W64-NEXT:    s_mov_b32 s12, s10
2962 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
2963 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
2964 ; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
2965 ; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2966 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2967 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
2968 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2969 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2970 ; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
2971 ; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
2972 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2973 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2974 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2975 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
2976 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
2977 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2978 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2979 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2980 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
2981 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2982 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
2983 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
2984 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
2985 ; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
2986 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2987 ; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
2988 ; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
2989 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2990 ; GFX9-W64-NEXT:    ; return to shader part epilog
2991 ;
2992 ; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
2993 ; GFX10-W32:       ; %bb.0: ; %main_body
2994 ; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
2995 ; GFX10-W32-NEXT:    s_mov_b32 s19, s17
2996 ; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
2997 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2998 ; GFX10-W32-NEXT:    s_mov_b32 s23, s5
2999 ; GFX10-W32-NEXT:    s_mov_b32 s22, s4
3000 ; GFX10-W32-NEXT:    s_mov_b32 s21, s3
3001 ; GFX10-W32-NEXT:    s_mov_b32 s20, s2
3002 ; GFX10-W32-NEXT:    s_mov_b32 s27, s9
3003 ; GFX10-W32-NEXT:    s_mov_b32 s26, s8
3004 ; GFX10-W32-NEXT:    s_mov_b32 s25, s7
3005 ; GFX10-W32-NEXT:    s_mov_b32 s24, s6
3006 ; GFX10-W32-NEXT:    s_mov_b32 s18, s16
3007 ; GFX10-W32-NEXT:    s_mov_b32 s17, s15
3008 ; GFX10-W32-NEXT:    s_mov_b32 s16, s14
3009 ; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3010 ; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3011 ; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3012 ; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3013 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
3014 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
3015 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3016 ; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
3017 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3018 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
3019 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
3020 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
3021 ; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
3022 ; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
3023 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
3024 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3025 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3026 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3027 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3028 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3029 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3030 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3031 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3032 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3033 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3034 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3035 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
3036 ; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
3037 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3038 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3039 ; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
3040 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3041 ; GFX10-W32-NEXT:    ; return to shader part epilog
3042 main_body:
3043   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) ; expected output restores the exec saved just before the entry s_wqm ahead of this store
3044   %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) ; strict.wqm input: expected between an s_wqm and an exec restore
3045   %temp = fadd float %reload, %reload ; strict.wqm input: expected in a second s_wqm / exec-restore region, after the all-lanes load
3046   %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3047   %temp3 = fadd float %temp2, %temp2 ; expected after the final s_wqm
3048   %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0) ; strict.wwm input: expected between s_or_saveexec -1 and an exec restore
3049   %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
3050   %temp5 = fadd float %temp3, %temp4 ; sample coordinate built from both strict results
3051   %res.int = ptrtoint ptr addrspace(8) %res to i128 ; reinterpret the buffer resource %res ...
3052   %res.vec = bitcast i128 %res.int to <4 x i32> ; ... as the <4 x i32> operand of the sample below
3053   %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0) ; expected after exec is and-ed with the mask saved at entry
3054   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3055   %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3056   ret float %out
3057 }
3058
3059 define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3060 ; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3061 ; GFX9-W64:       ; %bb.0: ; %main_body
3062 ; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3063 ; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3064 ; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3065 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3066 ; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3067 ; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3068 ; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3069 ; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3070 ; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3071 ; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3072 ; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3073 ; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3074 ; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3075 ; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3076 ; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3077 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3078 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3079 ; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3080 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
3081 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
3082 ; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3083 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3084 ; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3085 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3086 ; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3087 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3088 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
3089 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3090 ; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
3091 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3092 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3093 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
3094 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3095 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3096 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
3097 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
3098 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3099 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3100 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3101 ; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3102 ; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3103 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3104 ; GFX9-W64-NEXT:    ; return to shader part epilog
3105 ;
3106 ; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3107 ; GFX10-W32:       ; %bb.0: ; %main_body
3108 ; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3109 ; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3110 ; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3111 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3112 ; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3113 ; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3114 ; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3115 ; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3116 ; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3117 ; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3118 ; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3119 ; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3120 ; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3121 ; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3122 ; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3123 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3124 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3125 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3126 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
3127 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3128 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3129 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3130 ; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3131 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3132 ; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3133 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3134 ; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3135 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3136 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3137 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3138 ; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3139 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3140 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3141 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3142 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3143 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3144 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3145 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3146 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3147 ; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3148 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3149 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3150 ; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3151 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3152 ; GFX10-W32-NEXT:    ; return to shader part epilog
3153 main_body:
3154   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ; plain store first; the rest mixes strict.wwm, strict.wqm and a sample, with strict.wwm consumed first
3155   %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) ; strict.wwm input: expected between s_or_saveexec -1 and an exec restore
3156   %temp = fadd float %reload, %reload ; strict.wwm input: expected in a second s_or_saveexec -1 region, after the strict.wqm load
3157   %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3158   %temp3 = fadd float %temp2, %temp2 ; expected after the final s_wqm
3159   %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ; despite its name this load feeds strict.wqm: expected between an s_wqm and an exec restore
3160   %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3161   %temp5 = fadd float %temp3, %temp4 ; sample coordinate built from both strict results
3162   %res.int = ptrtoint ptr addrspace(8) %res to i128 ; reinterpret the buffer resource %res ...
3163   %res.vec = bitcast i128 %res.int to <4 x i32> ; ... as the <4 x i32> operand of the sample below
3164   %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) ; expected after exec is and-ed with the mask saved at entry
3165   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3166   %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3167   ret float %out
3168 }
3169
3170 ; TODO: The WQM -> StrictWQM transition could be improved: StrictWQM could reuse the exec from the previous WQM state instead of issuing s_wqm again.
3171 define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3172 ; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3173 ; GFX9-W64:       ; %bb.0: ; %main_body
3174 ; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3175 ; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3176 ; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3177 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3178 ; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3179 ; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3180 ; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3181 ; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3182 ; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3183 ; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3184 ; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3185 ; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3186 ; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3187 ; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3188 ; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3189 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3190 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3191 ; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3192 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3193 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
3194 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
3195 ; GFX9-W64-NEXT:    s_nop 0
3196 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3197 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3198 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3199 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3200 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3201 ; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
3202 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3203 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3204 ; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
3205 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3206 ; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3207 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3208 ; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3209 ; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3210 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3211 ; GFX9-W64-NEXT:    ; return to shader part epilog
3212 ;
3213 ; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3214 ; GFX10-W32:       ; %bb.0: ; %main_body
3215 ; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3216 ; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3217 ; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3218 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3219 ; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3220 ; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3221 ; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3222 ; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3223 ; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3224 ; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3225 ; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3226 ; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3227 ; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3228 ; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3229 ; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3230 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3231 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3232 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3233 ; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
3234 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3235 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3236 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3237 ; GFX10-W32-NEXT:    s_clause 0x1
3238 ; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
3239 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3240 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3241 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3242 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3243 ; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
3244 ; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3245 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3246 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3247 ; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
3248 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3249 ; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3250 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3251 ; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3252 ; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3253 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3254 ; GFX10-W32-NEXT:    ; return to shader part epilog
3255 main_body:
3256   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ; store %inp at index %idx0
3257   %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0) ; load at index %idx1
3258   %temp = fadd float %reload, %reload ; coordinate operand of the first image sample
3259   %res.int = ptrtoint ptr addrspace(8) %res to i128 ; reinterpret the buffer resource pointer as raw bits ...
3260   %res.vec = bitcast i128 %res.int to <4 x i32> ; ... so it can be passed as the <4 x i32> operand of both image samples
3261   %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) ; first image sample
3262   %temp2 = fadd float %tex, %tex ; arithmetic on the first sample result
3263   %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ; load at index %idx0; despite the "wwm" name it feeds strict.wqm below
3264   %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) ; the only strict.wqm use in this function
3265   %temp4 = fadd float %temp2, %temp3 ; combines the first sample with the strict.wqm value; coordinate of the second sample
3266   %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0) ; second image sample
3267   call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ; store the second sample result at index %idx0
3268   %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0) ; load index %idx0 again for the return value
3269   ret float %out
3270 }
3271
3272 ; Check that the correct VCC register is selected: this guards against the WQM pass using VCC
3273 ; instead of VCC_LO for vector comparisons in Wave32 mode (the GFX10-W32 checks expect vcc_lo/exec_lo).
3274 define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
3275 ; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
3276 ; GFX9-W64:       ; %bb.0: ; %main_body
3277 ; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
3278 ; GFX9-W64-NEXT:    s_mov_b32 s2, 32
3279 ; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
3280 ; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
3281 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
3282 ; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
3283 ; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
3284 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB54_1
3285 ; GFX9-W64-NEXT:    s_endpgm
3286 ; GFX9-W64-NEXT:  .LBB54_1:
3287 ; GFX9-W64-NEXT:    s_mov_b64 exec, 0
3288 ; GFX9-W64-NEXT:    exp null off, off, off, off done vm
3289 ; GFX9-W64-NEXT:    s_endpgm
3290 ;
3291 ; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
3292 ; GFX10-W32:       ; %bb.0: ; %main_body
3293 ; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
3294 ; GFX10-W32-NEXT:    s_mov_b32 s2, 32
3295 ; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
3296 ; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
3297 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
3298 ; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
3299 ; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
3300 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB54_1
3301 ; GFX10-W32-NEXT:    s_endpgm
3302 ; GFX10-W32-NEXT:  .LBB54_1:
3303 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
3304 ; GFX10-W32-NEXT:    exp null off, off, off, off done vm
3305 ; GFX10-W32-NEXT:    s_endpgm
3306 main_body:
3307   %1 = ptrtoint ptr addrspace(6) %0 to i32 ; the incoming pointer as an i32 ...
3308   %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0 ; ... placed in element 0; elements 1-3 are the constants seen as 0x8000, 32, 0x31016fac in the checks
3309   %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3 ; scalar buffer load at offset 0
3310   %4 = fcmp nsz arcp ugt float %3, 0.000000e+00 ; kill condition; the checks expect the inverse compare (v_cmp_le)
3311   call void @llvm.amdgcn.kill(i1 %4) #1 ; checks expect exec and-not'ed with the compare result, using vcc_lo/exec_lo on wave32
3312   ret void
3313 }
3314
3315 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3316 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
3317
3318 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3319 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3320 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3321 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3322 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3323 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
3324
3325 declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3326 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3327 declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
3328 declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
3329 declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
3330 declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
3331
3332 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3333 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3334 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3335 declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3336 declare void @llvm.amdgcn.kill(i1) #1
3337 declare float @llvm.amdgcn.wqm.f32(float) #3
3338 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3339 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3340 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3341 declare float @llvm.amdgcn.wwm.f32(float) #3
3342 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3343 declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3344 declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3345 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3346 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3347 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3348 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3349 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3350 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3351 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
3352 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3353 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
3354
3355 attributes #1 = { nounwind }
3356 attributes #2 = { nounwind readonly }
3357 attributes #3 = { nounwind readnone }
3358 attributes #4 = { nounwind readnone convergent }
3359 attributes #5 = { "amdgpu-ps-wqm-outputs" }
3360 attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3361 attributes #7 = { nounwind readnone willreturn }