1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
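; The two RUN lines exercise the wave64 (GFX9-W64) and wave32 (GFX10-W32)
; encodings of the same exec-mask manipulation. As background for the checks
; below: WQM (whole quad mode) enables all four lanes of any pixel quad that
; has at least one live lane, which is what derivative and implicit-LOD
; sampling computations rely on, while WWM (whole wave mode) enables every
; lane of the wave. The strict variants, as the later tests show, save and
; restore exec around just the marked computation instead of being merged with
; surrounding WQM regions.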
5 ; Check that WQM isn't triggered by image load/store intrinsics.
6 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
7 ; GFX9-W64-LABEL: test1:
8 ; GFX9-W64: ; %bb.0: ; %main_body
9 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
10 ; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
11 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
12 ; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
13 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
14 ; GFX9-W64-NEXT: ; return to shader part epilog
16 ; GFX10-W32-LABEL: test1:
17 ; GFX10-W32: ; %bb.0: ; %main_body
18 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
19 ; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
20 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
21 ; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
22 ; GFX10-W32-NEXT: ; return to shader part epilog
24 %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
25 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
29 ; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
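; The pattern checked below is: copy exec to a scratch SGPR, enter WQM with
; s_wqm, run the v_interp instructions that produce the sample coordinates
; under WQM, then s_and exec with the saved mask so the remaining code runs
; only on live lanes.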
30 define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
31 ; GFX9-W64-LABEL: test2:
32 ; GFX9-W64: ; %bb.0: ; %main_body
33 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
34 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
35 ; GFX9-W64-NEXT: s_mov_b32 m0, s3
36 ; GFX9-W64-NEXT: s_nop 0
37 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
38 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
39 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
40 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
41 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
42 ; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
43 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-W64-NEXT: ; return to shader part epilog
46 ; GFX10-W32-LABEL: test2:
47 ; GFX10-W32: ; %bb.0: ; %main_body
48 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
49 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
50 ; GFX10-W32-NEXT: s_mov_b32 m0, s3
51 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
52 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
53 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
54 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
55 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
56 ; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
57 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-W32-NEXT: ; return to shader part epilog
60 %inst23 = extractelement <2 x float> %pos, i32 0
61 %inst24 = extractelement <2 x float> %pos, i32 1
62 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
63 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
64 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
65 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
66 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
70 ; ... but disabled for stores (and, in this simple case, not re-enabled) ...
71 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
72 ; GFX9-W64-LABEL: test3:
73 ; GFX9-W64: ; %bb.0: ; %main_body
74 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
75 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
76 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
77 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
78 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
79 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
80 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
81 ; GFX9-W64-NEXT: ; return to shader part epilog
83 ; GFX10-W32-LABEL: test3:
84 ; GFX10-W32: ; %bb.0: ; %main_body
85 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
86 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
87 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
88 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
89 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
90 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
91 ; GFX10-W32-NEXT: ; return to shader part epilog
93 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
94 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
95 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
97 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
102 define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
103 ; GFX9-W64-LABEL: test3_ptr_buf:
104 ; GFX9-W64: ; %bb.0: ; %main_body
105 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
106 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
107 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
108 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
109 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
110 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
111 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
112 ; GFX9-W64-NEXT: ; return to shader part epilog
114 ; GFX10-W32-LABEL: test3_ptr_buf:
115 ; GFX10-W32: ; %bb.0: ; %main_body
116 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
117 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
118 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
119 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
120 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
122 ; GFX10-W32-NEXT: ; return to shader part epilog
124 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
125 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
126 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
128 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0)
133 ; ... and disabled for export.
134 define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
135 ; GFX9-W64-LABEL: test3x:
136 ; GFX9-W64: ; %bb.0: ; %main_body
137 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
138 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
139 ; GFX9-W64-NEXT: s_mov_b32 m0, s3
140 ; GFX9-W64-NEXT: s_nop 0
141 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
142 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
143 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
144 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
145 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
146 ; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
147 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
148 ; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm
149 ; GFX9-W64-NEXT: s_endpgm
151 ; GFX10-W32-LABEL: test3x:
152 ; GFX10-W32: ; %bb.0: ; %main_body
153 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
154 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
155 ; GFX10-W32-NEXT: s_mov_b32 m0, s3
156 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
157 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
158 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
159 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
160 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
161 ; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
162 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
163 ; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm
164 ; GFX10-W32-NEXT: s_endpgm
166 %inst23 = extractelement <2 x float> %pos, i32 0
167 %inst24 = extractelement <2 x float> %pos, i32 1
168 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
169 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
170 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
171 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
172 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
173 %tex.0 = extractelement <4 x float> %tex, i32 0
174 %tex.1 = extractelement <4 x float> %tex, i32 1
175 %tex.2 = extractelement <4 x float> %tex, i32 2
176 %tex.3 = extractelement <4 x float> %tex, i32 3
177 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
181 ; Check that WQM is re-enabled when required.
182 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
183 ; GFX9-W64-LABEL: test4:
184 ; GFX9-W64: ; %bb.0: ; %main_body
185 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
186 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
187 ; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
188 ; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
189 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
190 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
191 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
192 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
193 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
194 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
195 ; GFX9-W64-NEXT: ; return to shader part epilog
197 ; GFX10-W32-LABEL: test4:
198 ; GFX10-W32: ; %bb.0: ; %main_body
199 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
200 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
201 ; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
202 ; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
203 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
204 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
205 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
206 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
207 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
208 ; GFX10-W32-NEXT: ; return to shader part epilog
210 %c.1 = mul i32 %c, %d
212 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
213 %c.1.bc = bitcast i32 %c.1 to float
214 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
215 %tex0 = extractelement <4 x float> %tex, i32 0
216 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
217 ret <4 x float> %dtex
220 define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
221 ; GFX9-W64-LABEL: test4_ptr_buf:
222 ; GFX9-W64: ; %bb.0: ; %main_body
223 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
224 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
225 ; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
226 ; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
227 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
228 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
229 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
230 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
231 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
232 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
233 ; GFX9-W64-NEXT: ; return to shader part epilog
235 ; GFX10-W32-LABEL: test4_ptr_buf:
236 ; GFX10-W32: ; %bb.0: ; %main_body
237 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
238 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
239 ; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
240 ; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
241 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
242 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
243 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
244 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
245 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
246 ; GFX10-W32-NEXT: ; return to shader part epilog
248 %c.1 = mul i32 %c, %d
250 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0)
251 %c.1.bc = bitcast i32 %c.1 to float
252 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
253 %tex0 = extractelement <4 x float> %tex, i32 0
254 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
255 ret <4 x float> %dtex
258 ; Check that WQM is triggered by the wqm intrinsic.
259 ; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
260 ; does not happen - the v_add should write the return reg directly.
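; In the checks below the two buffer loads and the v_add all sit between s_wqm
; and the final s_and of exec, and the "kill" annotation on v0 shows the add
; writing the return register directly, with no trailing v_mov.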
261 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
262 ; GFX9-W64-LABEL: test5:
263 ; GFX9-W64: ; %bb.0: ; %main_body
264 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
265 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
266 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
267 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
268 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
269 ; GFX9-W64-NEXT: s_nop 0
270 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
271 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
272 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
273 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
274 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
275 ; GFX9-W64-NEXT: ; return to shader part epilog
277 ; GFX10-W32-LABEL: test5:
278 ; GFX10-W32: ; %bb.0: ; %main_body
279 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
280 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
281 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
282 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
283 ; GFX10-W32-NEXT: s_clause 0x1
284 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
285 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
286 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
287 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
288 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
289 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
290 ; GFX10-W32-NEXT: ; return to shader part epilog
292 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
293 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
294 %out = fadd float %src0, %src1
295 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
299 define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
300 ; GFX9-W64-LABEL: test5_ptr_buf:
301 ; GFX9-W64: ; %bb.0: ; %main_body
302 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
303 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
304 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
305 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
306 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
307 ; GFX9-W64-NEXT: s_nop 0
308 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
309 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
310 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
311 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
312 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
313 ; GFX9-W64-NEXT: ; return to shader part epilog
315 ; GFX10-W32-LABEL: test5_ptr_buf:
316 ; GFX10-W32: ; %bb.0: ; %main_body
317 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
318 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
319 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
320 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
321 ; GFX10-W32-NEXT: s_clause 0x1
322 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
323 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
324 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
325 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
326 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
327 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
328 ; GFX10-W32-NEXT: ; return to shader part epilog
330 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
331 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
332 %out = fadd float %src0, %src1
333 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
337 ; Check that the wqm intrinsic works correctly for integers.
338 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
339 ; GFX9-W64-LABEL: test6:
340 ; GFX9-W64: ; %bb.0: ; %main_body
341 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
342 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
343 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
344 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
345 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
346 ; GFX9-W64-NEXT: s_nop 0
347 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
348 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
349 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
350 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
351 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
352 ; GFX9-W64-NEXT: ; return to shader part epilog
354 ; GFX10-W32-LABEL: test6:
355 ; GFX10-W32: ; %bb.0: ; %main_body
356 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
357 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
358 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
359 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
360 ; GFX10-W32-NEXT: s_clause 0x1
361 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
362 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
363 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
364 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
365 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
366 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
367 ; GFX10-W32-NEXT: ; return to shader part epilog
369 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
370 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
371 %out = fadd float %src0, %src1
372 %out.0 = bitcast float %out to i32
373 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
374 %out.2 = bitcast i32 %out.1 to float
378 define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
379 ; GFX9-W64-LABEL: test6_ptr_buf:
380 ; GFX9-W64: ; %bb.0: ; %main_body
381 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
382 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
383 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
384 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
385 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
386 ; GFX9-W64-NEXT: s_nop 0
387 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
388 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
389 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
390 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
391 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
392 ; GFX9-W64-NEXT: ; return to shader part epilog
394 ; GFX10-W32-LABEL: test6_ptr_buf:
395 ; GFX10-W32: ; %bb.0: ; %main_body
396 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
397 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
398 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
399 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
400 ; GFX10-W32-NEXT: s_clause 0x1
401 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
402 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
403 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
404 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
405 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
406 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
407 ; GFX10-W32-NEXT: ; return to shader part epilog
409 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
410 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
411 %out = fadd float %src0, %src1
412 %out.0 = bitcast float %out to i32
413 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
414 %out.2 = bitcast i32 %out.1 to float
418 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
420 ; Check that WWM is triggered by the wwm intrinsic.
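; The WWM idiom in the checks: s_or_saveexec with -1 saves the live mask and
; enables every lane, the loads and the add execute with the full wave active,
; exec is restored from the saved mask, and the result is then copied with
; v_mov into the register that is live in exact mode.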
421 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
422 ; GFX9-W64-LABEL: test_wwm1:
423 ; GFX9-W64: ; %bb.0: ; %main_body
424 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
425 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
426 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
427 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
428 ; GFX9-W64-NEXT: s_nop 0
429 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
430 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
431 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
432 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
433 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
434 ; GFX9-W64-NEXT: ; return to shader part epilog
436 ; GFX10-W32-LABEL: test_wwm1:
437 ; GFX10-W32: ; %bb.0: ; %main_body
438 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
439 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
440 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
441 ; GFX10-W32-NEXT: s_clause 0x1
442 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
443 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
444 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
445 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
446 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
447 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
448 ; GFX10-W32-NEXT: ; return to shader part epilog
450 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
451 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
452 %out = fadd float %src0, %src1
453 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
457 ; Same as above, but with an integer type.
458 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
459 ; GFX9-W64-LABEL: test_wwm2:
460 ; GFX9-W64: ; %bb.0: ; %main_body
461 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
462 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
463 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
464 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
465 ; GFX9-W64-NEXT: s_nop 0
466 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
467 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
469 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
470 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
471 ; GFX9-W64-NEXT: ; return to shader part epilog
473 ; GFX10-W32-LABEL: test_wwm2:
474 ; GFX10-W32: ; %bb.0: ; %main_body
475 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
476 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
477 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
478 ; GFX10-W32-NEXT: s_clause 0x1
479 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
480 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
481 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
482 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
483 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
484 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
485 ; GFX10-W32-NEXT: ; return to shader part epilog
487 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
488 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
489 %src0.0 = bitcast float %src0 to i32
490 %src1.0 = bitcast float %src1 to i32
491 %out = add i32 %src0.0, %src1.0
492 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
493 %out.1 = bitcast i32 %out.0 to float
497 ; Check that we don't leave WWM on for computations that don't require WWM,
498 ; since that will lead to clobbering things that aren't supposed to be clobbered
499 ; in cases like this.
500 ; We enforce this by checking that v_add gets emitted in the same block as the WWM computation.
502 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
503 ; GFX9-W64-LABEL: test_wwm3:
504 ; GFX9-W64: ; %bb.0: ; %main_body
505 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
506 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
507 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
508 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
509 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
510 ; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2
511 ; GFX9-W64-NEXT: ; %bb.1: ; %if
512 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
513 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
514 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
515 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
516 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
517 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
518 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
519 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
520 ; GFX9-W64-NEXT: .LBB13_2: ; %endif
521 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
522 ; GFX9-W64-NEXT: ; return to shader part epilog
524 ; GFX10-W32-LABEL: test_wwm3:
525 ; GFX10-W32: ; %bb.0: ; %main_body
526 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
527 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
528 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
529 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
530 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
531 ; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2
532 ; GFX10-W32-NEXT: ; %bb.1: ; %if
533 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
534 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
535 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
536 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
537 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
538 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
539 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
540 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
541 ; GFX10-W32-NEXT: .LBB13_2: ; %endif
542 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
543 ; GFX10-W32-NEXT: ; return to shader part epilog
545 ; use mbcnt to make sure the branch is divergent
546 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
547 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
548 %cc = icmp uge i32 %hi, 16
549 br i1 %cc, label %endif, label %if
552 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
553 %out = fadd float %src, %src
554 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
555 %out.1 = fadd float %src, %out.0
559 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
563 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
564 ; write could clobber disabled channels in the non-WWM one.
565 ; We enforce this by checking that v_mov gets emitted in the same block as the WWM computation.
567 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
568 ; GFX9-W64-LABEL: test_wwm4:
569 ; GFX9-W64: ; %bb.0: ; %main_body
570 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
571 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
572 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
573 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
574 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
575 ; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2
576 ; GFX9-W64-NEXT: ; %bb.1: ; %if
577 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
578 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
579 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
580 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
581 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
582 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
583 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
584 ; GFX9-W64-NEXT: .LBB14_2: ; %endif
585 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
586 ; GFX9-W64-NEXT: ; return to shader part epilog
588 ; GFX10-W32-LABEL: test_wwm4:
589 ; GFX10-W32: ; %bb.0: ; %main_body
590 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
591 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
592 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
593 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
594 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
595 ; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2
596 ; GFX10-W32-NEXT: ; %bb.1: ; %if
597 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
598 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
599 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
600 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
601 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
602 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
603 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
604 ; GFX10-W32-NEXT: .LBB14_2: ; %endif
605 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
606 ; GFX10-W32-NEXT: ; return to shader part epilog
608 ; use mbcnt to make sure the branch is divergent
609 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
610 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
611 %cc = icmp uge i32 %hi, 16
612 br i1 %cc, label %endif, label %if
615 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
616 %out = fadd float %src, %src
617 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
621 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
625 ; Make sure the transition from Exact to WWM then WQM works properly.
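; The checks below follow the sequence: the first load and the store run with
; the original (exact) mask, the second load and its add run under
; s_or_saveexec (WWM), exec is restored, and s_wqm re-enables the quads for the
; final add whose result is returned.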
626 define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
627 ; GFX9-W64-LABEL: test_wwm5:
628 ; GFX9-W64: ; %bb.0: ; %main_body
629 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
630 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
631 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
632 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
633 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
634 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
635 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
636 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
637 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
638 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
639 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
640 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
641 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
642 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
643 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
644 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
645 ; GFX9-W64-NEXT: ; return to shader part epilog
647 ; GFX10-W32-LABEL: test_wwm5:
648 ; GFX10-W32: ; %bb.0: ; %main_body
649 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
650 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
651 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
652 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
653 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
654 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
655 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
656 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
657 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
658 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
659 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
661 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
662 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
663 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
664 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
665 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
666 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
667 ; GFX10-W32-NEXT: ; return to shader part epilog
669 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
670 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
671 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
672 %temp = fadd float %src1, %src1
673 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
674 %out = fadd float %temp.0, %temp.0
675 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
679 ; Check that WWM is turned on correctly across basic block boundaries.
680 ; if..then..endif version
685 define amdgpu_ps float @test_wwm6_then() {
686 ; GFX9-W64-LABEL: test_wwm6_then:
687 ; GFX9-W64: ; %bb.0: ; %main_body
688 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
689 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
690 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
691 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
692 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
693 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
694 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
695 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
696 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
697 ; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2
698 ; GFX9-W64-NEXT: ; %bb.1: ; %if
699 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
700 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
701 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
702 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
703 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
704 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
705 ; GFX9-W64-NEXT: .LBB16_2: ; %endif
706 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
707 ; GFX9-W64-NEXT: ; return to shader part epilog
709 ; GFX10-W32-LABEL: test_wwm6_then:
710 ; GFX10-W32: ; %bb.0: ; %main_body
711 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
712 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
713 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
714 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
715 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
716 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
717 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
718 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
719 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
720 ; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2
721 ; GFX10-W32-NEXT: ; %bb.1: ; %if
722 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
723 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
724 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
725 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
726 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
727 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
728 ; GFX10-W32-NEXT: .LBB16_2: ; %endif
729 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
730 ; GFX10-W32-NEXT: ; return to shader part epilog
732 %src0 = load volatile float, ptr addrspace(1) undef
733 ; use mbcnt to make sure the branch is divergent
734 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
735 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
736 %cc = icmp uge i32 %hi, 16
737 br i1 %cc, label %endif, label %if
740 %src1 = load volatile float, ptr addrspace(1) undef
741 %out = fadd float %src0, %src1
742 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
746 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
750 ; Check that WWM is turned on correctly across basic block boundaries.
756 define amdgpu_ps float @test_wwm6_loop() {
757 ; GFX9-W64-LABEL: test_wwm6_loop:
758 ; GFX9-W64: ; %bb.0: ; %main_body
759 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
760 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
761 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
762 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
763 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
764 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
765 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
766 ; GFX9-W64-NEXT: .LBB17_1: ; %loop
767 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
768 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
769 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
770 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
771 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
772 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
773 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
774 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
775 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
776 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
777 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
778 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
779 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
780 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1
781 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
782 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
783 ; GFX9-W64-NEXT: ; return to shader part epilog
785 ; GFX10-W32-LABEL: test_wwm6_loop:
786 ; GFX10-W32: ; %bb.0: ; %main_body
787 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
788 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
789 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
790 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
791 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
792 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
793 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
794 ; GFX10-W32-NEXT: .LBB17_1: ; %loop
795 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
796 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
797 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
798 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
799 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
800 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
801 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
802 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
803 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
804 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
805 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
806 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
807 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
808 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1
809 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
810 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
811 ; GFX10-W32-NEXT: ; return to shader part epilog
813 %src0 = load volatile float, ptr addrspace(1) undef
814 ; use mbcnt to make sure the branch is divergent
815 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
816 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
820 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
821 %src1 = load volatile float, ptr addrspace(1) undef
822 %out = fadd float %src0, %src1
823 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
824 %counter.1 = sub i32 %counter, 1
825 %cc = icmp ne i32 %counter.1, 0
826 br i1 %cc, label %loop, label %endloop
832 ; Check that @llvm.amdgcn.set.inactive disables WWM.
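; In the checks, set.inactive becomes a v_cndmask that selects the loaded
; value for lanes that were live at the call and the constant 0 for inactive
; lanes, keyed off the exec mask saved by s_or_saveexec; the add then runs in
; WWM and the store is issued back in exact mode.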
833 define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
834 ; GFX9-W64-LABEL: test_wwm_set_inactive1:
835 ; GFX9-W64: ; %bb.0: ; %main_body
836 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
837 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
838 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
839 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
840 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
841 ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
842 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
843 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
844 ; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
845 ; GFX9-W64-NEXT: s_endpgm
847 ; GFX10-W32-LABEL: test_wwm_set_inactive1:
848 ; GFX10-W32: ; %bb.0: ; %main_body
849 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
850 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
851 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
852 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
853 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
854 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
855 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
856 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
857 ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
858 ; GFX10-W32-NEXT: s_endpgm
860 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
861 %src.0 = bitcast float %src to i32
862 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
863 %out = add i32 %src.1, %src.1
864 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
865 %out.1 = bitcast i32 %out.0 to float
866 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
870 ; Check that Strict WQM is triggered by the strict_wqm intrinsic.
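; Unlike the plain wqm intrinsic in test5, the strict form saves exec, applies
; s_wqm only around the marked computation, and restores the saved mask with
; s_mov rather than narrowing it with s_and, so the surrounding code keeps its
; original exec mask.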
871 define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
872 ; GFX9-W64-LABEL: test_strict_wqm1:
873 ; GFX9-W64: ; %bb.0: ; %main_body
874 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
875 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
876 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
877 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
878 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
879 ; GFX9-W64-NEXT: s_nop 0
880 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
881 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
882 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
883 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
884 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
885 ; GFX9-W64-NEXT: ; return to shader part epilog
887 ; GFX10-W32-LABEL: test_strict_wqm1:
888 ; GFX10-W32: ; %bb.0: ; %main_body
889 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
890 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
891 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
892 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
893 ; GFX10-W32-NEXT: s_clause 0x1
894 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
895 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
896 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
897 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
898 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
899 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
900 ; GFX10-W32-NEXT: ; return to shader part epilog
902 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
903 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
904 %out = fadd float %src0, %src1
905 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
909 ; Same as above, but with an integer type.
910 define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
911 ; GFX9-W64-LABEL: test_strict_wqm2:
912 ; GFX9-W64: ; %bb.0: ; %main_body
913 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
914 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
915 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
916 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
917 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
918 ; GFX9-W64-NEXT: s_nop 0
919 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
920 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
921 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
922 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
923 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
924 ; GFX9-W64-NEXT: ; return to shader part epilog
926 ; GFX10-W32-LABEL: test_strict_wqm2:
927 ; GFX10-W32: ; %bb.0: ; %main_body
928 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
929 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
930 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
931 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
932 ; GFX10-W32-NEXT: s_clause 0x1
933 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
934 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
935 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
936 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
937 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
938 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
939 ; GFX10-W32-NEXT: ; return to shader part epilog
941 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
942 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
943 %src0.0 = bitcast float %src0 to i32
944 %src1.0 = bitcast float %src1 to i32
945 %out = add i32 %src0.0, %src1.0
946 %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
947 %out.1 = bitcast i32 %out.0 to float
951 ; Check that we don't leave Strict WQM on for computations that don't require it,
952 ; since that will lead to clobbering things that aren't supposed to be clobbered
953 ; in cases like this.
954 ; We enforce this by checking that v_add gets emitted in the same block as the Strict WQM computation.
956 define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
957 ; GFX9-W64-LABEL: test_strict_wqm3:
958 ; GFX9-W64: ; %bb.0: ; %main_body
959 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
960 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
961 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
962 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
963 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
964 ; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2
965 ; GFX9-W64-NEXT: ; %bb.1: ; %if
966 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
967 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
968 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
969 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
970 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
971 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
972 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
973 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
974 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
975 ; GFX9-W64-NEXT: .LBB21_2: ; %endif
976 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
977 ; GFX9-W64-NEXT: ; return to shader part epilog
979 ; GFX10-W32-LABEL: test_strict_wqm3:
980 ; GFX10-W32: ; %bb.0: ; %main_body
981 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
982 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
983 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
984 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
985 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
986 ; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2
987 ; GFX10-W32-NEXT: ; %bb.1: ; %if
988 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
989 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
990 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
991 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
992 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
993 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
994 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
995 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
996 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
997 ; GFX10-W32-NEXT: .LBB21_2: ; %endif
998 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
999 ; GFX10-W32-NEXT: ; return to shader part epilog
1001 ; use mbcnt to make sure the branch is divergent
1002 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1003 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1004 %cc = icmp uge i32 %hi, 16
1005 br i1 %cc, label %endif, label %if
1008 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1009 %out = fadd float %src, %src
1010 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1011 %out.1 = fadd float %src, %out.0
1015 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1019 ; Check that Strict WQM writes aren't coalesced with non-strict writes, since
1020 ; the Strict WQM write could clobber disabled channels in the non-strict one.
1021 ; We enforce this by checking that v_mov gets emitted in the same block as the Strict WQM computation.
1023 define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
1024 ; GFX9-W64-LABEL: test_strict_wqm4:
1025 ; GFX9-W64: ; %bb.0: ; %main_body
1026 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1027 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1028 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
1029 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
1030 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
1031 ; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2
1032 ; GFX9-W64-NEXT: ; %bb.1: ; %if
1033 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
1034 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1035 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
1036 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1037 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1038 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
1039 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
1040 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1041 ; GFX9-W64-NEXT: .LBB22_2: ; %endif
1042 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
1043 ; GFX9-W64-NEXT: ; return to shader part epilog
1045 ; GFX10-W32-LABEL: test_strict_wqm4:
1046 ; GFX10-W32: ; %bb.0: ; %main_body
1047 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1048 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1049 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
1050 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
1051 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
1052 ; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2
1053 ; GFX10-W32-NEXT: ; %bb.1: ; %if
1054 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1055 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1056 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
1057 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1058 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1059 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
1060 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
1061 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1062 ; GFX10-W32-NEXT: .LBB22_2: ; %endif
1063 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
1064 ; GFX10-W32-NEXT: ; return to shader part epilog
1066 ; use mbcnt to make sure the branch is divergent
1067 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1068 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1069 %cc = icmp uge i32 %hi, 16
1070 br i1 %cc, label %endif, label %if
1073 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1074 %out = fadd float %src, %src
1075 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1079 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1083 ; Make sure the transition from Exact to Strict WQM then WQM works properly.
1084 define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
1085 ; GFX9-W64-LABEL: test_strict_wqm5:
1086 ; GFX9-W64: ; %bb.0: ; %main_body
1087 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1088 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
1089 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1090 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
1091 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1092 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1093 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1094 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
1095 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1096 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1097 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
1098 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
1099 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1100 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1101 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
1102 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
1103 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
1104 ; GFX9-W64-NEXT: ; return to shader part epilog
1106 ; GFX10-W32-LABEL: test_strict_wqm5:
1107 ; GFX10-W32: ; %bb.0: ; %main_body
1108 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
1109 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1110 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1111 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1112 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1113 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
1114 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1115 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1116 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1117 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1118 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1119 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1120 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1121 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
1122 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1123 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1124 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1125 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
1126 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
1127 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
1128 ; GFX10-W32-NEXT: ; return to shader part epilog
1130 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1131 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1132 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1133 %temp = fadd float %src1, %src1
1134 %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
1135 %out = fadd float %temp.0, %temp.0
1136 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
1140 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1141 ; if..then..endif version
1146 define amdgpu_ps float @test_strict_wqm6_then() {
1147 ; GFX9-W64-LABEL: test_strict_wqm6_then:
1148 ; GFX9-W64: ; %bb.0: ; %main_body
1149 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1150 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1151 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
1152 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1153 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
1154 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1155 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1156 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
1157 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
1158 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
1159 ; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2
1160 ; GFX9-W64-NEXT: ; %bb.1: ; %if
1161 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1162 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1163 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
1164 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1165 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
1166 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1167 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1168 ; GFX9-W64-NEXT: .LBB24_2: ; %endif
1169 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1170 ; GFX9-W64-NEXT: ; return to shader part epilog
1172 ; GFX10-W32-LABEL: test_strict_wqm6_then:
1173 ; GFX10-W32: ; %bb.0: ; %main_body
1174 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1175 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1176 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
1177 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1178 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1179 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1180 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1181 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
1182 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
1183 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
1184 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
1185 ; GFX10-W32-NEXT: ; %bb.1: ; %if
1186 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1187 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1188 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
1189 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1190 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
1191 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1192 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1193 ; GFX10-W32-NEXT: .LBB24_2: ; %endif
1194 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1195 ; GFX10-W32-NEXT: ; return to shader part epilog
1197 %src0 = load volatile float, ptr addrspace(1) undef
1198 ; use mbcnt to make sure the branch is divergent
1199 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1200 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1201 %cc = icmp uge i32 %hi, 16
1202 br i1 %cc, label %endif, label %if
1205 %src1 = load volatile float, ptr addrspace(1) undef
1206 %out = fadd float %src0, %src1
1207 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1211 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1215 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1221 define amdgpu_ps float @test_strict_wqm6_loop() {
1222 ; GFX9-W64-LABEL: test_strict_wqm6_loop:
1223 ; GFX9-W64: ; %bb.0: ; %main_body
1224 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1225 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1226 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
1227 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1228 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
1229 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1230 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
1231 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
1232 ; GFX9-W64-NEXT: .LBB25_1: ; %loop
1233 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
1234 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1235 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1236 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
1237 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1238 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1239 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
1240 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
1241 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1242 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1243 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
1244 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1245 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1246 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1247 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
1248 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1
1249 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
1250 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1251 ; GFX9-W64-NEXT: ; return to shader part epilog
1253 ; GFX10-W32-LABEL: test_strict_wqm6_loop:
1254 ; GFX10-W32: ; %bb.0: ; %main_body
1255 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1256 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1257 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
1258 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1259 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1260 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1261 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
1262 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
1263 ; GFX10-W32-NEXT: .LBB25_1: ; %loop
1264 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
1265 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1266 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1267 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
1268 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1269 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1270 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
1271 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1272 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1273 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
1274 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1275 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
1276 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1277 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
1278 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
1279 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1
1280 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
1281 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1282 ; GFX10-W32-NEXT: ; return to shader part epilog
1283 main_body:
1284 %src0 = load volatile float, ptr addrspace(1) undef
1285 ; use mbcnt to make sure the branch is divergent
1286 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1287 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1288 br label %loop
1290 loop:
1291 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
1292 %src1 = load volatile float, ptr addrspace(1) undef
1293 %out = fadd float %src0, %src1
1294 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1295 %counter.1 = sub i32 %counter, 1
1296 %cc = icmp ne i32 %counter.1, 0
1297 br i1 %cc, label %loop, label %endloop
1299 endloop:
1300 ret float %out.0
1301 }
1303 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1304 define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1305 ; GFX9-W64-LABEL: test_set_inactive2:
1306 ; GFX9-W64: ; %bb.0: ; %main_body
1307 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1308 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1309 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
1310 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0
1311 ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
1312 ; GFX9-W64-NEXT: s_nop 0
1313 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
1314 ; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
1315 ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
1316 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
1317 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1318 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1
1319 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1320 ; GFX9-W64-NEXT: s_endpgm
1322 ; GFX10-W32-LABEL: test_set_inactive2:
1323 ; GFX10-W32: ; %bb.0: ; %main_body
1324 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1325 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1326 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1
1327 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
1328 ; GFX10-W32-NEXT: s_clause 0x1
1329 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1330 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1331 ; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
1332 ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
1333 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
1334 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1335 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
1336 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1337 ; GFX10-W32-NEXT: s_endpgm
1338 main_body:
1339 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1340 %src1.0 = bitcast float %src1 to i32
1341 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
1342 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1343 %src0.0 = bitcast float %src0 to i32
1344 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
1345 %out = add i32 %src0.1, %src1.1
1346 %out.0 = bitcast i32 %out to float
1347 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1348 ret void
1349 }
1351 ; Check a case of one branch of an if-else requiring WQM, the other requiring
1352 ; exact.
1353 ; Note: In this particular case, the save-and-restore could be avoided if the
1354 ; analysis understood that the two branches of the if-else are mutually
1355 ; exclusive.
1356 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1357 ; GFX9-W64-LABEL: test_control_flow_0:
1358 ; GFX9-W64: ; %bb.0: ; %main_body
1359 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1360 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1361 ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1362 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1363 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1364 ; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2
1365 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
1366 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
1367 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1368 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1369 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
1370 ; GFX9-W64-NEXT: .LBB27_2: ; %Flow
1371 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
1372 ; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4
1373 ; GFX9-W64-NEXT: ; %bb.3: ; %IF
1374 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1375 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1376 ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1377 ; GFX9-W64-NEXT: .LBB27_4: ; %END
1378 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1379 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1380 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1381 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1382 ; GFX9-W64-NEXT: ; return to shader part epilog
1384 ; GFX10-W32-LABEL: test_control_flow_0:
1385 ; GFX10-W32: ; %bb.0: ; %main_body
1386 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1387 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1388 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1389 ; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
1390 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1391 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
1392 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
1393 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
1394 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1395 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1396 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
1397 ; GFX10-W32-NEXT: .LBB27_2: ; %Flow
1398 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
1399 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4
1400 ; GFX10-W32-NEXT: ; %bb.3: ; %IF
1401 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1402 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1403 ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1404 ; GFX10-W32-NEXT: .LBB27_4: ; %END
1405 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1406 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1407 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1408 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1409 ; GFX10-W32-NEXT: ; return to shader part epilog
1410 main_body:
1411 %cmp = icmp eq i32 %z, 0
1412 br i1 %cmp, label %IF, label %ELSE
1414 IF:
1415 %c.bc = bitcast i32 %c to float
1416 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1417 %tex0 = extractelement <4 x float> %tex, i32 0
1418 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1419 %data.if = extractelement <4 x float> %dtex, i32 0
1420 br label %END
1422 ELSE:
1423 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1424 br label %END
1426 END:
1427 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1428 ret float %r
1429 }
1431 ; Reverse branch order compared to the previous test.
1432 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1433 ; GFX9-W64-LABEL: test_control_flow_1:
1434 ; GFX9-W64: ; %bb.0: ; %main_body
1435 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1436 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1437 ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1438 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1439 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1440 ; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2
1441 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
1442 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1443 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1444 ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1445 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1446 ; GFX9-W64-NEXT: .LBB28_2: ; %Flow
1447 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15]
1448 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1449 ; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1]
1450 ; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1]
1451 ; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4
1452 ; GFX9-W64-NEXT: ; %bb.3: ; %ELSE
1453 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1454 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1455 ; GFX9-W64-NEXT: .LBB28_4: ; %END
1456 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1457 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1458 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1459 ; GFX9-W64-NEXT: ; return to shader part epilog
1461 ; GFX10-W32-LABEL: test_control_flow_1:
1462 ; GFX10-W32: ; %bb.0: ; %main_body
1463 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1464 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1465 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1466 ; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
1467 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1468 ; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2
1469 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
1470 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1471 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1472 ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1473 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1474 ; GFX10-W32-NEXT: .LBB28_2: ; %Flow
1475 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13
1476 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1477 ; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0
1478 ; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1479 ; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4
1480 ; GFX10-W32-NEXT: ; %bb.3: ; %ELSE
1481 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1482 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1483 ; GFX10-W32-NEXT: .LBB28_4: ; %END
1484 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1485 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1486 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1487 ; GFX10-W32-NEXT: ; return to shader part epilog
1488 main_body:
1489 %cmp = icmp eq i32 %z, 0
1490 br i1 %cmp, label %ELSE, label %IF
1492 IF:
1493 %c.bc = bitcast i32 %c to float
1494 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1495 %tex0 = extractelement <4 x float> %tex, i32 0
1496 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1497 %data.if = extractelement <4 x float> %dtex, i32 0
1498 br label %END
1500 ELSE:
1501 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1502 br label %END
1504 END:
1505 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1506 ret float %r
1507 }
1509 ; Check that branch conditions are properly marked as needing WQM...
1510 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1511 ; GFX9-W64-LABEL: test_control_flow_2:
1512 ; GFX9-W64: ; %bb.0: ; %main_body
1513 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1514 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1515 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1516 ; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
1517 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1518 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
1519 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1520 ; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
1521 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1522 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
1523 ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
1524 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1525 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1526 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1527 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
1528 ; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5
1529 ; GFX9-W64-NEXT: ; implicit-def: $vgpr5
1530 ; GFX9-W64-NEXT: ; %bb.2: ; %Flow
1531 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
1532 ; GFX9-W64-NEXT: ; %bb.3: ; %IF
1533 ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5
1534 ; GFX9-W64-NEXT: ; %bb.4: ; %END
1535 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1536 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1537 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1538 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1539 ; GFX9-W64-NEXT: ; return to shader part epilog
1541 ; GFX10-W32-LABEL: test_control_flow_2:
1542 ; GFX10-W32: ; %bb.0: ; %main_body
1543 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1544 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1545 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1546 ; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
1547 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1548 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
1549 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1550 ; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1551 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1552 ; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
1553 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1554 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1555 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
1556 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1557 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
1558 ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
1559 ; GFX10-W32-NEXT: ; implicit-def: $vgpr5
1560 ; GFX10-W32-NEXT: ; %bb.2: ; %Flow
1561 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
1562 ; GFX10-W32-NEXT: ; %bb.3: ; %IF
1563 ; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5
1564 ; GFX10-W32-NEXT: ; %bb.4: ; %END
1565 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1566 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1567 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1568 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1569 ; GFX10-W32-NEXT: ; return to shader part epilog
1570 main_body:
1571 %idx.1 = extractelement <3 x i32> %idx, i32 0
1572 %data.1 = extractelement <2 x float> %data, i32 0
1573 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1575 ; The load that determines the branch (and should therefore be WQM) is
1576 ; surrounded by stores that require disabled WQM.
1577 %idx.2 = extractelement <3 x i32> %idx, i32 1
1578 %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0)
1580 %idx.3 = extractelement <3 x i32> %idx, i32 2
1581 %data.3 = extractelement <2 x float> %data, i32 1
1582 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0)
1584 %cc = fcmp ogt float %z, 0.0
1585 br i1 %cc, label %IF, label %ELSE
1587 IF:
1588 %coord.IF = mul i32 %coord, 3
1589 br label %END
1591 ELSE:
1592 %coord.ELSE = mul i32 %coord, 4
1593 br label %END
1595 END:
1596 %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1597 %coord.END.bc = bitcast i32 %coord.END to float
1598 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1599 ret <4 x float> %tex
1600 }
1602 ; ... but only if they really do need it.
1603 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1604 ; GFX9-W64-LABEL: test_control_flow_3:
1605 ; GFX9-W64: ; %bb.0: ; %main_body
1606 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1607 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1608 ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1609 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1610 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1611 ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1612 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1613 ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
1614 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1615 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1616 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
1617 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1618 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3
1619 ; GFX9-W64-NEXT: ; %bb.1: ; %Flow
1620 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
1621 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4
1622 ; GFX9-W64-NEXT: .LBB30_2: ; %END
1623 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1624 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1625 ; GFX9-W64-NEXT: s_branch .LBB30_5
1626 ; GFX9-W64-NEXT: .LBB30_3: ; %ELSE
1627 ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
1628 ; GFX9-W64-NEXT: ; implicit-def: $vgpr1
1629 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
1630 ; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2
1631 ; GFX9-W64-NEXT: .LBB30_4: ; %IF
1632 ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
1633 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1634 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1635 ; GFX9-W64-NEXT: s_branch .LBB30_5
1636 ; GFX9-W64-NEXT: .LBB30_5:
1638 ; GFX10-W32-LABEL: test_control_flow_3:
1639 ; GFX10-W32: ; %bb.0: ; %main_body
1640 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1641 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1642 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1643 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1644 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1645 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1646 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1647 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1648 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1649 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1650 ; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
1651 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
1652 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3
1653 ; GFX10-W32-NEXT: ; %bb.1: ; %Flow
1654 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
1655 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4
1656 ; GFX10-W32-NEXT: .LBB30_2: ; %END
1657 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1658 ; GFX10-W32-NEXT: s_branch .LBB30_5
1659 ; GFX10-W32-NEXT: .LBB30_3: ; %ELSE
1660 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
1661 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1
1662 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
1663 ; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2
1664 ; GFX10-W32-NEXT: .LBB30_4: ; %IF
1665 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
1666 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1667 ; GFX10-W32-NEXT: s_branch .LBB30_5
1668 ; GFX10-W32-NEXT: .LBB30_5:
1669 main_body:
1670 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1671 %tex0 = extractelement <4 x float> %tex, i32 0
1672 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1673 %dtex.1 = extractelement <4 x float> %dtex, i32 0
1674 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1676 %cc = fcmp ogt float %dtex.1, 0.0
1677 br i1 %cc, label %IF, label %ELSE
1679 IF:
1680 %tex.IF = fmul float %dtex.1, 3.0
1681 br label %END
1683 ELSE:
1684 %tex.ELSE = fmul float %dtex.1, 4.0
1685 br label %END
1687 END:
1688 %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1689 ret float %tex.END
1690 }
1692 ; Another test that failed at some point because of terminator handling.
1693 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1694 ; GFX9-W64-LABEL: test_control_flow_4:
1695 ; GFX9-W64: ; %bb.0: ; %main_body
1696 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1697 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1698 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1699 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1700 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2
1701 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
1702 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
1703 ; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0
1704 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1
1705 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1706 ; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
1707 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
1708 ; GFX9-W64-NEXT: .LBB31_2: ; %END
1709 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1710 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1711 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1712 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1713 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1714 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1715 ; GFX9-W64-NEXT: ; return to shader part epilog
1717 ; GFX10-W32-LABEL: test_control_flow_4:
1718 ; GFX10-W32: ; %bb.0: ; %main_body
1719 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1720 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1721 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1722 ; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
1723 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2
1724 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
1725 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
1726 ; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0
1727 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1
1728 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1729 ; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
1730 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
1731 ; GFX10-W32-NEXT: .LBB31_2: ; %END
1732 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1733 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1734 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1735 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1736 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1737 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1738 ; GFX10-W32-NEXT: ; return to shader part epilog
1739 main_body:
1740 %cond = icmp eq i32 %y, 0
1741 br i1 %cond, label %IF, label %END
1743 IF:
1744 %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1745 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
1746 br label %END
1748 END:
1749 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1750 %tex0 = extractelement <4 x float> %tex, i32 0
1751 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1752 ret <4 x float> %dtex
1753 }
1755 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
1756 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1757 ; GFX9-W64-LABEL: test_kill_0:
1758 ; GFX9-W64: ; %bb.0: ; %main_body
1759 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1760 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1761 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1762 ; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1763 ; GFX9-W64-NEXT: s_nop 0
1764 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1765 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1766 ; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6
1767 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
1768 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB32_2
1769 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
1770 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
1771 ; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1772 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1773 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1774 ; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1775 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1776 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11
1777 ; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
1778 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12
1779 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13
1780 ; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14
1781 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1782 ; GFX9-W64-NEXT: s_branch .LBB32_3
1783 ; GFX9-W64-NEXT: .LBB32_2:
1784 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
1785 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
1786 ; GFX9-W64-NEXT: s_endpgm
1787 ; GFX9-W64-NEXT: .LBB32_3:
1789 ; GFX10-W32-LABEL: test_kill_0:
1790 ; GFX10-W32: ; %bb.0: ; %main_body
1791 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1792 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1793 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1794 ; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1795 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1796 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1797 ; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1798 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
1799 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB32_2
1800 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
1801 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
1802 ; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1803 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1804 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1805 ; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1806 ; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
1807 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1808 ; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12
1809 ; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14
1810 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11
1811 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13
1812 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4
1813 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5
1814 ; GFX10-W32-NEXT: s_branch .LBB32_3
1815 ; GFX10-W32-NEXT: .LBB32_2:
1816 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
1817 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
1818 ; GFX10-W32-NEXT: s_endpgm
1819 ; GFX10-W32-NEXT: .LBB32_3:
1820 main_body:
1821 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1822 %idx.0 = extractelement <2 x i32> %idx, i32 0
1823 %data.0 = extractelement <2 x float> %data, i32 0
1824 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0)
1826 %z.cmp = fcmp olt float %z, 0.0
1827 call void @llvm.amdgcn.kill(i1 %z.cmp)
1829 %idx.1 = extractelement <2 x i32> %idx, i32 1
1830 %data.1 = extractelement <2 x float> %data, i32 1
1831 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1832 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1833 %tex2.0 = extractelement <4 x float> %tex2, i32 0
1834 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1835 %out = fadd <4 x float> %tex, %dtex
1837 ret <4 x float> %out
1838 }
1840 ; ... but only if WQM is necessary.
1841 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1842 ; GFX9-W64-LABEL: test_kill_1:
1843 ; GFX9-W64: ; %bb.0: ; %main_body
1844 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1845 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1846 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
1847 ; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1848 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
1849 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1850 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1851 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1852 ; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4
1853 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
1854 ; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0
1855 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB33_2
1856 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
1857 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
1858 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1859 ; GFX9-W64-NEXT: s_branch .LBB33_3
1860 ; GFX9-W64-NEXT: .LBB33_2:
1861 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
1862 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
1863 ; GFX9-W64-NEXT: s_endpgm
1864 ; GFX9-W64-NEXT: .LBB33_3:
1866 ; GFX10-W32-LABEL: test_kill_1:
1867 ; GFX10-W32: ; %bb.0: ; %main_body
1868 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1869 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1870 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
1871 ; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1872 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
1873 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1874 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1875 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1876 ; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1877 ; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0
1878 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
1879 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB33_2
1880 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
1881 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
1882 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1883 ; GFX10-W32-NEXT: s_branch .LBB33_3
1884 ; GFX10-W32-NEXT: .LBB33_2:
1885 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
1886 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
1887 ; GFX10-W32-NEXT: s_endpgm
1888 ; GFX10-W32-NEXT: .LBB33_3:
1889 main_body:
1890 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1891 %tex0 = extractelement <4 x float> %tex, i32 0
1892 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1894 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1896 %z.cmp = fcmp olt float %z, 0.0
1897 call void @llvm.amdgcn.kill(i1 %z.cmp)
1899 ret <4 x float> %dtex
1900 }
1902 ; Check prolog shaders.
1903 define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1904 ; GFX9-W64-LABEL: test_prolog_1:
1905 ; GFX9-W64: ; %bb.0: ; %main_body
1906 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1907 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1908 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
1909 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1910 ; GFX9-W64-NEXT: ; return to shader part epilog
1912 ; GFX10-W32-LABEL: test_prolog_1:
1913 ; GFX10-W32: ; %bb.0: ; %main_body
1914 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1915 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1916 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
1917 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1918 ; GFX10-W32-NEXT: ; return to shader part epilog
1919 main_body:
1920 %s = fadd float %a, %b
1921 ret float %s
1922 }
1924 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1925 ; GFX9-W64-LABEL: test_loop_vcc:
1926 ; GFX9-W64: ; %bb.0: ; %entry
1927 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1928 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1929 ; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3
1930 ; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2
1931 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1
1932 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
1933 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1934 ; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1935 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1936 ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0
1937 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000
1938 ; GFX9-W64-NEXT: s_branch .LBB35_2
1939 ; GFX9-W64-NEXT: .LBB35_1: ; %body
1940 ; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1
1941 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1942 ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8
1943 ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4
1944 ; GFX9-W64-NEXT: .LBB35_2: ; %loop
1945 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
1946 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1947 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
1948 ; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
1949 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
1950 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
1951 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
1952 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB35_1
1953 ; GFX9-W64-NEXT: ; %bb.3:
1954 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1955 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8
1956 ; GFX9-W64-NEXT: .LBB35_4: ; %break
1957 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1958 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1959 ; GFX9-W64-NEXT: ; return to shader part epilog
1961 ; GFX10-W32-LABEL: test_loop_vcc:
1962 ; GFX10-W32: ; %bb.0: ; %entry
1963 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1964 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1965 ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0
1966 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1967 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1968 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1969 ; GFX10-W32-NEXT: s_branch .LBB35_2
1970 ; GFX10-W32-NEXT: .p2align 6
1971 ; GFX10-W32-NEXT: .LBB35_1: ; %body
1972 ; GFX10-W32-NEXT: ; in Loop: Header=BB35_2 Depth=1
1973 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1974 ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
1975 ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_4
1976 ; GFX10-W32-NEXT: .LBB35_2: ; %loop
1977 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
1978 ; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1979 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1980 ; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3
1981 ; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2
1982 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1
1983 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
1984 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB35_1
1985 ; GFX10-W32-NEXT: ; %bb.3:
1986 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1987 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8
1988 ; GFX10-W32-NEXT: .LBB35_4: ; %break
1989 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1990 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1991 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4
1992 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5
1993 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6
1994 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7
1995 ; GFX10-W32-NEXT: ; return to shader part epilog
1996 entry:
1997 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
1998 br label %loop
2000 loop:
2001 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
2002 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
2003 %cc = fcmp ogt float %ctr.iv, 7.0
2004 br i1 %cc, label %break, label %body
2006 body:
2007 %c.iv0 = extractelement <4 x float> %c.iv, i32 0
2008 %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2009 %ctr.next = fadd float %ctr.iv, 2.0
2010 br label %loop
2012 break:
2013 ret <4 x float> %c.iv
2014 }
2016 ; Only intrinsic stores need exact execution -- other stores do not have
2017 ; externally visible effects and may require WQM for correctness.
2018 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
2019 ; GFX9-W64-LABEL: test_alloca:
2020 ; GFX9-W64: ; %bb.0: ; %entry
2021 ; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2022 ; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2023 ; GFX9-W64-NEXT: s_mov_b32 s10, -1
2024 ; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000
2025 ; GFX9-W64-NEXT: s_add_u32 s8, s8, s0
2026 ; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0
2027 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2028 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2029 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2030 ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0
2031 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2032 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0
2033 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2035 ; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1
2036 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
2037 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2038 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2039 ; GFX9-W64-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf
2040 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1
2041 ; GFX9-W64-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
2042 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
2043 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2044 ; GFX9-W64-NEXT: s_endpgm
2046 ; GFX10-W32-LABEL: test_alloca:
2047 ; GFX10-W32: ; %bb.0: ; %entry
2048 ; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2049 ; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2050 ; GFX10-W32-NEXT: s_mov_b32 s10, -1
2051 ; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000
2052 ; GFX10-W32-NEXT: s_add_u32 s8, s8, s0
2053 ; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0
2054 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2055 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2056 ; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 0
2057 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2058 ; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0
2059 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2060 ; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0
2061 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
2062 ; GFX10-W32-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2063 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2064 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2065 ; GFX10-W32-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2066 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1
2067 ; GFX10-W32-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
2068 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2069 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2070 ; GFX10-W32-NEXT: s_endpgm
2071 entry:
2072 %array = alloca [32 x i32], align 4, addrspace(5)
2074 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2076 store volatile i32 %a, ptr addrspace(5) %array, align 4
2078 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
2080 %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
2081 %c = load i32, ptr addrspace(5) %c.gep, align 4
2082 %c.bc = bitcast i32 %c to float
2083 %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2084 call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2086 ret void
2087 }
2089 ; Must return to exact at the end of a non-void returning shader,
2090 ; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2091 ; even if the shader has no kills, because a kill could have happened in a
2092 ; previous shader fragment.
2093 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2094 ; GFX9-W64-LABEL: test_nonvoid_return:
2095 ; GFX9-W64: ; %bb.0:
2096 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2097 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2098 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2099 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2100 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2101 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2102 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX9-W64-NEXT: ; return to shader part epilog
2105 ; GFX10-W32-LABEL: test_nonvoid_return:
2106 ; GFX10-W32: ; %bb.0:
2107 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2108 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2109 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2110 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2111 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2112 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2113 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2114 ; GFX10-W32-NEXT: ; return to shader part epilog
2115 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2116 %tex0 = extractelement <4 x float> %tex, i32 0
2117 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2118 ret <4 x float> %dtex
2119 }
2121 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2122 ; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2123 ; GFX9-W64: ; %bb.0: ; %entry
2124 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2125 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2126 ; GFX9-W64-NEXT: s_and_b64 exec, exec, exec
2127 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2128 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2129 ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
2130 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB38_2
2131 ; GFX9-W64-NEXT: ; %bb.1: ; %else
2132 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2133 ; GFX9-W64-NEXT: s_branch .LBB38_3
2134 ; GFX9-W64-NEXT: .LBB38_2: ; %if
2135 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2136 ; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
2137 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2138 ; GFX9-W64-NEXT: .LBB38_3:
2140 ; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2141 ; GFX10-W32: ; %bb.0: ; %entry
2142 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2143 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2144 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo
2145 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2146 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2147 ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
2148 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB38_2
2149 ; GFX10-W32-NEXT: ; %bb.1: ; %else
2150 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2151 ; GFX10-W32-NEXT: s_branch .LBB38_3
2152 ; GFX10-W32-NEXT: .LBB38_2: ; %if
2153 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2154 ; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
2155 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
2156 ; GFX10-W32-NEXT: .LBB38_3:
2157 entry:
2158 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2159 %tex0 = extractelement <4 x float> %tex, i32 0
2160 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2161 %cc = icmp sgt i32 %c, 0
2162 br i1 %cc, label %if, label %else
2164 if:
2165 store volatile <4 x float> %dtex, ptr addrspace(1) undef
2166 unreachable
2168 else:
2169 ret <4 x float> %dtex
2170 }
2172 ; Test awareness that s_wqm_b64 clobbers SCC.
2173 define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2174 ; GFX9-W64-LABEL: test_scc:
2175 ; GFX9-W64: ; %bb.0: ; %main_body
2176 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2177 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2178 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
2179 ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
2180 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2
2181 ; GFX9-W64-NEXT: ; %bb.1: ; %else
2182 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2183 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1
2184 ; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
2185 ; GFX9-W64-NEXT: s_cbranch_execz .LBB39_3
2186 ; GFX9-W64-NEXT: s_branch .LBB39_4
2187 ; GFX9-W64-NEXT: .LBB39_2:
2188 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2189 ; GFX9-W64-NEXT: .LBB39_3: ; %if
2190 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2191 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2192 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2193 ; GFX9-W64-NEXT: .LBB39_4: ; %end
2194 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
2195 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0
2196 ; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
2197 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2198 ; GFX9-W64-NEXT: ; return to shader part epilog
2200 ; GFX10-W32-LABEL: test_scc:
2201 ; GFX10-W32: ; %bb.0: ; %main_body
2202 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
2203 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2204 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
2205 ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
2206 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
2207 ; GFX10-W32-NEXT: ; %bb.1: ; %else
2208 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1
2209 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2210 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2211 ; GFX10-W32-NEXT: s_cbranch_execz .LBB39_3
2212 ; GFX10-W32-NEXT: s_branch .LBB39_4
2213 ; GFX10-W32-NEXT: .LBB39_2:
2214 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2215 ; GFX10-W32-NEXT: .LBB39_3: ; %if
2216 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2217 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2218 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2219 ; GFX10-W32-NEXT: .LBB39_4: ; %end
2220 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
2221 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0
2222 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
2223 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2224 ; GFX10-W32-NEXT: ; return to shader part epilog
2225 main_body:
2226 %cc = icmp sgt i32 %sel, 0
2227 br i1 %cc, label %if, label %else
2229 if:
2230 %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2231 br label %end
2233 else:
2234 %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2235 br label %end
2237 end:
2238 %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
2239 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2240 ret <4 x float> %r
2241 }
2243 ; Check a case of a block being entirely WQM except for a bit of WWM.
2244 ; There was a bug where it forgot to enter and leave WWM.
2245 define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2246 ; GFX9-W64-LABEL: test_wwm_within_wqm:
2247 ; GFX9-W64: ; %bb.0: ; %main_body
2248 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2249 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2250 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2251 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2252 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2253 ; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2
2254 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2255 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2256 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2257 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2258 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2259 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
2260 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2261 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2262 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2263 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2264 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2265 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2266 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
2267 ; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF
2268 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2269 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2270 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2271 ; GFX9-W64-NEXT: ; return to shader part epilog
2273 ; GFX10-W32-LABEL: test_wwm_within_wqm:
2274 ; GFX10-W32: ; %bb.0: ; %main_body
2275 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2276 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2277 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2278 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
2279 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
2280 ; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2
2281 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2282 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2283 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2284 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2285 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2286 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
2287 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2288 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
2289 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2290 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2291 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2292 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2293 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
2294 ; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF
2295 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2296 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2297 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2298 ; GFX10-W32-NEXT: ; return to shader part epilog
2299 main_body:
2300 %cmp = icmp eq i32 %z, 0
2301 br i1 %cmp, label %IF, label %ENDIF
2303 IF:
2304 %c.bc = bitcast i32 %c to float
2305 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2306 %tex0 = extractelement <4 x float> %tex, i32 0
2307 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2308 %dataf = extractelement <4 x float> %dtex, i32 0
2309 %data1 = fptosi float %dataf to i32
2310 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2311 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2312 %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2313 %data4f = sitofp i32 %data4 to float
2314 br label %ENDIF
2316 ENDIF:
2317 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2318 ret float %r
2319 }
2321 ; Check that WWM is triggered by the strict_wwm intrinsic.
2322 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2323 ; GFX9-W64-LABEL: test_strict_wwm1:
2324 ; GFX9-W64: ; %bb.0: ; %main_body
2325 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2326 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2327 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
2328 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2329 ; GFX9-W64-NEXT: s_nop 0
2330 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2331 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2332 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
2333 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2334 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2335 ; GFX9-W64-NEXT: ; return to shader part epilog
2337 ; GFX10-W32-LABEL: test_strict_wwm1:
2338 ; GFX10-W32: ; %bb.0: ; %main_body
2339 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2340 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2341 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
2342 ; GFX10-W32-NEXT: s_clause 0x1
2343 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2344 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2345 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2346 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
2347 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2348 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2349 ; GFX10-W32-NEXT: ; return to shader part epilog
2350 main_body:
2351 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2352 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2353 %out = fadd float %src0, %src1
2354 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2355 ret float %out.0
2356 }
2358 ; Same as above, but with an integer type.
2359 define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2360 ; GFX9-W64-LABEL: test_strict_wwm2:
2361 ; GFX9-W64: ; %bb.0: ; %main_body
2362 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2363 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2364 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
2365 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2366 ; GFX9-W64-NEXT: s_nop 0
2367 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2368 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2369 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
2370 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2371 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2372 ; GFX9-W64-NEXT: ; return to shader part epilog
2374 ; GFX10-W32-LABEL: test_strict_wwm2:
2375 ; GFX10-W32: ; %bb.0: ; %main_body
2376 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2377 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2378 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
2379 ; GFX10-W32-NEXT: s_clause 0x1
2380 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2381 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2382 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2383 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
2384 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2385 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2386 ; GFX10-W32-NEXT: ; return to shader part epilog
2387 main_body:
2388 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2389 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2390 %src0.0 = bitcast float %src0 to i32
2391 %src1.0 = bitcast float %src1 to i32
2392 %out = add i32 %src0.0, %src1.0
2393 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2394 %out.1 = bitcast i32 %out.0 to float
2395 ret float %out.1
2396 }
2398 ; Check that we don't leave WWM on for computations that don't require WWM,
2399 ; since that will lead to clobbering things that aren't supposed to be clobbered
2400 ; in cases like this.
2401 ; We enforce this by checking that v_add gets emitted in the same block as
2402 ; the WWM computation.
2403 define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2404 ; GFX9-W64-LABEL: test_strict_wwm3:
2405 ; GFX9-W64: ; %bb.0: ; %main_body
2406 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2407 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2408 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2409 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2410 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
2411 ; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2
2412 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2413 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2414 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2415 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2416 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2417 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
2418 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2419 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2420 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
2421 ; GFX9-W64-NEXT: .LBB43_2: ; %endif
2422 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
2423 ; GFX9-W64-NEXT: ; return to shader part epilog
2425 ; GFX10-W32-LABEL: test_strict_wwm3:
2426 ; GFX10-W32: ; %bb.0: ; %main_body
2427 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2428 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2429 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2430 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2431 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
2432 ; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2
2433 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2434 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2435 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2436 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2437 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2438 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
2439 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2440 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2441 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
2442 ; GFX10-W32-NEXT: .LBB43_2: ; %endif
2443 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
2444 ; GFX10-W32-NEXT: ; return to shader part epilog
2445 main_body:
2446 ; use mbcnt to make sure the branch is divergent
2447 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2448 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2449 %cc = icmp uge i32 %hi, 16
2450 br i1 %cc, label %endif, label %if
2452 if:
2453 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2454 %out = fadd float %src, %src
2455 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2456 %out.1 = fadd float %src, %out.0
2457 br label %endif
2459 endif:
2460 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2461 ret float %out.2
2462 }
2464 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2465 ; write could clobber disabled channels in the non-WWM one.
2466 ; We enforce this by checking that v_mov gets emitted in the same block as
2467 ; the WWM write.
2468 define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2469 ; GFX9-W64-LABEL: test_strict_wwm4:
2470 ; GFX9-W64: ; %bb.0: ; %main_body
2471 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2472 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2473 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2474 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2475 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
2476 ; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2
2477 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2478 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2479 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2480 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2481 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2482 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
2483 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2484 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2485 ; GFX9-W64-NEXT: .LBB44_2: ; %endif
2486 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
2487 ; GFX9-W64-NEXT: ; return to shader part epilog
2489 ; GFX10-W32-LABEL: test_strict_wwm4:
2490 ; GFX10-W32: ; %bb.0: ; %main_body
2491 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2492 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2493 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2494 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2495 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
2496 ; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2
2497 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2498 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2499 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2500 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2501 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2502 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
2503 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2504 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2505 ; GFX10-W32-NEXT: .LBB44_2: ; %endif
2506 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
2507 ; GFX10-W32-NEXT: ; return to shader part epilog
2508 main_body:
2509 ; use mbcnt to make sure the branch is divergent
2510 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2511 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2512 %cc = icmp uge i32 %hi, 16
2513 br i1 %cc, label %endif, label %if
2515 if:
2516 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2517 %out = fadd float %src, %src
2518 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2519 br label %endif
2521 endif:
2522 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2523 ret float %out.1
2524 }
2526 ; Make sure the transition from Exact to WWM and then to WQM works properly.
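; The checks below expect the buffer load/store to run in Exact mode, the strict.wwm
; add to be wrapped in an s_or_saveexec ..., -1 / exec-restore pair, and the wqm'd add
; to run only after s_wqm re-enables the helper lanes; exec is ANDed back to the live
; mask before returning.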
2527 define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2528 ; GFX9-W64-LABEL: test_strict_wwm5:
2529 ; GFX9-W64: ; %bb.0: ; %main_body
2530 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2531 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
2532 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
2533 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2534 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
2535 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2536 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
2537 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2538 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2539 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
2540 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2541 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2542 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2543 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
2544 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2545 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
2546 ; GFX9-W64-NEXT: ; return to shader part epilog
2548 ; GFX10-W32-LABEL: test_strict_wwm5:
2549 ; GFX10-W32: ; %bb.0: ; %main_body
2550 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
2551 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
2552 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
2553 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2554 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
2555 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2556 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
2558 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2559 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2560 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2561 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
2562 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2563 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2564 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2565 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
2566 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2567 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
2568 ; GFX10-W32-NEXT: ; return to shader part epilog
2569 main_body:
2570 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2571 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2572 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2573 %temp = fadd float %src1, %src1
2574 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
2575 %out = fadd float %temp.0, %temp.0
2576 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2577 ret float %out.0
2578 }
2580 ; Check that WWM is turned on correctly across basic block boundaries.
2581 ; if..then..endif version
2582 ;SI-CHECK: buffer_load_dword
2583 ;VI-CHECK: flat_load_dword
2584 ;SI-CHECK: buffer_load_dword
2585 ;VI-CHECK: flat_load_dword
2586 define amdgpu_ps float @test_strict_wwm6_then() {
2587 ; GFX9-W64-LABEL: test_strict_wwm6_then:
2588 ; GFX9-W64: ; %bb.0: ; %main_body
2589 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2590 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
2591 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2592 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2593 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2594 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2595 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2596 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2597 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
2598 ; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
2599 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2600 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2601 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
2602 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2603 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
2604 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2605 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2606 ; GFX9-W64-NEXT: .LBB46_2: ; %endif
2607 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
2608 ; GFX9-W64-NEXT: ; return to shader part epilog
2610 ; GFX10-W32-LABEL: test_strict_wwm6_then:
2611 ; GFX10-W32: ; %bb.0: ; %main_body
2612 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2613 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
2614 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2615 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2616 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2617 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2618 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2619 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2620 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
2621 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
2622 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2623 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2624 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
2625 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2626 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
2627 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2628 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2629 ; GFX10-W32-NEXT: .LBB46_2: ; %endif
2630 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
2631 ; GFX10-W32-NEXT: ; return to shader part epilog
2632 main_body:
2633 %src0 = load volatile float, ptr addrspace(1) undef
2634 ; use mbcnt to make sure the branch is divergent
2635 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2636 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2637 %cc = icmp uge i32 %hi, 16
2638 br i1 %cc, label %endif, label %if
2640 if:
2641 %src1 = load volatile float, ptr addrspace(1) undef
2642 %out = fadd float %src0, %src1
2643 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2644 br label %endif
2646 endif:
2647 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2648 ret float %out.1
2649 }
2651 ; Check that WWM is turned on correctly across basic block boundaries.
2652 ; loop version
2653 define amdgpu_ps float @test_strict_wwm6_loop() {
2654 ; GFX9-W64-LABEL: test_strict_wwm6_loop:
2655 ; GFX9-W64: ; %bb.0: ; %main_body
2656 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2657 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
2658 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2659 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2660 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2661 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
2662 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
2663 ; GFX9-W64-NEXT: .LBB47_1: ; %loop
2664 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
2665 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2666 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
2667 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2668 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2669 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
2670 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
2671 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2672 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
2673 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2674 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2675 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2676 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
2677 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1
2678 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
2679 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
2680 ; GFX9-W64-NEXT: ; return to shader part epilog
2682 ; GFX10-W32-LABEL: test_strict_wwm6_loop:
2683 ; GFX10-W32: ; %bb.0: ; %main_body
2684 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2685 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
2686 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2687 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2688 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2689 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
2690 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
2691 ; GFX10-W32-NEXT: .LBB47_1: ; %loop
2692 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
2693 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2694 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
2695 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2696 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2697 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
2698 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2699 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
2700 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2701 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
2702 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2703 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
2704 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
2705 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1
2706 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
2707 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
2708 ; GFX10-W32-NEXT: ; return to shader part epilog
2709 main_body:
2710 %src0 = load volatile float, ptr addrspace(1) undef
2711 ; use mbcnt to make sure the branch is divergent
2712 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2713 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2714 br label %loop
2716 loop:
2717 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
2718 %src1 = load volatile float, ptr addrspace(1) undef
2719 %out = fadd float %src0, %src1
2720 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2721 %counter.1 = sub i32 %counter, 1
2722 %cc = icmp ne i32 %counter.1, 0
2723 br i1 %cc, label %loop, label %endloop
2725 endloop:
2726 ret float %out.0
2727 }
2729 ; Check that @llvm.amdgcn.set.inactive disables WWM.
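; llvm.amdgcn.set.inactive copies its first operand into active lanes and its second
; operand (0 here) into lanes that are inactive in the original exec mask, so the
; checks below expect a v_cndmask keyed on the saved exec inside the WWM region.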
2730 define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2731 ; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2732 ; GFX9-W64: ; %bb.0: ; %main_body
2733 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2734 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
2735 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2736 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2737 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
2738 ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
2739 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2740 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2741 ; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
2742 ; GFX9-W64-NEXT: s_endpgm
2744 ; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2745 ; GFX10-W32: ; %bb.0: ; %main_body
2746 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2747 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
2748 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2749 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2750 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
2751 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
2752 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2753 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2754 ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
2755 ; GFX10-W32-NEXT: s_endpgm
2756 main_body:
2757 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2758 %src.0 = bitcast float %src to i32
2759 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
2760 %out = add i32 %src.1, %src.1
2761 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2762 %out.1 = bitcast i32 %out.0 to float
2763 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2764 ret void
2765 }
2767 ; Check a case of a block being entirely WQM except for a bit of WWM.
2768 ; There was a bug where it forgot to enter and leave WWM.
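; The ds_swizzle operates on a set.inactive value and its result is wrapped in
; strict.wwm, so the expected code enters whole wave mode with s_or_saveexec ..., -1
; just around the cndmask/swizzle pair and restores exec immediately afterwards,
; while the rest of the block stays in WQM.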
2769 define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2770 ; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2771 ; GFX9-W64: ; %bb.0: ; %main_body
2772 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2773 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2774 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2775 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2776 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2777 ; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2
2778 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2779 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2780 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2781 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2782 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2783 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
2784 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2785 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2786 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2787 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2788 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2789 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2790 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
2791 ; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF
2792 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2793 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2794 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2795 ; GFX9-W64-NEXT: ; return to shader part epilog
2797 ; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2798 ; GFX10-W32: ; %bb.0: ; %main_body
2799 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2800 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2801 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2802 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
2803 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
2804 ; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2
2805 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2806 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2807 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2808 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2809 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2810 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
2811 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2812 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
2813 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2814 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2815 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2816 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2817 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
2818 ; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF
2819 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2820 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2821 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2822 ; GFX10-W32-NEXT: ; return to shader part epilog
2823 main_body:
2824 %cmp = icmp eq i32 %z, 0
2825 br i1 %cmp, label %IF, label %ENDIF
2827 IF:
2828 %c.bc = bitcast i32 %c to float
2829 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2830 %tex0 = extractelement <4 x float> %tex, i32 0
2831 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2832 %dataf = extractelement <4 x float> %dtex, i32 0
2833 %data1 = fptosi float %dataf to i32
2834 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2835 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2836 %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
2837 %data4f = sitofp i32 %data4 to float
2838 br label %ENDIF
2840 ENDIF:
2841 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2842 ret float %r
2843 }
2845 ; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
2846 define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2847 ; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2848 ; GFX9-W64: ; %bb.0: ; %main_body
2849 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2850 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2851 ; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2852 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2853 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2854 ; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
2855 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2856 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2857 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2858 ; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
2859 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2860 ; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
2861 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2862 ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2863 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2864 ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2865 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2866 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
2867 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2868 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
2869 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2870 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2871 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
2872 ; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF
2873 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2874 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2875 ; GFX9-W64-NEXT: ; return to shader part epilog
2877 ; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2878 ; GFX10-W32: ; %bb.0: ; %main_body
2879 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2880 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2881 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2882 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2883 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2884 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
2885 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2886 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2887 ; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
2888 ; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
2889 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2890 ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
2891 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2892 ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2893 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2894 ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2895 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2896 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
2897 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2898 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
2899 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2900 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2901 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
2902 ; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF
2903 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2904 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2905 ; GFX10-W32-NEXT: ; return to shader part epilog
2906 main_body:
2907 %cmp = icmp eq i32 %z, 0
2908 br i1 %cmp, label %IF, label %ENDIF
2910 IF:
2911 %c.bc = bitcast i32 %c to float
2912 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2913 %tex0 = extractelement <4 x float> %tex, i32 0
2914 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2915 %dataf = extractelement <4 x float> %dtex, i32 0
2916 %data1 = fptosi float %dataf to i32
2917 %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
2918 %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
2919 %data3f = sitofp i32 %data3 to float
2920 br label %ENDIF
2922 ENDIF:
2923 %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
2924 ret float %r
2925 }
2927 ; WQM -> StrictWQM transition must be preserved because kill breaks the WQM mask
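; After the kill the set of live lanes shrinks, so the WQM mask computed on entry is
; stale; the strict WQM region below is expected to recompute its mask with s_wqm
; from the post-kill exec instead of reusing the earlier one.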
2928 define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data, i32 %wqm_data) {
2929 ; GFX9-W64-LABEL: test_strict_wqm_within_wqm_with_kill:
2930 ; GFX9-W64: ; %bb.0: ; %main_body
2931 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2932 ; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2933 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2934 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
2935 ; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
2936 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2937 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2938 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2939 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2940 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2941 ; GFX9-W64-NEXT: s_andn2_b64 s[0:1], exec, vcc
2942 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[0:1]
2943 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB51_2
2944 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
2945 ; GFX9-W64-NEXT: s_and_b64 exec, exec, vcc
2946 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2947 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2948 ; GFX9-W64-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2949 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2950 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2951 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v3
2952 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v1
2953 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2954 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
2955 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2956 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2957 ; GFX9-W64-NEXT: s_branch .LBB51_3
2958 ; GFX9-W64-NEXT: .LBB51_2:
2959 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
2960 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
2961 ; GFX9-W64-NEXT: s_endpgm
2962 ; GFX9-W64-NEXT: .LBB51_3:
2964 ; GFX10-W32-LABEL: test_strict_wqm_within_wqm_with_kill:
2965 ; GFX10-W32: ; %bb.0: ; %main_body
2966 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2967 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2968 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2969 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
2970 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
2971 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2972 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2973 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2974 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2975 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2976 ; GFX10-W32-NEXT: s_andn2_b32 s0, exec_lo, vcc_lo
2977 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, s0
2978 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB51_2
2979 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
2980 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
2981 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2982 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2983 ; GFX10-W32-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2984 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2985 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2986 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v3
2987 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v1
2988 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2989 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
2990 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2991 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2992 ; GFX10-W32-NEXT: s_branch .LBB51_3
2993 ; GFX10-W32-NEXT: .LBB51_2:
2994 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
2995 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
2996 ; GFX10-W32-NEXT: s_endpgm
2997 ; GFX10-W32-NEXT: .LBB51_3:
2998 main_body:
2999 %c.bc = bitcast i32 %c to float
3000 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3001 %tex0 = extractelement <4 x float> %tex, i32 0
3002 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3003 %cmp = icmp eq i32 %z, 0
3004 call void @llvm.amdgcn.kill(i1 %cmp)
3005 %dataf = extractelement <4 x float> %dtex, i32 0
3006 %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %wqm_data, i32 2079)
3007 %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
3008 %data3f = sitofp i32 %data3 to float
3009 %result.f = fadd float %dataf, %data3f
3010 %result.i = bitcast float %result.f to i32
3011 %result.wqm = call i32 @llvm.amdgcn.wqm.i32(i32 %result.i)
3012 %result = bitcast i32 %result.wqm to float
3013 ret float %result
3014 }
3016 ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
3017 define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
3018 ; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
3019 ; GFX9-W64: ; %bb.0: ; %main_body
3020 ; GFX9-W64-NEXT: s_mov_b64 s[28:29], exec
3021 ; GFX9-W64-NEXT: s_mov_b32 s19, s17
3022 ; GFX9-W64-NEXT: s_mov_b64 s[30:31], exec
3023 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3024 ; GFX9-W64-NEXT: s_mov_b32 s23, s5
3025 ; GFX9-W64-NEXT: s_mov_b32 s22, s4
3026 ; GFX9-W64-NEXT: s_mov_b32 s21, s3
3027 ; GFX9-W64-NEXT: s_mov_b32 s20, s2
3028 ; GFX9-W64-NEXT: s_mov_b32 s27, s9
3029 ; GFX9-W64-NEXT: s_mov_b32 s26, s8
3030 ; GFX9-W64-NEXT: s_mov_b32 s25, s7
3031 ; GFX9-W64-NEXT: s_mov_b32 s24, s6
3032 ; GFX9-W64-NEXT: s_mov_b32 s18, s16
3033 ; GFX9-W64-NEXT: s_mov_b32 s17, s15
3034 ; GFX9-W64-NEXT: s_mov_b32 s16, s14
3035 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3036 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3037 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3038 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3039 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
3040 ; GFX9-W64-NEXT: s_mov_b64 exec, s[30:31]
3041 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3042 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
3043 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3044 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
3045 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3046 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
3047 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, s0
3048 ; GFX9-W64-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
3049 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3050 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3051 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3052 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3053 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
3054 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3055 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3056 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
3057 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3058 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3059 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
3060 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
3061 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[28:29]
3062 ; GFX9-W64-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
3063 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3064 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3065 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
3066 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3067 ; GFX9-W64-NEXT: ; return to shader part epilog
3069 ; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
3070 ; GFX10-W32: ; %bb.0: ; %main_body
3071 ; GFX10-W32-NEXT: s_mov_b32 s28, exec_lo
3072 ; GFX10-W32-NEXT: s_mov_b32 s19, s17
3073 ; GFX10-W32-NEXT: s_mov_b32 s29, exec_lo
3074 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3075 ; GFX10-W32-NEXT: s_mov_b32 s23, s5
3076 ; GFX10-W32-NEXT: s_mov_b32 s22, s4
3077 ; GFX10-W32-NEXT: s_mov_b32 s21, s3
3078 ; GFX10-W32-NEXT: s_mov_b32 s20, s2
3079 ; GFX10-W32-NEXT: s_mov_b32 s27, s9
3080 ; GFX10-W32-NEXT: s_mov_b32 s26, s8
3081 ; GFX10-W32-NEXT: s_mov_b32 s25, s7
3082 ; GFX10-W32-NEXT: s_mov_b32 s24, s6
3083 ; GFX10-W32-NEXT: s_mov_b32 s18, s16
3084 ; GFX10-W32-NEXT: s_mov_b32 s17, s15
3085 ; GFX10-W32-NEXT: s_mov_b32 s16, s14
3086 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3087 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3088 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3089 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3090 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
3091 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s29
3092 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3093 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
3094 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3095 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
3096 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
3097 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
3098 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, s0
3099 ; GFX10-W32-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
3100 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
3101 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3102 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3103 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3104 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
3105 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3106 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3107 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
3108 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3109 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
3110 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3111 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
3112 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s28
3113 ; GFX10-W32-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
3114 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3115 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3116 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
3117 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3118 ; GFX10-W32-NEXT: ; return to shader part epilog
3119 main_body:
3120 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3121 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3122 %temp = fadd float %reload, %reload
3123 %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3124 %temp3 = fadd float %temp2, %temp2
3125 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0)
3126 %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
3127 %temp5 = fadd float %temp3, %temp4
3128 %res.int = ptrtoint ptr addrspace(8) %res to i128
3129 %res.vec = bitcast i128 %res.int to <4 x i32>
3130 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3131 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3132 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3133 ret float %out
3134 }
3136 define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3137 ; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3138 ; GFX9-W64: ; %bb.0: ; %main_body
3139 ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
3140 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3141 ; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
3142 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3143 ; GFX9-W64-NEXT: s_mov_b32 s19, s5
3144 ; GFX9-W64-NEXT: s_mov_b32 s18, s4
3145 ; GFX9-W64-NEXT: s_mov_b32 s17, s3
3146 ; GFX9-W64-NEXT: s_mov_b32 s16, s2
3147 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3148 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3149 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3150 ; GFX9-W64-NEXT: s_mov_b32 s11, s9
3151 ; GFX9-W64-NEXT: s_mov_b32 s10, s8
3152 ; GFX9-W64-NEXT: s_mov_b32 s9, s7
3153 ; GFX9-W64-NEXT: s_mov_b32 s8, s6
3154 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3155 ; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
3156 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3157 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
3158 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
3159 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
3160 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3161 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3162 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3163 ; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
3164 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3165 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
3166 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3167 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
3168 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3169 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3170 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
3171 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3172 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3173 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
3174 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
3175 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
3176 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3177 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3178 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3179 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3180 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3181 ; GFX9-W64-NEXT: ; return to shader part epilog
3183 ; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3184 ; GFX10-W32: ; %bb.0: ; %main_body
3185 ; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
3186 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3187 ; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
3188 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3189 ; GFX10-W32-NEXT: s_mov_b32 s19, s5
3190 ; GFX10-W32-NEXT: s_mov_b32 s18, s4
3191 ; GFX10-W32-NEXT: s_mov_b32 s17, s3
3192 ; GFX10-W32-NEXT: s_mov_b32 s16, s2
3193 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3194 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3195 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3196 ; GFX10-W32-NEXT: s_mov_b32 s11, s9
3197 ; GFX10-W32-NEXT: s_mov_b32 s10, s8
3198 ; GFX10-W32-NEXT: s_mov_b32 s9, s7
3199 ; GFX10-W32-NEXT: s_mov_b32 s8, s6
3200 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
3201 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
3202 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3203 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
3204 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3205 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3206 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3207 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
3208 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3209 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3210 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3211 ; GFX10-W32-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
3212 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3213 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3214 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3215 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
3216 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3217 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3218 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
3219 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3220 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
3221 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3222 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
3223 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3224 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3225 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3226 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3227 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3228 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3229 ; GFX10-W32-NEXT: ; return to shader part epilog
3230 main_body:
3231 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3232 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3233 %temp = fadd float %reload, %reload
3234 %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3235 %temp3 = fadd float %temp2, %temp2
3236 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3237 %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3238 %temp5 = fadd float %temp3, %temp4
3239 %res.int = ptrtoint ptr addrspace(8) %res to i128
3240 %res.vec = bitcast i128 %res.int to <4 x i32>
3241 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3242 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3243 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3244 ret float %out
3245 }
3247 ;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
3248 define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3249 ; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3250 ; GFX9-W64: ; %bb.0: ; %main_body
3251 ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
3252 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3253 ; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
3254 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3255 ; GFX9-W64-NEXT: s_mov_b32 s19, s5
3256 ; GFX9-W64-NEXT: s_mov_b32 s18, s4
3257 ; GFX9-W64-NEXT: s_mov_b32 s17, s3
3258 ; GFX9-W64-NEXT: s_mov_b32 s16, s2
3259 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3260 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3261 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3262 ; GFX9-W64-NEXT: s_mov_b32 s11, s9
3263 ; GFX9-W64-NEXT: s_mov_b32 s10, s8
3264 ; GFX9-W64-NEXT: s_mov_b32 s9, s7
3265 ; GFX9-W64-NEXT: s_mov_b32 s8, s6
3266 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3267 ; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
3268 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3269 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3270 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
3271 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
3272 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3273 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3274 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3275 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3276 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3277 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3278 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3279 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3280 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
3281 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3282 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3283 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3
3284 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
3285 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3286 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3287 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3288 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3289 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3290 ; GFX9-W64-NEXT: ; return to shader part epilog
3292 ; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3293 ; GFX10-W32: ; %bb.0: ; %main_body
3294 ; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
3295 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3296 ; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
3297 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3298 ; GFX10-W32-NEXT: s_mov_b32 s19, s5
3299 ; GFX10-W32-NEXT: s_mov_b32 s18, s4
3300 ; GFX10-W32-NEXT: s_mov_b32 s17, s3
3301 ; GFX10-W32-NEXT: s_mov_b32 s16, s2
3302 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3303 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3304 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3305 ; GFX10-W32-NEXT: s_mov_b32 s11, s9
3306 ; GFX10-W32-NEXT: s_mov_b32 s10, s8
3307 ; GFX10-W32-NEXT: s_mov_b32 s9, s7
3308 ; GFX10-W32-NEXT: s_mov_b32 s8, s6
3309 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
3310 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
3311 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3312 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, s1
3313 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3314 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3315 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3316 ; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
3317 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3318 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3319 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3320 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3321 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3322 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3323 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3324 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
3325 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3326 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3327 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3328 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3
3329 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3330 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3331 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3332 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3333 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3334 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3335 ; GFX10-W32-NEXT: ; return to shader part epilog
3336 main_body:
3337 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3338 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3339 %temp = fadd float %reload, %reload
3340 %res.int = ptrtoint ptr addrspace(8) %res to i128
3341 %res.vec = bitcast i128 %res.int to <4 x i32>
3342 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3343 %temp2 = fadd float %tex, %tex
3344 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3345 %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3346 %temp4 = fadd float %temp2, %temp3
3347 %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3348 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3349 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3350 ret float %out
3351 }
3353 ; Check that the correct VCC register is selected. The WQM pass incorrectly used VCC for
3354 ; vector comparisons in Wave32 mode.
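; In wave32 the condition mask is 32 bits wide, so the compare must write vcc_lo (or a
; 32-bit SGPR) rather than the 64-bit vcc pair; the GFX10-W32 checks verify that
; v_cmp_le_f32 targets vcc_lo.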
3355 define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
3356 ; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
3357 ; GFX9-W64: ; %bb.0: ; %main_body
3358 ; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac
3359 ; GFX9-W64-NEXT: s_mov_b32 s2, 32
3360 ; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000
3361 ; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
3362 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3363 ; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
3364 ; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
3365 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB55_1
3366 ; GFX9-W64-NEXT: s_endpgm
3367 ; GFX9-W64-NEXT: .LBB55_1:
3368 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3369 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
3370 ; GFX9-W64-NEXT: s_endpgm
3372 ; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
3373 ; GFX10-W32: ; %bb.0: ; %main_body
3374 ; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac
3375 ; GFX10-W32-NEXT: s_mov_b32 s2, 32
3376 ; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000
3377 ; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
3378 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3379 ; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
3380 ; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
3381 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB55_1
3382 ; GFX10-W32-NEXT: s_endpgm
3383 ; GFX10-W32-NEXT: .LBB55_1:
3384 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3385 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
3386 ; GFX10-W32-NEXT: s_endpgm
3387 main_body:
3388 %1 = ptrtoint ptr addrspace(6) %0 to i32
3389 %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
3390 %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
3391 %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
3392 call void @llvm.amdgcn.kill(i1 %4) #1
3393 ret void
3394 }
3396 ; Test the interaction between wqm and llvm.amdgcn.init.exec.
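; llvm.amdgcn.init.exec overwrites exec with an immediate at the start of the shader,
; so the s_wqm for the wqm'd value has to be emitted after that initial
; s_mov exec, -1 rather than being hoisted above it.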
3397 define amdgpu_gs void @wqm_init_exec() {
3398 ; GFX9-W64-LABEL: wqm_init_exec:
3399 ; GFX9-W64: ; %bb.0: ; %bb
3400 ; GFX9-W64-NEXT: s_mov_b64 exec, -1
3401 ; GFX9-W64-NEXT: s_mov_b32 s0, 0
3402 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
3403 ; GFX9-W64-NEXT: s_mov_b32 s1, s0
3404 ; GFX9-W64-NEXT: s_mov_b32 s2, s0
3405 ; GFX9-W64-NEXT: s_mov_b32 s3, s0
3406 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0
3407 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
3408 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0
3409 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3410 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3411 ; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec
3412 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3413 ; GFX9-W64-NEXT: ds_write_b32 v0, v1
3414 ; GFX9-W64-NEXT: s_endpgm
3416 ; GFX10-W32-LABEL: wqm_init_exec:
3417 ; GFX10-W32: ; %bb.0: ; %bb
3418 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
3419 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
3420 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
3421 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
3422 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3423 ; GFX10-W32-NEXT: s_mov_b32 s2, s0
3424 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
3425 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0
3426 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
3427 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0
3428 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0
3429 ; GFX10-W32-NEXT: s_mov_b32 s1, s0
3430 ; GFX10-W32-NEXT: s_mov_b32 s3, s0
3431 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3432 ; GFX10-W32-NEXT: ds_write_b32 v0, v4
3433 ; GFX10-W32-NEXT: s_endpgm
3434 bb:
3435 call void @llvm.amdgcn.init.exec(i64 -1)
3436 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
3437 %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
3438 store i32 %i, ptr addrspace(3) null, align 4
3439 ret void
3440 }
3442 ; Test a case that failed machine verification.
3443 define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) {
3444 ; GFX9-W64-LABEL: wqm_init_exec_switch:
3445 ; GFX9-W64: ; %bb.0:
3446 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3447 ; GFX9-W64-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
3448 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
3449 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3450 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
3451 ; GFX9-W64-NEXT: s_endpgm
3453 ; GFX10-W32-LABEL: wqm_init_exec_switch:
3454 ; GFX10-W32: ; %bb.0:
3455 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3456 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3457 ; GFX10-W32-NEXT: v_cmpx_lt_i32_e32 0, v0
3458 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
3459 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
3460 ; GFX10-W32-NEXT: s_endpgm
3461 call void @llvm.amdgcn.init.exec(i64 0)
3462 switch i32 %arg, label %bb1 [
3474 define amdgpu_gs void @wqm_init_exec_wwm() {
3475 ; GFX9-W64-LABEL: wqm_init_exec_wwm:
3476 ; GFX9-W64: ; %bb.0:
3477 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3478 ; GFX9-W64-NEXT: s_mov_b32 s1, 0
3479 ; GFX9-W64-NEXT: s_mov_b32 s0, s1
3480 ; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0
3481 ; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0
3482 ; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0
3483 ; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0
3484 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
3485 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
3486 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
3487 ; GFX9-W64-NEXT: exp mrt0 off, off, off, off
3488 ; GFX9-W64-NEXT: s_endpgm
3490 ; GFX10-W32-LABEL: wqm_init_exec_wwm:
3491 ; GFX10-W32: ; %bb.0:
3492 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3493 ; GFX10-W32-NEXT: s_mov_b32 s1, 0
3494 ; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0
3495 ; GFX10-W32-NEXT: s_mov_b32 s0, s1
3496 ; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0
3497 ; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0
3498 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
3499 ; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0
3500 ; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0
3501 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
3502 ; GFX10-W32-NEXT: exp mrt0 off, off, off, off
3503 ; GFX10-W32-NEXT: s_endpgm
3504 call void @llvm.amdgcn.init.exec(i64 0)
3505 %i = call i64 @llvm.amdgcn.ballot.i64(i1 true)
3506 %i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
3507 %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0
3508 %i3 = bitcast <2 x i32> %i2 to i64
3509 %i4 = icmp ne i64 %i, 0
3510 %i5 = icmp ne i64 %i3, 0
3511 %i6 = xor i1 %i4, %i5
3512 %i7 = uitofp i1 %i6 to float
3513 call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
3514 ret void
3515 }
3517 ; Check that exact regions with execz-affected instructions are as short as possible
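; In the checks below the exact region is limited to the buffer_store inside the if
; block (an s_and_saveexec with the saved live mask followed by an exec restore)
; instead of switching the whole block out of WQM.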
3518 define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3519 ; GFX9-W64-LABEL: short_exact_regions:
3520 ; GFX9-W64: ; %bb.0: ; %main_body
3521 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3522 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3523 ; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
3524 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3525 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3526 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
3527 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
3528 ; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
3529 ; GFX9-W64-NEXT: ; %bb.1: ; %if
3530 ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3531 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3532 ; GFX9-W64-NEXT: v_readfirstlane_b32 s16, v0
3533 ; GFX9-W64-NEXT: s_buffer_load_dword s16, s[8:11], s16 offset:0x0
3534 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3535 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s16
3536 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
3537 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3538 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
3539 ; GFX9-W64-NEXT: .LBB59_2: ; %endif
3540 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
3541 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3542 ; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
3543 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3544 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
3545 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3546 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3547 ; GFX9-W64-NEXT: ; return to shader part epilog
3549 ; GFX10-W32-LABEL: short_exact_regions:
3550 ; GFX10-W32: ; %bb.0: ; %main_body
3551 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3552 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3553 ; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
3554 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3555 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
3556 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3557 ; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
3558 ; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
3559 ; GFX10-W32-NEXT: ; %bb.1: ; %if
3560 ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3561 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3562 ; GFX10-W32-NEXT: v_readfirstlane_b32 s14, v0
3563 ; GFX10-W32-NEXT: s_buffer_load_dword s14, s[8:11], s14 offset:0x0
3564 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3565 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s14
3566 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
3567 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3568 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
3569 ; GFX10-W32-NEXT: .LBB59_2: ; %endif
3570 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
3571 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3572 ; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3573 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3574 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
3575 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3576 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3577 ; GFX10-W32-NEXT: ; return to shader part epilog
3578 main_body:
3579 %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3580 %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
3581 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3582 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3583 %cc = icmp uge i32 %hi, 16
3584 br i1 %cc, label %endif, label %if
3586 if:
3587 %idx1 = extractelement <4 x i32> %idx0, i64 0
3588 %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3589 %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0)
3591 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> undef, i32 %idx3, i32 0, i32 0, i32 0)
3592 br label %endif
3594 endif:
3595 %d = extractelement <4 x float> %tex1, i64 0
3596 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3597 %r0 = extractelement <4 x float> %tex1, i64 1
3598 %r1 = extractelement <4 x float> %tex2, i64 2
3599 %r2 = fadd float %r0, %r1
3600 %out = call float @llvm.amdgcn.wqm.f32(float %r2)
3601 ret float %out
3602 }
3605 ; Check that exact-region shortening doesn't prevent an early WQM exit
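; In the checks below exec is ANDed back to the live mask immediately after the first
; image_sample, so the global_load and the second sample already run in Exact mode
; rather than WQM being kept until the end of the block.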
3606 define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3607 ; GFX9-W64-LABEL: short_exact_regions_2:
3608 ; GFX9-W64: ; %bb.0: ; %main_body
3609 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3610 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3611 ; GFX9-W64-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
3612 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3613 ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3614 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3615 ; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
3616 ; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
3617 ; GFX9-W64-NEXT: ; kill: killed $vgpr3
3618 ; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
3619 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3620 ; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
3621 ; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3622 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3623 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
3624 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3625 ; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
3626 ; GFX9-W64-NEXT: ; return to shader part epilog
3628 ; GFX10-W32-LABEL: short_exact_regions_2:
3629 ; GFX10-W32: ; %bb.0: ; %main_body
3630 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3631 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3632 ; GFX10-W32-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
3633 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3634 ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3635 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3636 ; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3637 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3638 ; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
3639 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3640 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
3641 ; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3642 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3643 ; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
3644 ; GFX10-W32-NEXT: ; return to shader part epilog
3645 main_body:
3646 %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3647 %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
3648 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3649 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3650 %idx1 = extractelement <4 x i32> %idx0, i64 0
3651 %d = extractelement <4 x float> %tex1, i64 0
3653 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3655 %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3656 %idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0)
3658 %r0 = extractelement <4 x float> %tex1, i64 1
3659 %r1 = extractelement <4 x float> %tex2, i64 2
3660 %r2 = fadd float %r0, %r1
3661 %out = fadd float %r2, %idx3
3662 ret float %out
3663 }
3666 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3667 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
3669 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3670 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3671 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3672 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3673 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3674 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
3676 declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3677 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3678 declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
3679 declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
3680 declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
3681 declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
3683 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3684 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3685 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3686 declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3687 declare void @llvm.amdgcn.kill(i1) #1
3688 declare float @llvm.amdgcn.wqm.f32(float) #3
3689 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3690 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3691 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3692 declare float @llvm.amdgcn.wwm.f32(float) #3
3693 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3694 declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3695 declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3696 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3697 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3698 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3699 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3700 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3701 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3702 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
3703 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3704 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
3705 declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare void @llvm.amdgcn.init.exec(i64) #1
declare i64 @llvm.amdgcn.ballot.i64(i1) #3
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #7
3707 attributes #1 = { nounwind }
3708 attributes #2 = { nounwind readonly }
3709 attributes #3 = { nounwind readnone }
3710 attributes #4 = { nounwind readnone convergent }
3711 attributes #5 = { "amdgpu-ps-wqm-outputs" }
3712 attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3713 attributes #7 = { nounwind readnone willreturn }