1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
5 ; Check that WQM isn't triggered by image load/store intrinsics.
; test1 reads a <4 x float> texel with image.load.1d and writes it back with
; image.store.1d at the same coordinate %c, using the same resource %rsrc.
; The expected output for both targets contains no s_wqm and no other write
; to exec.
6 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
7 ; GFX9-W64-LABEL: test1:
8 ; GFX9-W64: ; %bb.0: ; %main_body
9 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
10 ; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
11 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
12 ; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
13 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
14 ; GFX9-W64-NEXT: ; return to shader part epilog
16 ; GFX10-W32-LABEL: test1:
17 ; GFX10-W32: ; %bb.0: ; %main_body
18 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
19 ; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
20 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
21 ; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
22 ; GFX10-W32-NEXT: ; return to shader part epilog
24 %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
25 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
29 ; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
; test2 derives both 2D sample coordinates from interp.p1/interp.p2 on the two
; components of %pos. In the expected output s_wqm comes before the v_interp
; instructions, and the saved exec mask is ANDed back in directly before
; image_sample.
30 define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
31 ; GFX9-W64-LABEL: test2:
32 ; GFX9-W64: ; %bb.0: ; %main_body
33 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
34 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
35 ; GFX9-W64-NEXT: s_mov_b32 m0, s3
36 ; GFX9-W64-NEXT: s_nop 0
37 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
38 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
39 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
40 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
41 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
42 ; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
43 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-W64-NEXT: ; return to shader part epilog
46 ; GFX10-W32-LABEL: test2:
47 ; GFX10-W32: ; %bb.0: ; %main_body
48 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
49 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
50 ; GFX10-W32-NEXT: s_mov_b32 m0, s3
51 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
52 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
53 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
54 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
55 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
56 ; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
57 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-W32-NEXT: ; return to shader part epilog
60 %inst23 = extractelement <2 x float> %pos, i32 0
61 %inst24 = extractelement <2 x float> %pos, i32 1
62 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
63 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
64 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
65 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
66 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
70 ; ... but disabled for stores (and, in this simple case, not re-enabled) ...
; test3 samples a 1D image at %c and stores the sampled vector with
; struct.buffer.store, using the first sampled component (bitcast to i32) as
; the buffer index. In the expected output the saved exec mask is ANDed back
; in right after s_wqm, before image_sample, and exec is not written again
; before the buffer store.
71 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
72 ; GFX9-W64-LABEL: test3:
73 ; GFX9-W64: ; %bb.0: ; %main_body
74 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
75 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
76 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
77 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
78 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
79 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
80 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
81 ; GFX9-W64-NEXT: ; return to shader part epilog
83 ; GFX10-W32-LABEL: test3:
84 ; GFX10-W32: ; %bb.0: ; %main_body
85 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
86 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
87 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
88 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
89 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
90 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
91 ; GFX10-W32-NEXT: ; return to shader part epilog
93 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
94 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
95 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
97 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
; Same IR as test3, except that the store uses the struct.ptr.buffer.store
; intrinsic with a ptr addrspace(8) resource operand instead of <4 x i32>.
; The expected instruction sequences are the same as for test3.
102 define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
103 ; GFX9-W64-LABEL: test3_ptr_buf:
104 ; GFX9-W64: ; %bb.0: ; %main_body
105 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
106 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
107 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
108 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
109 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
110 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
111 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
112 ; GFX9-W64-NEXT: ; return to shader part epilog
114 ; GFX10-W32-LABEL: test3_ptr_buf:
115 ; GFX10-W32: ; %bb.0: ; %main_body
116 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
117 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
118 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
119 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
120 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
122 ; GFX10-W32-NEXT: ; return to shader part epilog
124 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
125 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
126 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
128 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0)
133 ; ... and disabled for export.
; test3x computes the 2D sample coordinates with interp.p1/interp.p2, samples,
; and passes the four sampled components to llvm.amdgcn.exp.f32. In the
; expected output the v_interp instructions sit between s_wqm and the s_and
; that restores the saved exec mask; image_sample and exp follow the restore.
134 define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
135 ; GFX9-W64-LABEL: test3x:
136 ; GFX9-W64: ; %bb.0: ; %main_body
137 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
138 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
139 ; GFX9-W64-NEXT: s_mov_b32 m0, s3
140 ; GFX9-W64-NEXT: s_nop 0
141 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
142 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
143 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
144 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
145 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
146 ; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
147 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
148 ; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm
149 ; GFX9-W64-NEXT: s_endpgm
151 ; GFX10-W32-LABEL: test3x:
152 ; GFX10-W32: ; %bb.0: ; %main_body
153 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
154 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
155 ; GFX10-W32-NEXT: s_mov_b32 m0, s3
156 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
157 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
158 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
159 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
160 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
161 ; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
162 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
163 ; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm
164 ; GFX10-W32-NEXT: s_endpgm
166 %inst23 = extractelement <2 x float> %pos, i32 0
167 %inst24 = extractelement <2 x float> %pos, i32 1
168 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
169 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
170 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
171 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
172 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
173 %tex.0 = extractelement <4 x float> %tex, i32 0
174 %tex.1 = extractelement <4 x float> %tex, i32 1
175 %tex.2 = extractelement <4 x float> %tex, i32 2
176 %tex.3 = extractelement <4 x float> %tex, i32 3
177 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
181 ; Check that WQM is re-enabled when required.
; test4: %c.1 = %c * %d is used both as the index of a struct.buffer.store and
; (bitcast to float) as the coordinate of a first sample, whose first
; component is the coordinate of a second sample. In the expected output the
; v_mul and the first image_sample come after s_wqm; the saved exec mask is
; ANDed back in before the second image_sample and the buffer store.
; %ptr and %data are not referenced by the IR lines shown here.
182 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
183 ; GFX9-W64-LABEL: test4:
184 ; GFX9-W64: ; %bb.0: ; %main_body
185 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
186 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
187 ; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
188 ; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
189 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
190 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
191 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
192 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
193 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
194 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
195 ; GFX9-W64-NEXT: ; return to shader part epilog
197 ; GFX10-W32-LABEL: test4:
198 ; GFX10-W32: ; %bb.0: ; %main_body
199 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
200 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
201 ; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
202 ; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
203 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
204 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
205 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
206 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
207 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
208 ; GFX10-W32-NEXT: ; return to shader part epilog
210 %c.1 = mul i32 %c, %d
212 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
213 %c.1.bc = bitcast i32 %c.1 to float
214 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
215 %tex0 = extractelement <4 x float> %tex, i32 0
216 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
217 ret <4 x float> %dtex
; Same IR as test4, except that the store uses the struct.ptr.buffer.store
; intrinsic with a ptr addrspace(8) resource operand instead of <4 x i32>.
; The expected instruction sequences are the same as for test4.
220 define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
221 ; GFX9-W64-LABEL: test4_ptr_buf:
222 ; GFX9-W64: ; %bb.0: ; %main_body
223 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
224 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
225 ; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
226 ; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
227 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
228 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
229 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
230 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
231 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
232 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
233 ; GFX9-W64-NEXT: ; return to shader part epilog
235 ; GFX10-W32-LABEL: test4_ptr_buf:
236 ; GFX10-W32: ; %bb.0: ; %main_body
237 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
238 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
239 ; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
240 ; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
241 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
242 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
243 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
244 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
245 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
246 ; GFX10-W32-NEXT: ; return to shader part epilog
248 %c.1 = mul i32 %c, %d
250 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0)
251 %c.1.bc = bitcast i32 %c.1 to float
252 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
253 %tex0 = extractelement <4 x float> %tex, i32 0
254 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
255 ret <4 x float> %dtex
258 ; Check that WQM is triggered by the wqm intrinsic.
259 ; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
260 ; does not happen - the v_add should write the return reg directly.
; test5 adds two floats loaded with struct.buffer.load (indices %idx0 and
; %idx1) and passes the sum through llvm.amdgcn.wqm.f32. In the expected
; output both loads and the v_add_f32 come after s_wqm, the v_add_f32 writes
; v0 directly, and no v_mov appears before the s_and that restores exec.
261 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
262 ; GFX9-W64-LABEL: test5:
263 ; GFX9-W64: ; %bb.0: ; %main_body
264 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
265 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
266 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
267 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
268 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
269 ; GFX9-W64-NEXT: s_nop 0
270 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
271 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
272 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
273 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
274 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
275 ; GFX9-W64-NEXT: ; return to shader part epilog
277 ; GFX10-W32-LABEL: test5:
278 ; GFX10-W32: ; %bb.0: ; %main_body
279 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
280 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
281 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
282 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
283 ; GFX10-W32-NEXT: s_clause 0x1
284 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
285 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
286 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
287 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
288 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
289 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
290 ; GFX10-W32-NEXT: ; return to shader part epilog
292 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
293 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
294 %out = fadd float %src0, %src1
295 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
; Same IR as test5, except that the loads use the struct.ptr.buffer.load
; intrinsic with a ptr addrspace(8) resource operand instead of <4 x i32>.
; The expected instruction sequences are the same as for test5.
299 define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
300 ; GFX9-W64-LABEL: test5_ptr_buf:
301 ; GFX9-W64: ; %bb.0: ; %main_body
302 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
303 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
304 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
305 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
306 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
307 ; GFX9-W64-NEXT: s_nop 0
308 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
309 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
310 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
311 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
312 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
313 ; GFX9-W64-NEXT: ; return to shader part epilog
315 ; GFX10-W32-LABEL: test5_ptr_buf:
316 ; GFX10-W32: ; %bb.0: ; %main_body
317 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
318 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
319 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
320 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
321 ; GFX10-W32-NEXT: s_clause 0x1
322 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
323 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
324 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
325 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
326 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
327 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
328 ; GFX10-W32-NEXT: ; return to shader part epilog
330 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
331 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
332 %out = fadd float %src0, %src1
333 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
337 ; Check that the wqm intrinsic works correctly for integers.
; test6 computes the same float sum as test5, but bitcasts it to i32, passes
; it through llvm.amdgcn.wqm.i32, and bitcasts the result back to float. The
; expected instruction sequences are the same as for test5.
338 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
339 ; GFX9-W64-LABEL: test6:
340 ; GFX9-W64: ; %bb.0: ; %main_body
341 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
342 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
343 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
344 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
345 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
346 ; GFX9-W64-NEXT: s_nop 0
347 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
348 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
349 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
350 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
351 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
352 ; GFX9-W64-NEXT: ; return to shader part epilog
354 ; GFX10-W32-LABEL: test6:
355 ; GFX10-W32: ; %bb.0: ; %main_body
356 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
357 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
358 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
359 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
360 ; GFX10-W32-NEXT: s_clause 0x1
361 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
362 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
363 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
364 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
365 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
366 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
367 ; GFX10-W32-NEXT: ; return to shader part epilog
369 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
370 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
371 %out = fadd float %src0, %src1
372 %out.0 = bitcast float %out to i32
373 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
374 %out.2 = bitcast i32 %out.1 to float
; Same IR as test6, except that the loads use the struct.ptr.buffer.load
; intrinsic with a ptr addrspace(8) resource operand instead of <4 x i32>.
; The expected instruction sequences are the same as for test6.
378 define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
379 ; GFX9-W64-LABEL: test6_ptr_buf:
380 ; GFX9-W64: ; %bb.0: ; %main_body
381 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
382 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
383 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
384 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
385 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
386 ; GFX9-W64-NEXT: s_nop 0
387 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
388 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
389 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
390 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
391 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
392 ; GFX9-W64-NEXT: ; return to shader part epilog
394 ; GFX10-W32-LABEL: test6_ptr_buf:
395 ; GFX10-W32: ; %bb.0: ; %main_body
396 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
397 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
398 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
399 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
400 ; GFX10-W32-NEXT: s_clause 0x1
401 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
402 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
403 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
404 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
405 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
406 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
407 ; GFX10-W32-NEXT: ; return to shader part epilog
409 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
410 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
411 %out = fadd float %src0, %src1
412 %out.0 = bitcast float %out to i32
413 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
414 %out.2 = bitcast i32 %out.1 to float
418 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
420 ; Check that WWM is triggered by the wwm intrinsic.
; test_wwm1 adds two floats loaded with struct.ptr.buffer.load and passes the
; sum through llvm.amdgcn.wwm.f32. In the expected output the loads and the
; v_add_f32 sit between an s_or_saveexec with -1 and the s_mov that restores
; exec; the result is computed in v1 and copied to v0 after the restore.
421 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
422 ; GFX9-W64-LABEL: test_wwm1:
423 ; GFX9-W64: ; %bb.0: ; %main_body
424 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
425 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
426 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
427 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
428 ; GFX9-W64-NEXT: s_nop 0
429 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
430 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
431 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
432 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
433 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
434 ; GFX9-W64-NEXT: ; return to shader part epilog
436 ; GFX10-W32-LABEL: test_wwm1:
437 ; GFX10-W32: ; %bb.0: ; %main_body
438 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
439 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
440 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
441 ; GFX10-W32-NEXT: s_clause 0x1
442 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
443 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
444 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
445 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
446 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
447 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
448 ; GFX10-W32-NEXT: ; return to shader part epilog
450 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
451 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
452 %out = fadd float %src0, %src1
453 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
457 ; Same as above, but with an integer type.
; test_wwm2 bitcasts the two loaded floats to i32, adds them as integers, and
; passes the sum through llvm.amdgcn.wwm.i32 before bitcasting back to float.
; The expected output has the same shape as test_wwm1, with an integer add
; (v_add_u32 on gfx900, v_add_nc_u32 on gfx1030) in place of v_add_f32.
458 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
459 ; GFX9-W64-LABEL: test_wwm2:
460 ; GFX9-W64: ; %bb.0: ; %main_body
461 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
462 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
463 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
464 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
465 ; GFX9-W64-NEXT: s_nop 0
466 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
467 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
469 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
470 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
471 ; GFX9-W64-NEXT: ; return to shader part epilog
473 ; GFX10-W32-LABEL: test_wwm2:
474 ; GFX10-W32: ; %bb.0: ; %main_body
475 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
476 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
477 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
478 ; GFX10-W32-NEXT: s_clause 0x1
479 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
480 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
481 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
482 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
483 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
484 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
485 ; GFX10-W32-NEXT: ; return to shader part epilog
487 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
488 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
489 %src0.0 = bitcast float %src0 to i32
490 %src1.0 = bitcast float %src1 to i32
491 %out = add i32 %src0.0, %src1.0
492 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
493 %out.1 = bitcast i32 %out.0 to float
497 ; Check that we don't leave WWM on for computations that don't require WWM,
498 ; since that will lead to clobbering things that aren't supposed to be clobbered
499 ; in cases like this.
500 ; We enforce this by checking that v_add gets emitted in the same block as
; test_wwm3 branches to %endif when the mbcnt result is >= 16. Otherwise, in
; %if, it loads %src, passes %src + %src through llvm.amdgcn.wwm.f32, and then
; adds %src to the wwm result. In the expected %if block the load and the
; first v_add_f32 sit between the s_or_saveexec with -1 and the exec restore.
; The v_mov and the second v_add_f32, which consume the wwm result, come
; after the restore, still before the %endif label.
502 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
503 ; GFX9-W64-LABEL: test_wwm3:
504 ; GFX9-W64: ; %bb.0: ; %main_body
505 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
506 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
507 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
508 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
509 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
510 ; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2
511 ; GFX9-W64-NEXT: ; %bb.1: ; %if
512 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
513 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
514 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
515 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
516 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
517 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
518 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
519 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
520 ; GFX9-W64-NEXT: .LBB13_2: ; %endif
521 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
522 ; GFX9-W64-NEXT: ; return to shader part epilog
524 ; GFX10-W32-LABEL: test_wwm3:
525 ; GFX10-W32: ; %bb.0: ; %main_body
526 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
527 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
528 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
529 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
530 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
531 ; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2
532 ; GFX10-W32-NEXT: ; %bb.1: ; %if
533 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
534 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
535 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
536 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
537 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
538 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
539 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
540 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
541 ; GFX10-W32-NEXT: .LBB13_2: ; %endif
542 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
543 ; GFX10-W32-NEXT: ; return to shader part epilog
545 ; use mbcnt to make sure the branch is divergent
546 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
547 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
548 %cc = icmp uge i32 %hi, 16
549 br i1 %cc, label %endif, label %if
552 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
553 %out = fadd float %src, %src
554 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
555 %out.1 = fadd float %src, %out.0
559 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
563 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
564 ; write could clobber disabled channels in the non-WWM one.
565 ; We enforce this by checking that v_mov gets emitted in the same block as
; test_wwm4 has the same control flow as test_wwm3, but the wwm result feeds
; the phi in %endif directly; the other phi input is the constant 0.0 from
; %main_body. In the expected %if block the v_add_f32 writes v1 before the
; exec restore, and a separate v_mov copies v1 into v0 after the restore,
; still before the %endif label.
567 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
568 ; GFX9-W64-LABEL: test_wwm4:
569 ; GFX9-W64: ; %bb.0: ; %main_body
570 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
571 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
572 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
573 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
574 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
575 ; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2
576 ; GFX9-W64-NEXT: ; %bb.1: ; %if
577 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
578 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
579 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
580 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
581 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
582 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
583 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
584 ; GFX9-W64-NEXT: .LBB14_2: ; %endif
585 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
586 ; GFX9-W64-NEXT: ; return to shader part epilog
588 ; GFX10-W32-LABEL: test_wwm4:
589 ; GFX10-W32: ; %bb.0: ; %main_body
590 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
591 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
592 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
593 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
594 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
595 ; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2
596 ; GFX10-W32-NEXT: ; %bb.1: ; %if
597 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
598 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
599 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
600 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
601 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
602 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
603 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
604 ; GFX10-W32-NEXT: .LBB14_2: ; %endif
605 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
606 ; GFX10-W32-NEXT: ; return to shader part epilog
608 ; use mbcnt to make sure the branch is divergent
609 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
610 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
611 %cc = icmp uge i32 %hi, 16
612 br i1 %cc, label %endif, label %if
615 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
616 %out = fadd float %src, %src
617 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
621 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
625 ; Make sure the transition from Exact to WWM then WQM works properly.
; test_wwm5 chains three kinds of work: a buffer load/store pair on %idx0, a
; second load on %idx1 whose doubled value goes through llvm.amdgcn.wwm.f32,
; and a final fadd whose result goes through llvm.amdgcn.wqm.f32.
; In the expected output the first load and the store are outside any
; s_or_saveexec region, while the second load and the first v_add_f32 sit
; between an s_or_saveexec with -1 and its exec restore. s_wqm follows that
; restore, ahead of the final v_mov/v_add_f32, and exec is last ANDed with
; the mask saved near the top of the function.
; On gfx1030 there is an additional short s_or_saveexec region around the
; v_mov that materialises the second index.
626 define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
627 ; GFX9-W64-LABEL: test_wwm5:
628 ; GFX9-W64: ; %bb.0: ; %main_body
629 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
630 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
631 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
632 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
633 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
634 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
635 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
636 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
637 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
638 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
639 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
640 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
641 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
642 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
643 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
644 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
645 ; GFX9-W64-NEXT: ; return to shader part epilog
647 ; GFX10-W32-LABEL: test_wwm5:
648 ; GFX10-W32: ; %bb.0: ; %main_body
649 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
650 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
651 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
652 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
653 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
654 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
655 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
656 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
657 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
658 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
659 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
661 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
662 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
663 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
664 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
665 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
666 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
667 ; GFX10-W32-NEXT: ; return to shader part epilog
669 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
670 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
671 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
672 %temp = fadd float %src1, %src1
673 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
674 %out = fadd float %temp.0, %temp.0
675 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
679 ; Check that WWM is turned on correctly across basic block boundaries.
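680 ; if..then..endif version. NOTE: the SI-CHECK/VI-CHECK lines below use prefixes that the RUN lines at the top of this file do not enable, so they are not checked.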
681 ;SI-CHECK: buffer_load_dword
682 ;VI-CHECK: flat_load_dword
683 ;SI-CHECK: buffer_load_dword
684 ;VI-CHECK: flat_load_dword
685 define amdgpu_ps float @test_wwm6_then() {
686 ; GFX9-W64-LABEL: test_wwm6_then:
687 ; GFX9-W64: ; %bb.0: ; %main_body
688 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
689 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
690 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
691 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
692 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
693 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
694 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
695 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
696 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
697 ; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2
698 ; GFX9-W64-NEXT: ; %bb.1: ; %if
699 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
700 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
701 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
702 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
703 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
704 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
705 ; GFX9-W64-NEXT: .LBB16_2: ; %endif
706 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
707 ; GFX9-W64-NEXT: ; return to shader part epilog
709 ; GFX10-W32-LABEL: test_wwm6_then:
710 ; GFX10-W32: ; %bb.0: ; %main_body
711 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
712 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
713 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
714 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
715 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
716 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
717 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
718 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
719 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
720 ; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2
721 ; GFX10-W32-NEXT: ; %bb.1: ; %if
722 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
723 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
724 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
725 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
726 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
727 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
728 ; GFX10-W32-NEXT: .LBB16_2: ; %endif
729 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
730 ; GFX10-W32-NEXT: ; return to shader part epilog
732 %src0 = load volatile float, ptr addrspace(1) undef
733 ; use mbcnt to make sure the branch is divergent
734 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
735 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
736 %cc = icmp uge i32 %hi, 16
737 br i1 %cc, label %endif, label %if
740 %src1 = load volatile float, ptr addrspace(1) undef
741 %out = fadd float %src0, %src1
742 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
746 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
750 ; Check that WWM is turned on correctly across basic block boundaries.
752 ;SI-CHECK: buffer_load_dword
753 ;VI-CHECK: flat_load_dword
754 ;SI-CHECK: buffer_load_dword
755 ;VI-CHECK: flat_load_dword
756 define amdgpu_ps float @test_wwm6_loop() {
757 ; GFX9-W64-LABEL: test_wwm6_loop:
758 ; GFX9-W64: ; %bb.0: ; %main_body
759 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
760 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
761 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
762 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
763 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
764 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
765 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
766 ; GFX9-W64-NEXT: .LBB17_1: ; %loop
767 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
768 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
769 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
770 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
771 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
772 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
773 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
774 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
775 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
776 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
777 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
778 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
779 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
780 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1
781 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
782 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
783 ; GFX9-W64-NEXT: ; return to shader part epilog
785 ; GFX10-W32-LABEL: test_wwm6_loop:
786 ; GFX10-W32: ; %bb.0: ; %main_body
787 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
788 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
789 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
790 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
791 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
792 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
793 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
794 ; GFX10-W32-NEXT: .LBB17_1: ; %loop
795 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
796 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
797 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
798 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
799 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
800 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
801 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
802 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
803 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
804 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
805 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
806 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
807 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
808 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1
809 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
810 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
811 ; GFX10-W32-NEXT: ; return to shader part epilog
813 %src0 = load volatile float, ptr addrspace(1) undef
814 ; use mbcnt to make sure the branch is divergent
815 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
816 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
820 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
821 %src1 = load volatile float, ptr addrspace(1) undef
822 %out = fadd float %src0, %src1
823 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
824 %counter.1 = sub i32 %counter, 1
825 %cc = icmp ne i32 %counter.1, 0
826 br i1 %cc, label %loop, label %endloop
832 ; Check that @llvm.amdgcn.set.inactive disables WWM.
833 define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
834 ; GFX9-W64-LABEL: test_wwm_set_inactive1:
835 ; GFX9-W64: ; %bb.0: ; %main_body
836 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
837 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
838 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
839 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
840 ; GFX9-W64-NEXT: s_not_b64 exec, exec
841 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
842 ; GFX9-W64-NEXT: s_not_b64 exec, exec
843 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
844 ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
845 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
846 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
847 ; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
848 ; GFX9-W64-NEXT: s_endpgm
850 ; GFX10-W32-LABEL: test_wwm_set_inactive1:
851 ; GFX10-W32: ; %bb.0: ; %main_body
852 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
853 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
854 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
855 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
856 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
857 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
858 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
859 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
860 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
861 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
862 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
863 ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
864 ; GFX10-W32-NEXT: s_endpgm
866 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
867 %src.0 = bitcast float %src to i32
868 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
869 %out = add i32 %src.1, %src.1
870 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
871 %out.1 = bitcast i32 %out.0 to float
872 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
876 ; Check that Strict WQM is triggered by the llvm.amdgcn.strict.wqm intrinsic.
877 define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
878 ; GFX9-W64-LABEL: test_strict_wqm1:
879 ; GFX9-W64: ; %bb.0: ; %main_body
880 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
881 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
882 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
883 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
884 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
885 ; GFX9-W64-NEXT: s_nop 0
886 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
887 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
888 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
889 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
890 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
891 ; GFX9-W64-NEXT: ; return to shader part epilog
893 ; GFX10-W32-LABEL: test_strict_wqm1:
894 ; GFX10-W32: ; %bb.0: ; %main_body
895 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
896 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
897 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
898 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
899 ; GFX10-W32-NEXT: s_clause 0x1
900 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
901 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
902 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
903 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
904 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
905 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
906 ; GFX10-W32-NEXT: ; return to shader part epilog
908 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
909 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
910 %out = fadd float %src0, %src1
911 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
915 ; Same as above, but with an integer type.
916 define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
917 ; GFX9-W64-LABEL: test_strict_wqm2:
918 ; GFX9-W64: ; %bb.0: ; %main_body
919 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
920 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
921 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
922 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
923 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
924 ; GFX9-W64-NEXT: s_nop 0
925 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
926 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
927 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
928 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
929 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
930 ; GFX9-W64-NEXT: ; return to shader part epilog
932 ; GFX10-W32-LABEL: test_strict_wqm2:
933 ; GFX10-W32: ; %bb.0: ; %main_body
934 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
935 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
936 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
937 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
938 ; GFX10-W32-NEXT: s_clause 0x1
939 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
940 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
941 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
942 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
943 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
944 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
945 ; GFX10-W32-NEXT: ; return to shader part epilog
947 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
948 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
949 %src0.0 = bitcast float %src0 to i32
950 %src1.0 = bitcast float %src1 to i32
951 %out = add i32 %src0.0, %src1.0
952 %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
953 %out.1 = bitcast i32 %out.0 to float
957 ; Check that we don't leave Strict WQM on for computations that don't require it,
958 ; since that will lead to clobbering things that aren't supposed to be clobbered
959 ; in cases like this.
960 ; We enforce this by checking that v_add gets emitted in the same block as
962 define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
963 ; GFX9-W64-LABEL: test_strict_wqm3:
964 ; GFX9-W64: ; %bb.0: ; %main_body
965 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
966 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
967 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
968 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
969 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
970 ; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2
971 ; GFX9-W64-NEXT: ; %bb.1: ; %if
972 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
973 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
974 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
975 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
976 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
977 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
978 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
979 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
980 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
981 ; GFX9-W64-NEXT: .LBB21_2: ; %endif
982 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
983 ; GFX9-W64-NEXT: ; return to shader part epilog
985 ; GFX10-W32-LABEL: test_strict_wqm3:
986 ; GFX10-W32: ; %bb.0: ; %main_body
987 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
988 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
989 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
990 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
991 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
992 ; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2
993 ; GFX10-W32-NEXT: ; %bb.1: ; %if
994 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
995 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
996 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
997 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
998 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
999 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
1000 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
1001 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1002 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
1003 ; GFX10-W32-NEXT: .LBB21_2: ; %endif
1004 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
1005 ; GFX10-W32-NEXT: ; return to shader part epilog
1007 ; use mbcnt to make sure the branch is divergent
1008 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1009 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1010 %cc = icmp uge i32 %hi, 16
1011 br i1 %cc, label %endif, label %if
1014 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1015 %out = fadd float %src, %src
1016 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1017 %out.1 = fadd float %src, %out.0
1021 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1025 ; Check that Strict WQM writes aren't coalesced with non-strict writes, since
1026 ; the Strict WQM write could clobber disabled channels in the non-strict one.
1027 ; We enforce this by checking that v_mov gets emitted in the same block as
1029 define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
1030 ; GFX9-W64-LABEL: test_strict_wqm4:
1031 ; GFX9-W64: ; %bb.0: ; %main_body
1032 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1033 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1034 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
1035 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
1036 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
1037 ; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2
1038 ; GFX9-W64-NEXT: ; %bb.1: ; %if
1039 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
1040 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1041 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
1042 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1043 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1044 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
1045 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
1046 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1047 ; GFX9-W64-NEXT: .LBB22_2: ; %endif
1048 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
1049 ; GFX9-W64-NEXT: ; return to shader part epilog
1051 ; GFX10-W32-LABEL: test_strict_wqm4:
1052 ; GFX10-W32: ; %bb.0: ; %main_body
1053 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1054 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1055 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
1056 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
1057 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
1058 ; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2
1059 ; GFX10-W32-NEXT: ; %bb.1: ; %if
1060 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1061 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1062 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
1063 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1064 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1065 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
1066 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
1067 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1068 ; GFX10-W32-NEXT: .LBB22_2: ; %endif
1069 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
1070 ; GFX10-W32-NEXT: ; return to shader part epilog
1072 ; use mbcnt to make sure the branch is divergent
1073 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1074 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1075 %cc = icmp uge i32 %hi, 16
1076 br i1 %cc, label %endif, label %if
1079 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1080 %out = fadd float %src, %src
1081 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1085 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1089 ; Make sure the transition from Exact to Strict WQM then WQM works properly.
1090 define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
1091 ; GFX9-W64-LABEL: test_strict_wqm5:
1092 ; GFX9-W64: ; %bb.0: ; %main_body
1093 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1094 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
1095 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1096 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
1097 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1098 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1099 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1100 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
1101 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1102 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1103 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
1104 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
1105 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1106 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1107 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
1108 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
1109 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
1110 ; GFX9-W64-NEXT: ; return to shader part epilog
1112 ; GFX10-W32-LABEL: test_strict_wqm5:
1113 ; GFX10-W32: ; %bb.0: ; %main_body
1114 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
1115 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1116 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1117 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1118 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1119 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
1120 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1121 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1122 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1123 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1124 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1125 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1126 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1127 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
1128 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1129 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1130 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1131 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
1132 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
1133 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
1134 ; GFX10-W32-NEXT: ; return to shader part epilog
1136 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1137 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1138 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1139 %temp = fadd float %src1, %src1
1140 %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
1141 %out = fadd float %temp.0, %temp.0
1142 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
1146 ; Check that Strict WQM is turned on correctly across basic block boundaries.
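1147 ; if..then..endif version. NOTE: the SI-CHECK/VI-CHECK lines below use prefixes that the RUN lines at the top of this file do not enable, so they are not checked.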
1148 ;SI-CHECK: buffer_load_dword
1149 ;VI-CHECK: flat_load_dword
1150 ;SI-CHECK: buffer_load_dword
1151 ;VI-CHECK: flat_load_dword
1152 define amdgpu_ps float @test_strict_wqm6_then() {
1153 ; GFX9-W64-LABEL: test_strict_wqm6_then:
1154 ; GFX9-W64: ; %bb.0: ; %main_body
1155 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1156 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1157 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
1158 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1159 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
1160 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1161 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1162 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
1163 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
1164 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
1165 ; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2
1166 ; GFX9-W64-NEXT: ; %bb.1: ; %if
1167 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1168 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1169 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
1170 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1171 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
1172 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1173 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1174 ; GFX9-W64-NEXT: .LBB24_2: ; %endif
1175 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1176 ; GFX9-W64-NEXT: ; return to shader part epilog
1178 ; GFX10-W32-LABEL: test_strict_wqm6_then:
1179 ; GFX10-W32: ; %bb.0: ; %main_body
1180 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1181 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1182 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
1183 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1184 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1185 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1186 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1187 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
1188 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
1189 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
1190 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
1191 ; GFX10-W32-NEXT: ; %bb.1: ; %if
1192 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1193 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1194 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
1195 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1196 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
1197 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1198 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1199 ; GFX10-W32-NEXT: .LBB24_2: ; %endif
1200 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1201 ; GFX10-W32-NEXT: ; return to shader part epilog
1203 %src0 = load volatile float, ptr addrspace(1) undef
1204 ; use mbcnt to make sure the branch is divergent
1205 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1206 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1207 %cc = icmp uge i32 %hi, 16
1208 br i1 %cc, label %endif, label %if
1211 %src1 = load volatile float, ptr addrspace(1) undef
1212 %out = fadd float %src0, %src1
1213 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1217 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1221 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1223 ;SI-CHECK: buffer_load_dword
1224 ;VI-CHECK: flat_load_dword
1225 ;SI-CHECK: buffer_load_dword
1226 ;VI-CHECK: flat_load_dword
1227 define amdgpu_ps float @test_strict_wqm6_loop() {
1228 ; GFX9-W64-LABEL: test_strict_wqm6_loop:
1229 ; GFX9-W64: ; %bb.0: ; %main_body
1230 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1231 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1232 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
1233 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1234 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
1235 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1236 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
1237 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
1238 ; GFX9-W64-NEXT: .LBB25_1: ; %loop
1239 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
1240 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1241 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1242 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
1243 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1244 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1245 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
1246 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
1247 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1248 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1249 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
1250 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1251 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1252 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1253 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
1254 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1
1255 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
1256 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1257 ; GFX9-W64-NEXT: ; return to shader part epilog
1259 ; GFX10-W32-LABEL: test_strict_wqm6_loop:
1260 ; GFX10-W32: ; %bb.0: ; %main_body
1261 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1262 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1263 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
1264 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1265 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1266 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1267 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
1268 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
1269 ; GFX10-W32-NEXT: .LBB25_1: ; %loop
1270 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
1271 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1272 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1273 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
1274 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1275 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1276 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
1277 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1278 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1279 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
1280 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1281 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
1282 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1283 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
1284 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
1285 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1
1286 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
1287 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1288 ; GFX10-W32-NEXT: ; return to shader part epilog
1290 %src0 = load volatile float, ptr addrspace(1) undef
1291 ; use mbcnt to make sure the branch is divergent
1292 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1293 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1297 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
1298 %src1 = load volatile float, ptr addrspace(1) undef
1299 %out = fadd float %src0, %src1
1300 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1301 %counter.1 = sub i32 %counter, 1
1302 %cc = icmp ne i32 %counter.1, 0
1303 br i1 %cc, label %loop, label %endloop
1309 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1310 define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1311 ; GFX9-W64-LABEL: test_set_inactive2:
1312 ; GFX9-W64: ; %bb.0: ; %main_body
1313 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1314 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1315 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
1316 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0
1317 ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
1318 ; GFX9-W64-NEXT: s_nop 0
1319 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
1320 ; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
1321 ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
1322 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
1323 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1324 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1
1325 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1326 ; GFX9-W64-NEXT: s_endpgm
1328 ; GFX10-W32-LABEL: test_set_inactive2:
1329 ; GFX10-W32: ; %bb.0: ; %main_body
1330 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1331 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1332 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1
1333 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
1334 ; GFX10-W32-NEXT: s_clause 0x1
1335 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1336 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1337 ; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
1338 ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
1339 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
1340 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1341 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
1342 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1343 ; GFX10-W32-NEXT: s_endpgm
1345 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1346 %src1.0 = bitcast float %src1 to i32
1347 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
1348 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1349 %src0.0 = bitcast float %src0 to i32
1350 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
1351 %out = add i32 %src0.1, %src1.1
1352 %out.0 = bitcast i32 %out to float
1353 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1357 ; Check a case of one branch of an if-else requiring WQM, the other requiring
1359 ; Note: In this particular case, the save-and-restore could be avoided if the
1360 ; analysis understood that the two branches of the if-else are mutually
1362 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1363 ; GFX9-W64-LABEL: test_control_flow_0:
1364 ; GFX9-W64: ; %bb.0: ; %main_body
1365 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1366 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1367 ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1368 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1369 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1370 ; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2
1371 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
1372 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
1373 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1374 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1375 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
1376 ; GFX9-W64-NEXT: .LBB27_2: ; %Flow
1377 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
1378 ; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4
1379 ; GFX9-W64-NEXT: ; %bb.3: ; %IF
1380 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1381 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1382 ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1383 ; GFX9-W64-NEXT: .LBB27_4: ; %END
1384 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1385 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1386 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1387 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1388 ; GFX9-W64-NEXT: ; return to shader part epilog
1390 ; GFX10-W32-LABEL: test_control_flow_0:
1391 ; GFX10-W32: ; %bb.0: ; %main_body
1392 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1393 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1394 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1395 ; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
1396 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1397 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
1398 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
1399 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
1400 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1401 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1402 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
1403 ; GFX10-W32-NEXT: .LBB27_2: ; %Flow
1404 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
1405 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4
1406 ; GFX10-W32-NEXT: ; %bb.3: ; %IF
1407 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1408 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1409 ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1410 ; GFX10-W32-NEXT: .LBB27_4: ; %END
1411 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1412 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1413 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1414 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1415 ; GFX10-W32-NEXT: ; return to shader part epilog
1417 %cmp = icmp eq i32 %z, 0
1418 br i1 %cmp, label %IF, label %ELSE
1421 %c.bc = bitcast i32 %c to float
1422 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1423 %tex0 = extractelement <4 x float> %tex, i32 0
1424 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1425 %data.if = extractelement <4 x float> %dtex, i32 0
1429 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1433 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1437 ; Reverse branch order compared to the previous test.
; The branch on (%z == 0) goes to %ELSE when true and to %IF otherwise.
; %IF performs two chained image samples (the first component of the first
; sample is the coordinate of the second); %ELSE performs a struct buffer
; store of %data. The final phi merges %data.if and %data.
1438 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1439 ; GFX9-W64-LABEL: test_control_flow_1:
1440 ; GFX9-W64: ; %bb.0: ; %main_body
1441 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1442 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1443 ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1444 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1445 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1446 ; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2
1447 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
1448 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1449 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1450 ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1451 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1452 ; GFX9-W64-NEXT: .LBB28_2: ; %Flow
1453 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15]
1454 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1455 ; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1]
1456 ; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1]
1457 ; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4
1458 ; GFX9-W64-NEXT: ; %bb.3: ; %ELSE
1459 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1460 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1461 ; GFX9-W64-NEXT: .LBB28_4: ; %END
1462 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1463 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1464 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1465 ; GFX9-W64-NEXT: ; return to shader part epilog
1467 ; GFX10-W32-LABEL: test_control_flow_1:
1468 ; GFX10-W32: ; %bb.0: ; %main_body
1469 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1470 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1471 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1472 ; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
1473 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1474 ; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2
1475 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
1476 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1477 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1478 ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1479 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1480 ; GFX10-W32-NEXT: .LBB28_2: ; %Flow
1481 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13
1482 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1483 ; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0
1484 ; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1485 ; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4
1486 ; GFX10-W32-NEXT: ; %bb.3: ; %ELSE
1487 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1488 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1489 ; GFX10-W32-NEXT: .LBB28_4: ; %END
1490 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1491 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1492 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1493 ; GFX10-W32-NEXT: ; return to shader part epilog
1495 %cmp = icmp eq i32 %z, 0
1496 br i1 %cmp, label %ELSE, label %IF
1499 %c.bc = bitcast i32 %c to float
1500 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1501 %tex0 = extractelement <4 x float> %tex, i32 0
1502 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1503 %data.if = extractelement <4 x float> %dtex, i32 0
1507 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1511 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1515 ; Check that branch conditions are properly marked as needing WQM...
; %z is loaded with a struct buffer load placed between two struct buffer
; stores, and its only visible use is the fcmp that drives the branch. Each arm
; computes an integer coordinate (%coord * 3 or %coord * 4); the merged value
; is bitcast to float and used as the coordinate of the image sample.
1516 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1517 ; GFX9-W64-LABEL: test_control_flow_2:
1518 ; GFX9-W64: ; %bb.0: ; %main_body
1519 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1520 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1521 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1522 ; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
1523 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1524 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
1525 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1526 ; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
1527 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1528 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
1529 ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
1530 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1531 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1532 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1533 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
1534 ; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5
1535 ; GFX9-W64-NEXT: ; implicit-def: $vgpr5
1536 ; GFX9-W64-NEXT: ; %bb.2: ; %Flow
1537 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
1538 ; GFX9-W64-NEXT: ; %bb.3: ; %IF
1539 ; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
1540 ; GFX9-W64-NEXT: ; %bb.4: ; %END
1541 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1542 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1543 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1544 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1545 ; GFX9-W64-NEXT: ; return to shader part epilog
1547 ; GFX10-W32-LABEL: test_control_flow_2:
1548 ; GFX10-W32: ; %bb.0: ; %main_body
1549 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1550 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1551 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1552 ; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
1553 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1554 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
1555 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1556 ; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1557 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1558 ; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
1559 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1560 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1561 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
1562 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1563 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
1564 ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
1565 ; GFX10-W32-NEXT: ; implicit-def: $vgpr5
1566 ; GFX10-W32-NEXT: ; %bb.2: ; %Flow
1567 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
1568 ; GFX10-W32-NEXT: ; %bb.3: ; %IF
1569 ; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
1570 ; GFX10-W32-NEXT: ; %bb.4: ; %END
1571 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1572 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1573 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1574 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1575 ; GFX10-W32-NEXT: ; return to shader part epilog
1577 %idx.1 = extractelement <3 x i32> %idx, i32 0
1578 %data.1 = extractelement <2 x float> %data, i32 0
1579 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1581 ; The load that determines the branch (and should therefore be WQM) is
1582 ; surrounded by stores that require disabled WQM.
1583 %idx.2 = extractelement <3 x i32> %idx, i32 1
1584 %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0)
1586 %idx.3 = extractelement <3 x i32> %idx, i32 2
1587 %data.3 = extractelement <2 x float> %data, i32 1
1588 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0)
1590 %cc = fcmp ogt float %z, 0.0
1591 br i1 %cc, label %IF, label %ELSE
1594 %coord.IF = mul i32 %coord, 3
1598 %coord.ELSE = mul i32 %coord, 4
1602 %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1603 %coord.END.bc = bitcast i32 %coord.END to float
1604 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1605 ret <4 x float> %tex
1608 ; ... but only if they really do need it.
; Here the branch condition is derived from %dtex.1, the first component of the
; second of two chained image samples; the same value is also written out with
; a struct buffer store. The two arms only scale %dtex.1 by 3.0 or 4.0. In the
; expected output exec is ANDed with the saved mask before the second sample
; and no s_wqm follows it.
1609 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1610 ; GFX9-W64-LABEL: test_control_flow_3:
1611 ; GFX9-W64: ; %bb.0: ; %main_body
1612 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1613 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1614 ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1615 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1616 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1617 ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1618 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1619 ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
1620 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1621 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1622 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
1623 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1624 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3
1625 ; GFX9-W64-NEXT: ; %bb.1: ; %Flow
1626 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
1627 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4
1628 ; GFX9-W64-NEXT: .LBB30_2: ; %END
1629 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1630 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1631 ; GFX9-W64-NEXT: s_branch .LBB30_5
1632 ; GFX9-W64-NEXT: .LBB30_3: ; %ELSE
1633 ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
1634 ; GFX9-W64-NEXT: ; implicit-def: $vgpr1
1635 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
1636 ; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2
1637 ; GFX9-W64-NEXT: .LBB30_4: ; %IF
1638 ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
1639 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1640 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1641 ; GFX9-W64-NEXT: s_branch .LBB30_5
1642 ; GFX9-W64-NEXT: .LBB30_5:
1644 ; GFX10-W32-LABEL: test_control_flow_3:
1645 ; GFX10-W32: ; %bb.0: ; %main_body
1646 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1647 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1648 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1649 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1650 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1651 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1652 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1653 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1654 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1655 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1656 ; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
1657 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
1658 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3
1659 ; GFX10-W32-NEXT: ; %bb.1: ; %Flow
1660 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
1661 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4
1662 ; GFX10-W32-NEXT: .LBB30_2: ; %END
1663 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1664 ; GFX10-W32-NEXT: s_branch .LBB30_5
1665 ; GFX10-W32-NEXT: .LBB30_3: ; %ELSE
1666 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
1667 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1
1668 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
1669 ; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2
1670 ; GFX10-W32-NEXT: .LBB30_4: ; %IF
1671 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
1672 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1673 ; GFX10-W32-NEXT: s_branch .LBB30_5
1674 ; GFX10-W32-NEXT: .LBB30_5:
1676 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1677 %tex0 = extractelement <4 x float> %tex, i32 0
1678 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1679 %dtex.1 = extractelement <4 x float> %dtex, i32 0
1680 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1682 %cc = fcmp ogt float %dtex.1, 0.0
1683 br i1 %cc, label %IF, label %ELSE
1686 %tex.IF = fmul float %dtex.1, 3.0
1690 %tex.ELSE = fmul float %dtex.1, 4.0
1694 %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1698 ; Another test that failed at some point because of terminator handling.
; The entry block branches on (%y == 0). The conditional %IF block does a raw
; buffer load and writes the loaded value back with a struct buffer store; the
; code after the merge performs two chained image samples and returns the
; second one.
1699 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1700 ; GFX9-W64-LABEL: test_control_flow_4:
1701 ; GFX9-W64: ; %bb.0: ; %main_body
1702 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1703 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1704 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1705 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1706 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2
1707 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
1708 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
1709 ; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0
1710 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1
1711 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1712 ; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
1713 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
1714 ; GFX9-W64-NEXT: .LBB31_2: ; %END
1715 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1716 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1717 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1718 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1719 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1720 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1721 ; GFX9-W64-NEXT: ; return to shader part epilog
1723 ; GFX10-W32-LABEL: test_control_flow_4:
1724 ; GFX10-W32: ; %bb.0: ; %main_body
1725 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1726 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1727 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1728 ; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
1729 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2
1730 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
1731 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
1732 ; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0
1733 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1
1734 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1735 ; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
1736 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
1737 ; GFX10-W32-NEXT: .LBB31_2: ; %END
1738 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1739 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1740 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1741 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1742 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1743 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1744 ; GFX10-W32-NEXT: ; return to shader part epilog
1746 %cond = icmp eq i32 %y, 0
1747 br i1 %cond, label %IF, label %END
1750 %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1751 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
1755 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1756 %tex0 = extractelement <4 x float> %tex, i32 0
1757 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1758 ret <4 x float> %dtex
1761 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
; The llvm.amdgcn.kill call takes the result of (%z < 0.0) and sits between two
; struct buffer stores. After it, %coord2 is sampled and the first component of
; that result is the coordinate of a further sample; the returned value is the
; sum of the first sample (%tex) and that last sample (%dtex).
1762 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1763 ; GFX9-W64-LABEL: test_kill_0:
1764 ; GFX9-W64: ; %bb.0: ; %main_body
1765 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1766 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1767 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1768 ; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1769 ; GFX9-W64-NEXT: s_nop 0
1770 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1771 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1772 ; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6
1773 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
1774 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB32_2
1775 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
1776 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
1777 ; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1778 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1779 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1780 ; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1781 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1782 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11
1783 ; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
1784 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12
1785 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13
1786 ; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14
1787 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1788 ; GFX9-W64-NEXT: s_branch .LBB32_3
1789 ; GFX9-W64-NEXT: .LBB32_2:
1790 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
1791 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
1792 ; GFX9-W64-NEXT: s_endpgm
1793 ; GFX9-W64-NEXT: .LBB32_3:
1795 ; GFX10-W32-LABEL: test_kill_0:
1796 ; GFX10-W32: ; %bb.0: ; %main_body
1797 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1798 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1799 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1800 ; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1801 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1802 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1803 ; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1804 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
1805 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB32_2
1806 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
1807 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
1808 ; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1809 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1810 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1811 ; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1812 ; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
1813 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1814 ; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12
1815 ; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14
1816 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11
1817 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13
1818 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4
1819 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5
1820 ; GFX10-W32-NEXT: s_branch .LBB32_3
1821 ; GFX10-W32-NEXT: .LBB32_2:
1822 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
1823 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
1824 ; GFX10-W32-NEXT: s_endpgm
1825 ; GFX10-W32-NEXT: .LBB32_3:
1827 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1828 %idx.0 = extractelement <2 x i32> %idx, i32 0
1829 %data.0 = extractelement <2 x float> %data, i32 0
1830 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0)
1832 %z.cmp = fcmp olt float %z, 0.0
1833 call void @llvm.amdgcn.kill(i1 %z.cmp)
1835 %idx.1 = extractelement <2 x i32> %idx, i32 1
1836 %data.1 = extractelement <2 x float> %data, i32 1
1837 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1838 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1839 %tex2.0 = extractelement <4 x float> %tex2, i32 0
1840 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1841 %out = fadd <4 x float> %tex, %dtex
1843 ret <4 x float> %out
1846 ; ... but only if WQM is necessary.
; Both image samples and the raw buffer store come before the kill on
; (%z < 0.0); the only visible instruction after the kill is the return of
; %dtex. In the expected output there is no s_wqm after exec has been ANDed
; with the saved mask.
1847 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1848 ; GFX9-W64-LABEL: test_kill_1:
1849 ; GFX9-W64: ; %bb.0: ; %main_body
1850 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1851 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
1852 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1853 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
1854 ; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1855 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1856 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1857 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1858 ; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4
1859 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
1860 ; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0
1861 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB33_2
1862 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
1863 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
1864 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1865 ; GFX9-W64-NEXT: s_branch .LBB33_3
1866 ; GFX9-W64-NEXT: .LBB33_2:
1867 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
1868 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
1869 ; GFX9-W64-NEXT: s_endpgm
1870 ; GFX9-W64-NEXT: .LBB33_3:
1872 ; GFX10-W32-LABEL: test_kill_1:
1873 ; GFX10-W32: ; %bb.0: ; %main_body
1874 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1875 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
1876 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1877 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
1878 ; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1879 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1880 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1881 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1882 ; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1883 ; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0
1884 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
1885 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB33_2
1886 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
1887 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
1888 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1889 ; GFX10-W32-NEXT: s_branch .LBB33_3
1890 ; GFX10-W32-NEXT: .LBB33_2:
1891 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
1892 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
1893 ; GFX10-W32-NEXT: s_endpgm
1894 ; GFX10-W32-NEXT: .LBB33_3:
1896 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1897 %tex0 = extractelement <4 x float> %tex, i32 0
1898 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1900 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1902 %z.cmp = fcmp olt float %z, 0.0
1903 call void @llvm.amdgcn.kill(i1 %z.cmp)
1905 ret <4 x float> %dtex
1908 ; Check prolog shaders.
; The visible body is a single fadd of the two arguments. The function carries
; attribute group #5, whose definition is outside this view. The expected
; output enters WQM before the add and ANDs exec with the saved mask after it.
1909 define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1910 ; GFX9-W64-LABEL: test_prolog_1:
1911 ; GFX9-W64: ; %bb.0: ; %main_body
1912 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1913 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1914 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
1915 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1916 ; GFX9-W64-NEXT: ; return to shader part epilog
1918 ; GFX10-W32-LABEL: test_prolog_1:
1919 ; GFX10-W32: ; %bb.0: ; %main_body
1920 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1921 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1922 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
1923 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1924 ; GFX10-W32-NEXT: ; return to shader part epilog
1926 %s = fadd float %a, %b
; Loop test: an image store of %in precedes the loop. The loop header compares
; the float counter %ctr.iv against 7.0 and exits to %break when it is greater;
; otherwise the body samples with element 0 of the loop-carried vector %c.iv
; and adds 2.0 to the counter. The expected output branches on vcc
; (s_cbranch_vccz) for the loop condition and returns %c.iv.
1930 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1931 ; GFX9-W64-LABEL: test_loop_vcc:
1932 ; GFX9-W64: ; %bb.0: ; %entry
1933 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1934 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1935 ; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3
1936 ; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2
1937 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1
1938 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
1939 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1940 ; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1941 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1942 ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0
1943 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000
1944 ; GFX9-W64-NEXT: s_branch .LBB35_2
1945 ; GFX9-W64-NEXT: .LBB35_1: ; %body
1946 ; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1
1947 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1948 ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8
1949 ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4
1950 ; GFX9-W64-NEXT: .LBB35_2: ; %loop
1951 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
1952 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1953 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
1954 ; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
1955 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
1956 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
1957 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
1958 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB35_1
1959 ; GFX9-W64-NEXT: ; %bb.3:
1960 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1961 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8
1962 ; GFX9-W64-NEXT: .LBB35_4: ; %break
1963 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1964 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1965 ; GFX9-W64-NEXT: ; return to shader part epilog
1967 ; GFX10-W32-LABEL: test_loop_vcc:
1968 ; GFX10-W32: ; %bb.0: ; %entry
1969 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1970 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1971 ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0
1972 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1973 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1974 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1975 ; GFX10-W32-NEXT: s_branch .LBB35_2
1976 ; GFX10-W32-NEXT: .p2align 6
1977 ; GFX10-W32-NEXT: .LBB35_1: ; %body
1978 ; GFX10-W32-NEXT: ; in Loop: Header=BB35_2 Depth=1
1979 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1980 ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
1981 ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_4
1982 ; GFX10-W32-NEXT: .LBB35_2: ; %loop
1983 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
1984 ; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1985 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1986 ; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3
1987 ; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2
1988 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1
1989 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
1990 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB35_1
1991 ; GFX10-W32-NEXT: ; %bb.3:
1992 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1993 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8
1994 ; GFX10-W32-NEXT: .LBB35_4: ; %break
1995 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1996 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1997 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4
1998 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5
1999 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6
2000 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7
2001 ; GFX10-W32-NEXT: ; return to shader part epilog
2003 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
2007 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
2008 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
2009 %cc = fcmp ogt float %ctr.iv, 7.0
2010 br i1 %cc, label %break, label %body
2013 %c.iv0 = extractelement <4 x float> %c.iv, i32 0
2014 %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2015 %ctr.next = fadd float %ctr.iv, 2.0
2019 ret <4 x float> %c.iv
2022 ; Only intrinsic stores need exact execution -- other stores do not have
2023 ; externally visible effects and may require WQM for correctness.
; The volatile store of %a into the alloca and the %idx-indexed load from it
; are plain (non-intrinsic) accesses to addrspace(5). They are interleaved with
; raw and struct buffer store intrinsics, and the loaded value is bitcast to
; float and used as the coordinate of an image sample whose result is stored
; with a raw buffer store.
2024 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
2025 ; GFX9-W64-LABEL: test_alloca:
2026 ; GFX9-W64: ; %bb.0: ; %entry
2027 ; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2028 ; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2029 ; GFX9-W64-NEXT: s_mov_b32 s10, -1
2030 ; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000
2031 ; GFX9-W64-NEXT: s_add_u32 s8, s8, s0
2032 ; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0
2033 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2034 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2035 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2036 ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0
2037 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2038 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4
2039 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2040 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 4
2041 ; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1
2042 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
2043 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2044 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2045 ; GFX9-W64-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf
2046 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1
2047 ; GFX9-W64-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
2048 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
2049 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2050 ; GFX9-W64-NEXT: s_endpgm
2052 ; GFX10-W32-LABEL: test_alloca:
2053 ; GFX10-W32: ; %bb.0: ; %entry
2054 ; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2055 ; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2056 ; GFX10-W32-NEXT: s_mov_b32 s10, -1
2057 ; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000
2058 ; GFX10-W32-NEXT: s_add_u32 s8, s8, s0
2059 ; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0
2060 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2061 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2062 ; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4
2063 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2064 ; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0
2065 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2066 ; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4
2067 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
2068 ; GFX10-W32-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2069 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2070 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2071 ; GFX10-W32-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2072 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1
2073 ; GFX10-W32-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
2074 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2075 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2076 ; GFX10-W32-NEXT: s_endpgm
2078 %array = alloca [32 x i32], align 4, addrspace(5)
2080 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2082 store volatile i32 %a, ptr addrspace(5) %array, align 4
2084 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
2086 %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
2087 %c = load i32, ptr addrspace(5) %c.gep, align 4
2088 %c.bc = bitcast i32 %c to float
2089 %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2090 call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2095 ; Must return to exact at the end of a non-void returning shader,
2096 ; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2097 ; even if the shader has no kills, because a kill could have happened in a
2098 ; previous shader fragment.
2099 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2100 ; GFX9-W64-LABEL: test_nonvoid_return:
2101 ; GFX9-W64: ; %bb.0:
2102 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2103 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2104 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2105 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2106 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2107 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2108 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2109 ; GFX9-W64-NEXT: ; return to shader part epilog
2111 ; GFX10-W32-LABEL: test_nonvoid_return:
2112 ; GFX10-W32: ; %bb.0:
2113 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2114 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2115 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2116 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2117 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2118 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2119 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2120 ; GFX10-W32-NEXT: ; return to shader part epilog
; Two dependent samples: element 0 of %tex is the coordinate of the second
; sample, whose result is returned.
; The checks above expect exec to be saved (s_mov) and widened (s_wqm) before
; the first sample, and masked back with the saved copy (s_and) before the
; second one, so the shader returns with the original exec mask.
2121 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2122 %tex0 = extractelement <4 x float> %tex, i32 0
2123 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2124 ret <4 x float> %dtex
; Same pair of dependent samples as test_nonvoid_return, followed by a branch
; on the SGPR input %c (s_cmp/s_cbranch_scc0 in the checks): the %if side
; stores %dtex with a volatile store, the other side returns %dtex.
; No copy of exec is saved in the expected output; the exit from WQM is
; checked as an s_and of exec with itself.
2127 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2128 ; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2129 ; GFX9-W64: ; %bb.0: ; %entry
2130 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2131 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2132 ; GFX9-W64-NEXT: s_and_b64 exec, exec, exec
2133 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2134 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2135 ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
2136 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB38_2
2137 ; GFX9-W64-NEXT: ; %bb.1: ; %else
2138 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2139 ; GFX9-W64-NEXT: s_branch .LBB38_3
2140 ; GFX9-W64-NEXT: .LBB38_2: ; %if
2141 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2142 ; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
2143 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2144 ; GFX9-W64-NEXT: .LBB38_3:
2146 ; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2147 ; GFX10-W32: ; %bb.0: ; %entry
2148 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2149 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2150 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo
2151 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2152 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2153 ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
2154 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB38_2
2155 ; GFX10-W32-NEXT: ; %bb.1: ; %else
2156 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2157 ; GFX10-W32-NEXT: s_branch .LBB38_3
2158 ; GFX10-W32-NEXT: .LBB38_2: ; %if
2159 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2160 ; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
2161 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
2162 ; GFX10-W32-NEXT: .LBB38_3:
2164 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2165 %tex0 = extractelement <4 x float> %tex, i32 0
2166 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
; Branch to %if when %c > 0, otherwise to %else.
2167 %cc = icmp sgt i32 %c, 0
2168 br i1 %cc, label %if, label %else
; Volatile store of the second sample's result.
2171 store volatile <4 x float> %dtex, ptr addrspace(1) undef
2175 ret <4 x float> %dtex
2178 ; Test awareness that s_wqm_b64 clobbers SCC.
2179 define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2180 ; GFX9-W64-LABEL: test_scc:
2181 ; GFX9-W64: ; %bb.0: ; %main_body
2182 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2183 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
2184 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2185 ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
2186 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2
2187 ; GFX9-W64-NEXT: ; %bb.1: ; %else
2188 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2189 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1
2190 ; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
2191 ; GFX9-W64-NEXT: s_cbranch_execz .LBB39_3
2192 ; GFX9-W64-NEXT: s_branch .LBB39_4
2193 ; GFX9-W64-NEXT: .LBB39_2:
2194 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2195 ; GFX9-W64-NEXT: .LBB39_3: ; %if
2196 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2197 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2198 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2199 ; GFX9-W64-NEXT: .LBB39_4: ; %end
2200 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
2201 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0
2202 ; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
2203 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2204 ; GFX9-W64-NEXT: ; return to shader part epilog
2206 ; GFX10-W32-LABEL: test_scc:
2207 ; GFX10-W32: ; %bb.0: ; %main_body
2208 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
2209 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
2210 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2211 ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
2212 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
2213 ; GFX10-W32-NEXT: ; %bb.1: ; %else
2214 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2215 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1
2216 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2217 ; GFX10-W32-NEXT: s_cbranch_execz .LBB39_3
2218 ; GFX10-W32-NEXT: s_branch .LBB39_4
2219 ; GFX10-W32-NEXT: .LBB39_2:
2220 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2221 ; GFX10-W32-NEXT: .LBB39_3: ; %if
2222 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2223 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2224 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2225 ; GFX10-W32-NEXT: .LBB39_4: ; %end
2226 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
2227 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0
2228 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
2229 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2230 ; GFX10-W32-NEXT: ; return to shader part epilog
; %sel is an SGPR (inreg) input, so this compare is selected as s_cmp_lt_i32,
; which writes SCC. The checks expect the s_cmp to come after s_wqm so that
; s_cbranch_scc0 reads the compare's SCC value.
2232 %cc = icmp sgt i32 %sel, 0
2233 br i1 %cc, label %if, label %else
; Each side of the branch does one sample: 1D in %if, 2D in %else.
2236 %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2240 %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2244 %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
; The checks expect exec to be masked back with the saved copy (s_and) at the
; start of %end, before this buffer store.
2245 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2249 ; Check a case of a block being entirely WQM except for a bit of WWM.
2250 ; There was a bug where it forgot to enter and leave WWM.
2251 define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2252 ; GFX9-W64-LABEL: test_wwm_within_wqm:
2253 ; GFX9-W64: ; %bb.0: ; %main_body
2254 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2255 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2256 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2257 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2258 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2259 ; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2
2260 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2261 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2262 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2263 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2264 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2265 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
2266 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2267 ; GFX9-W64-NEXT: s_not_b64 exec, exec
2268 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
2269 ; GFX9-W64-NEXT: s_not_b64 exec, exec
2270 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2271 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2272 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2273 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2274 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2275 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
2276 ; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF
2277 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2278 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2279 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2280 ; GFX9-W64-NEXT: ; return to shader part epilog
2282 ; GFX10-W32-LABEL: test_wwm_within_wqm:
2283 ; GFX10-W32: ; %bb.0: ; %main_body
2284 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2285 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2286 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2287 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
2288 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
2289 ; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2
2290 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2291 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2292 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2293 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2294 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2295 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
2296 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2297 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
2298 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
2299 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
2300 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2301 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2302 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2303 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2304 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2305 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
2306 ; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF
2307 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2308 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2309 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2310 ; GFX10-W32-NEXT: ; return to shader part epilog
; Divergent branch on the VGPR input %z; the work below only runs in %IF.
2312 %cmp = icmp eq i32 %z, 0
2313 br i1 %cmp, label %IF, label %ENDIF
; Two dependent samples (the first result's element 0 is the second sample's
; coordinate).
2316 %c.bc = bitcast i32 %c to float
2317 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2318 %tex0 = extractelement <4 x float> %tex, i32 0
2319 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2320 %dataf = extractelement <4 x float> %dtex, i32 0
2321 %data1 = fptosi float %dataf to i32
; set.inactive -> ds.swizzle -> wwm chain. The checks expect the value 0 to be
; written between a pair of s_not on exec, and the swizzle (printed as
; swizzle(SWAP,2) for the immediate 2079) to sit between s_or_saveexec -1 and
; the s_mov that restores exec.
2322 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2323 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2324 %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2325 %data4f = sitofp i32 %data4 to float
; Result is the converted WWM value from %IF, or 0.0 when %IF was skipped.
2329 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2333 ; Check that WWM is triggered by the strict_wwm intrinsic.
2334 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2335 ; GFX9-W64-LABEL: test_strict_wwm1:
2336 ; GFX9-W64: ; %bb.0: ; %main_body
2337 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2338 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2339 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
2340 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2341 ; GFX9-W64-NEXT: s_nop 0
2342 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2343 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2344 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
2345 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2346 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2347 ; GFX9-W64-NEXT: ; return to shader part epilog
2349 ; GFX10-W32-LABEL: test_strict_wwm1:
2350 ; GFX10-W32: ; %bb.0: ; %main_body
2351 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2352 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2353 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
2354 ; GFX10-W32-NEXT: s_clause 0x1
2355 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2356 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2357 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2358 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
2359 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2360 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2361 ; GFX10-W32-NEXT: ; return to shader part epilog
; Both loads and the fadd feed the strict.wwm call, so the checks expect all
; three between s_or_saveexec -1 and the s_mov that restores exec. Only the
; final copy into v0 comes after the restore.
2363 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2364 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2365 %out = fadd float %src0, %src1
2366 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2370 ; Same as above, but with an integer type.
2371 define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2372 ; GFX9-W64-LABEL: test_strict_wwm2:
2373 ; GFX9-W64: ; %bb.0: ; %main_body
2374 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2375 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2376 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
2377 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2378 ; GFX9-W64-NEXT: s_nop 0
2379 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2380 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2381 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
2382 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2383 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2384 ; GFX9-W64-NEXT: ; return to shader part epilog
2386 ; GFX10-W32-LABEL: test_strict_wwm2:
2387 ; GFX10-W32: ; %bb.0: ; %main_body
2388 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2389 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2390 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
2391 ; GFX10-W32-NEXT: s_clause 0x1
2392 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2393 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2394 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2395 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
2396 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2397 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2398 ; GFX10-W32-NEXT: ; return to shader part epilog
; Integer variant of test_strict_wwm1: the two loaded floats are reinterpreted
; as i32 and added, the sum goes through strict.wwm.i32, and the result is
; reinterpreted back to float. The checks expect the loads and the integer add
; between s_or_saveexec -1 and the s_mov that restores exec.
2400 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2401 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2402 %src0.0 = bitcast float %src0 to i32
2403 %src1.0 = bitcast float %src1 to i32
2404 %out = add i32 %src0.0, %src1.0
2405 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2406 %out.1 = bitcast i32 %out.0 to float
2410 ; Check that we don't leave WWM on for computations that don't require WWM,
2411 ; since that will lead to clobbering things that aren't supposed to be clobbered
2412 ; in cases like this.
2413 ; We enforce this by checking that v_add gets emitted in the same block as
2415 define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2416 ; GFX9-W64-LABEL: test_strict_wwm3:
2417 ; GFX9-W64: ; %bb.0: ; %main_body
2418 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2419 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2420 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2421 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2422 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
2423 ; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2
2424 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2425 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2426 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2427 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2428 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2429 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
2430 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2431 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2432 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
2433 ; GFX9-W64-NEXT: .LBB43_2: ; %endif
2434 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
2435 ; GFX9-W64-NEXT: ; return to shader part epilog
2437 ; GFX10-W32-LABEL: test_strict_wwm3:
2438 ; GFX10-W32: ; %bb.0: ; %main_body
2439 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2440 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2441 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2442 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2443 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
2444 ; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2
2445 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2446 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2447 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2448 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2449 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2450 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
2451 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2452 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2453 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
2454 ; GFX10-W32-NEXT: .LBB43_2: ; %endif
2455 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
2456 ; GFX10-W32-NEXT: ; return to shader part epilog
2458 ; use mbcnt to make sure the branch is divergent
2459 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2460 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2461 %cc = icmp uge i32 %hi, 16
2462 br i1 %cc, label %endif, label %if
; The load and the first fadd feed strict.wwm; the checks expect them between
; s_or_saveexec -1 and the s_mov that restores exec.
2465 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2466 %out = fadd float %src, %src
2467 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
; This second fadd only consumes the strict.wwm result. The checks expect its
; v_add_f32 after exec has been restored, but still before the %endif label.
2468 %out.1 = fadd float %src, %out.0
2472 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2476 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2477 ; write could clobber disabled channels in the non-WWM one.
2478 ; We enforce this by checking that v_mov gets emitted in the same block as
2480 define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2481 ; GFX9-W64-LABEL: test_strict_wwm4:
2482 ; GFX9-W64: ; %bb.0: ; %main_body
2483 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2484 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2485 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2486 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2487 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
2488 ; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2
2489 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2490 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2491 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2492 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2493 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2494 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
2495 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2496 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2497 ; GFX9-W64-NEXT: .LBB44_2: ; %endif
2498 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
2499 ; GFX9-W64-NEXT: ; return to shader part epilog
2501 ; GFX10-W32-LABEL: test_strict_wwm4:
2502 ; GFX10-W32: ; %bb.0: ; %main_body
2503 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2504 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2505 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2506 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2507 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
2508 ; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2
2509 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2510 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2511 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2512 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2513 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2514 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
2515 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2516 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2517 ; GFX10-W32-NEXT: .LBB44_2: ; %endif
2518 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
2519 ; GFX10-W32-NEXT: ; return to shader part epilog
2521 ; use mbcnt to make sure the branch is divergent
2522 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2523 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2524 %cc = icmp uge i32 %hi, 16
2525 br i1 %cc, label %endif, label %if
2528 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2529 %out = fadd float %src, %src
2530 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
; The strict.wwm result reaches the phi directly. The checks expect the WWM
; add to target v1 and a separate v_mov_b32 (v0 <- v1), emitted after exec is
; restored and before the %endif label, to produce the phi value in v0.
2534 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2538 ; Make sure the transition from Exact to WWM then WQM works properly.
2539 define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2540 ; GFX9-W64-LABEL: test_strict_wwm5:
2541 ; GFX9-W64: ; %bb.0: ; %main_body
2542 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2543 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
2544 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
2545 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2546 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
2547 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2548 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
2549 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2550 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2551 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
2552 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2553 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2554 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2555 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
2556 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2557 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
2558 ; GFX9-W64-NEXT: ; return to shader part epilog
2560 ; GFX10-W32-LABEL: test_strict_wwm5:
2561 ; GFX10-W32: ; %bb.0: ; %main_body
2562 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
2563 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
2564 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
2565 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2566 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
2567 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2568 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2569 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
2570 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2571 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2572 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2573 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
2574 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2575 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2576 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2577 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
2578 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2579 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
2580 ; GFX10-W32-NEXT: ; return to shader part epilog
; Part 1: load/store through %idx0. The checks expect the buffer_store to run
; with the original exec (not inside an s_or_saveexec -1 region).
2582 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2583 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
; Part 2: load through %idx1 and the first fadd feed strict.wwm. The checks
; expect them between s_or_saveexec -1 and the s_mov that restores exec.
2584 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2585 %temp = fadd float %src1, %src1
2586 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
; Part 3: the second fadd feeds llvm.amdgcn.wqm. The checks expect s_wqm right
; after exec is restored, and a final s_and with the exec copy saved at
; function entry.
2587 %out = fadd float %temp.0, %temp.0
2588 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2592 ; Check that WWM is turned on correctly across basic block boundaries.
2593 ; if..then..endif version
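; NOTE: SI-CHECK and VI-CHECK are not check prefixes of the RUN lines at the top of this
; file (GFX9-W64 and GFX10-W32), so FileCheck ignores the next four lines.
2594 ;SI-CHECK: buffer_load_dword
2595 ;VI-CHECK: flat_load_dword
2596 ;SI-CHECK: buffer_load_dword
2597 ;VI-CHECK: flat_load_dword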
2598 define amdgpu_ps float @test_strict_wwm6_then() {
2599 ; GFX9-W64-LABEL: test_strict_wwm6_then:
2600 ; GFX9-W64: ; %bb.0: ; %main_body
2601 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2602 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
2603 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2604 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2605 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2606 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2607 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2608 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2609 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
2610 ; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
2611 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2612 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2613 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
2614 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2615 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
2616 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2617 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2618 ; GFX9-W64-NEXT: .LBB46_2: ; %endif
2619 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
2620 ; GFX9-W64-NEXT: ; return to shader part epilog
2622 ; GFX10-W32-LABEL: test_strict_wwm6_then:
2623 ; GFX10-W32: ; %bb.0: ; %main_body
2624 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2625 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
2626 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2627 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2628 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2629 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2630 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2631 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2632 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
2633 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
2634 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2635 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2636 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
2637 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2638 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
2639 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2640 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2641 ; GFX10-W32-NEXT: .LBB46_2: ; %endif
2642 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
2643 ; GFX10-W32-NEXT: ; return to shader part epilog
; %src0 is loaded in the entry block but is only used by the strict.wwm
; computation in %if. The checks expect this load to already sit between
; s_or_saveexec -1 and the restoring s_mov in %bb.0.
2645 %src0 = load volatile float, ptr addrspace(1) undef
2646 ; use mbcnt to make sure the branch is divergent
2647 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2648 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2649 %cc = icmp uge i32 %hi, 16
2650 br i1 %cc, label %endif, label %if
; Second operand is loaded inside %if; the checks expect this load and the
; fadd in a second s_or_saveexec -1 region.
2653 %src1 = load volatile float, ptr addrspace(1) undef
2654 %out = fadd float %src0, %src1
2655 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2659 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2663 ; Check that WWM is turned on correctly across basic block boundaries.
2665 define amdgpu_ps float @test_strict_wwm6_loop() {
2666 ; GFX9-W64-LABEL: test_strict_wwm6_loop:
2667 ; GFX9-W64: ; %bb.0: ; %main_body
2668 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2669 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
2670 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2671 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2672 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2673 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
2674 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
2675 ; GFX9-W64-NEXT: .LBB47_1: ; %loop
2676 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
2677 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2678 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
2679 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2680 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2681 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
2682 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
2683 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2684 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
2685 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2686 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2687 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2688 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
2689 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1
2690 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
2691 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
2692 ; GFX9-W64-NEXT: ; return to shader part epilog
2694 ; GFX10-W32-LABEL: test_strict_wwm6_loop:
2695 ; GFX10-W32: ; %bb.0: ; %main_body
2696 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2697 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
2698 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2699 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2700 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2701 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
2702 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
2703 ; GFX10-W32-NEXT: .LBB47_1: ; %loop
2704 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
2705 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2706 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
2707 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2708 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2709 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
2710 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2711 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
2712 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2713 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
2714 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2715 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
2716 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
2717 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1
2718 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
2719 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
2720 ; GFX10-W32-NEXT: ; return to shader part epilog
; %src0 is loaded once before the loop and used by the strict.wwm computation
; inside it. The checks expect this load between s_or_saveexec -1 and the
; restoring s_mov in %bb.0.
2722 %src0 = load volatile float, ptr addrspace(1) undef
2723 ; use mbcnt to make sure the branch is divergent
2724 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2725 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
; Loop body: the per-lane counter starts at the mbcnt result and counts down
; to zero.
2729 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
2730 %src1 = load volatile float, ptr addrspace(1) undef
2731 %out = fadd float %src0, %src1
2732 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
; The counter update and compare only control the loop. The checks expect
; them outside the s_or_saveexec -1 regions that wrap the in-loop load and
; the fadd.
2733 %counter.1 = sub i32 %counter, 1
2734 %cc = icmp ne i32 %counter.1, 0
2735 br i1 %cc, label %loop, label %endloop
2741 ; Check that @llvm.amdgcn.set.inactive disables WWM.
2742 define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2743 ; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2744 ; GFX9-W64: ; %bb.0: ; %main_body
2745 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2746 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
2747 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2748 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2749 ; GFX9-W64-NEXT: s_not_b64 exec, exec
2750 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2751 ; GFX9-W64-NEXT: s_not_b64 exec, exec
2752 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2753 ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
2754 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2755 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2756 ; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
2757 ; GFX9-W64-NEXT: s_endpgm
2759 ; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2760 ; GFX10-W32: ; %bb.0: ; %main_body
2761 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2762 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
2763 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2764 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2765 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
2766 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2767 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
2768 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2769 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
2770 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2771 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2772 ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
2773 ; GFX10-W32-NEXT: s_endpgm
; The loaded value goes through set.inactive (inactive-lane value 0) before the
; add that feeds strict.wwm. The checks expect the load, the copy of the loaded
; value and the write of 0 (between the two s_not on exec) to come before
; s_or_saveexec -1. Only the add sits inside the saveexec/restore pair.
2775 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2776 %src.0 = bitcast float %src to i32
2777 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
2778 %out = add i32 %src.1, %src.1
2779 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2780 %out.1 = bitcast i32 %out.0 to float
; The store is expected after exec has been restored.
2781 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2785 ; Check a case of a block being entirely WQM except for a bit of WWM.
2786 ; There was a bug where it forgot to enter and leave WWM.
; The %IF block chains two image samples and then runs
; set.inactive -> ds.swizzle -> strict.wwm. The expected assembly for both
; targets wraps the ds_swizzle in "s_or_saveexec ..., -1" and restores exec
; from the saved copy immediately after it.
2787 define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2788 ; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2789 ; GFX9-W64: ; %bb.0: ; %main_body
2790 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2791 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2792 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2793 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2794 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2795 ; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2
2796 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2797 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2798 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2799 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2800 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2801 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
2802 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2803 ; GFX9-W64-NEXT: s_not_b64 exec, exec
2804 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
2805 ; GFX9-W64-NEXT: s_not_b64 exec, exec
2806 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2807 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2808 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2809 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2810 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2811 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
2812 ; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF
2813 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2814 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2815 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2816 ; GFX9-W64-NEXT: ; return to shader part epilog
2818 ; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2819 ; GFX10-W32: ; %bb.0: ; %main_body
2820 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2821 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2822 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2823 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
2824 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
2825 ; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2
2826 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2827 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2828 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2829 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2830 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2831 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
2832 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2833 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
2834 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
2835 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
2836 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2837 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2838 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2839 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2840 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2841 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
2842 ; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF
2843 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2844 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2845 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2846 ; GFX10-W32-NEXT: ; return to shader part epilog
; Only lanes with %z == 0 go to %IF; the phi in %ENDIF yields 0.0 for the rest.
2848 %cmp = icmp eq i32 %z, 0
2849 br i1 %cmp, label %IF, label %ENDIF
; Dependent sample chain: element 0 of the first sample is the coordinate of
; the second sample.
2852 %c.bc = bitcast i32 %c to float
2853 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2854 %tex0 = extractelement <4 x float> %tex, i32 0
2855 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2856 %dataf = extractelement <4 x float> %dtex, i32 0
2857 %data1 = fptosi float %dataf to i32
; The "bit of WWM": set.inactive supplies 0 as the value for inactive lanes,
; the result is swizzled (offset 2079 is printed as swizzle(SWAP,2) in the
; expected assembly) and then passed through strict.wwm.
2858 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2859 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2860 %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
2861 %data4f = sitofp i32 %data4 to float
2865 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2869 ; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
; Same shape as the strict.wwm variant of this test, but without set.inactive
; and with strict.wqm marking the swizzled value. The expected assembly for
; %IF contains no write to exec around the ds_swizzle.
2870 define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2871 ; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2872 ; GFX9-W64: ; %bb.0: ; %main_body
2873 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2874 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2875 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2876 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2877 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2878 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2879 ; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
2880 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2881 ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2882 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2883 ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2884 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2885 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
2886 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2887 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2888 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2889 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
2890 ; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF
2891 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2892 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2893 ; GFX9-W64-NEXT: ; return to shader part epilog
2895 ; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2896 ; GFX10-W32: ; %bb.0: ; %main_body
2897 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2898 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2899 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2900 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2901 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2902 ; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
2903 ; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
2904 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2905 ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2906 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2907 ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2908 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2909 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
2910 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2911 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2912 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2913 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
2914 ; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF
2915 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2916 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2917 ; GFX10-W32-NEXT: ; return to shader part epilog
; Only lanes with %z == 0 go to %IF; the phi in %ENDIF yields 0.0 for the rest.
2919 %cmp = icmp eq i32 %z, 0
2920 br i1 %cmp, label %IF, label %ENDIF
; Dependent sample chain: element 0 of the first sample is the coordinate of
; the second sample.
2923 %c.bc = bitcast i32 %c to float
2924 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2925 %tex0 = extractelement <4 x float> %tex, i32 0
2926 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2927 %dataf = extractelement <4 x float> %dtex, i32 0
2928 %data1 = fptosi float %dataf to i32
; The "bit of STRICT WQM": the swizzled value is passed through strict.wqm.
2929 %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
2930 %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
2931 %data3f = sitofp i32 %data3 to float
2935 %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
2939 ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
; The expected assembly shows the redundancy the TODO refers to: after the
; first v_add_f32, exec is restored from the saved copy and s_wqm is issued
; again on the very next instruction.
; The body stores %inp, reloads it, and combines a strict.wqm value with a
; strict.wwm value into the coordinate of an image sample.
2940 define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
2941 ; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
2942 ; GFX9-W64: ; %bb.0: ; %main_body
2943 ; GFX9-W64-NEXT: s_mov_b64 s[28:29], exec
2944 ; GFX9-W64-NEXT: s_mov_b32 s19, s17
2945 ; GFX9-W64-NEXT: s_mov_b64 s[30:31], exec
2946 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2947 ; GFX9-W64-NEXT: s_mov_b32 s23, s5
2948 ; GFX9-W64-NEXT: s_mov_b32 s22, s4
2949 ; GFX9-W64-NEXT: s_mov_b32 s21, s3
2950 ; GFX9-W64-NEXT: s_mov_b32 s20, s2
2951 ; GFX9-W64-NEXT: s_mov_b32 s27, s9
2952 ; GFX9-W64-NEXT: s_mov_b32 s26, s8
2953 ; GFX9-W64-NEXT: s_mov_b32 s25, s7
2954 ; GFX9-W64-NEXT: s_mov_b32 s24, s6
2955 ; GFX9-W64-NEXT: s_mov_b32 s18, s16
2956 ; GFX9-W64-NEXT: s_mov_b32 s17, s15
2957 ; GFX9-W64-NEXT: s_mov_b32 s16, s14
2958 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
2959 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
2960 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
2961 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
2962 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
2963 ; GFX9-W64-NEXT: s_mov_b64 exec, s[30:31]
2964 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
2965 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2966 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2967 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
2968 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2969 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2970 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, s0
2971 ; GFX9-W64-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
2972 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2973 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2974 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2975 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
2976 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
2977 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2978 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2979 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2980 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
2981 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2982 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
2983 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
2984 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[28:29]
2985 ; GFX9-W64-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
2986 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2987 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
2988 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
2989 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2990 ; GFX9-W64-NEXT: ; return to shader part epilog
2992 ; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
2993 ; GFX10-W32: ; %bb.0: ; %main_body
2994 ; GFX10-W32-NEXT: s_mov_b32 s28, exec_lo
2995 ; GFX10-W32-NEXT: s_mov_b32 s19, s17
2996 ; GFX10-W32-NEXT: s_mov_b32 s29, exec_lo
2997 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2998 ; GFX10-W32-NEXT: s_mov_b32 s23, s5
2999 ; GFX10-W32-NEXT: s_mov_b32 s22, s4
3000 ; GFX10-W32-NEXT: s_mov_b32 s21, s3
3001 ; GFX10-W32-NEXT: s_mov_b32 s20, s2
3002 ; GFX10-W32-NEXT: s_mov_b32 s27, s9
3003 ; GFX10-W32-NEXT: s_mov_b32 s26, s8
3004 ; GFX10-W32-NEXT: s_mov_b32 s25, s7
3005 ; GFX10-W32-NEXT: s_mov_b32 s24, s6
3006 ; GFX10-W32-NEXT: s_mov_b32 s18, s16
3007 ; GFX10-W32-NEXT: s_mov_b32 s17, s15
3008 ; GFX10-W32-NEXT: s_mov_b32 s16, s14
3009 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3010 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3011 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3012 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3013 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
3014 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s29
3015 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3016 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
3017 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3018 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
3019 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
3020 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
3021 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, s0
3022 ; GFX10-W32-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
3023 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
3024 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3025 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3026 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3027 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
3028 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3029 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3030 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
3031 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3032 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
3033 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3034 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
3035 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s28
3036 ; GFX10-W32-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
3037 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3038 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3039 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
3040 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3041 ; GFX10-W32-NEXT: ; return to shader part epilog
3043 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
; Strict WQM part: the value reloaded from %res at %idx1 is doubled and
; passed through strict.wqm.
3044 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3045 %temp = fadd float %reload, %reload
3046 %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3047 %temp3 = fadd float %temp2, %temp2
; Strict WWM part: a second load, from %res2 at %idx0, is passed through
; strict.wwm.
3048 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0)
3049 %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
3050 %temp5 = fadd float %temp3, %temp4
; %res is reinterpreted as <4 x i32> and reused as the sampler operand of the
; image sample whose coordinate is %temp5.
3051 %res.int = ptrtoint ptr addrspace(8) %res to i128
3052 %res.vec = bitcast i128 %res.int to <4 x i32>
3053 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3054 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3055 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
; Check a strict.wwm value followed by a strict.wqm value, both feeding the
; coordinate of an image sample. In the expected assembly the load feeding
; strict.wwm (into v2) runs under "s_or_saveexec ..., -1" and the load feeding
; strict.wqm (into v3) runs under s_wqm; exec is restored after each.
3059 define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3060 ; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3061 ; GFX9-W64: ; %bb.0: ; %main_body
3062 ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
3063 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3064 ; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
3065 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3066 ; GFX9-W64-NEXT: s_mov_b32 s19, s5
3067 ; GFX9-W64-NEXT: s_mov_b32 s18, s4
3068 ; GFX9-W64-NEXT: s_mov_b32 s17, s3
3069 ; GFX9-W64-NEXT: s_mov_b32 s16, s2
3070 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3071 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3072 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3073 ; GFX9-W64-NEXT: s_mov_b32 s11, s9
3074 ; GFX9-W64-NEXT: s_mov_b32 s10, s8
3075 ; GFX9-W64-NEXT: s_mov_b32 s9, s7
3076 ; GFX9-W64-NEXT: s_mov_b32 s8, s6
3077 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3078 ; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
3079 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3080 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
3081 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
3082 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
3083 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3084 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3085 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3086 ; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
3087 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3088 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
3089 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3090 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
3091 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3092 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3093 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
3094 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3095 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3096 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
3097 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
3098 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
3099 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3100 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3101 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3102 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3103 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3104 ; GFX9-W64-NEXT: ; return to shader part epilog
3106 ; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3107 ; GFX10-W32: ; %bb.0: ; %main_body
3108 ; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
3109 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3110 ; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
3111 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3112 ; GFX10-W32-NEXT: s_mov_b32 s19, s5
3113 ; GFX10-W32-NEXT: s_mov_b32 s18, s4
3114 ; GFX10-W32-NEXT: s_mov_b32 s17, s3
3115 ; GFX10-W32-NEXT: s_mov_b32 s16, s2
3116 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3117 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3118 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3119 ; GFX10-W32-NEXT: s_mov_b32 s11, s9
3120 ; GFX10-W32-NEXT: s_mov_b32 s10, s8
3121 ; GFX10-W32-NEXT: s_mov_b32 s9, s7
3122 ; GFX10-W32-NEXT: s_mov_b32 s8, s6
3123 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
3124 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
3125 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3126 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
3127 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3128 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3129 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3130 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
3131 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3132 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3133 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3134 ; GFX10-W32-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
3135 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3136 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3137 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3138 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
3139 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3140 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3141 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
3142 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3143 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
3144 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3145 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
3146 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3147 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3148 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3149 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3150 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3151 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3152 ; GFX10-W32-NEXT: ; return to shader part epilog
3154 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
; Strict WWM part: the value loaded from %res at %idx1 is doubled and passed
; through strict.wwm.
3155 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3156 %temp = fadd float %reload, %reload
3157 %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3158 %temp3 = fadd float %temp2, %temp2
; Strict WQM part: the value loaded from %res at %idx0 is passed through
; strict.wqm.
; NOTE(review): despite its name, %reload_wwm feeds strict.wqm here (the
; strict.wwm input is %reload); the name looks like a copy-paste leftover.
3159 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3160 %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3161 %temp5 = fadd float %temp3, %temp4
; %res is reinterpreted as <4 x i32> and reused as the sampler operand of the
; image sample whose coordinate is %temp5.
3162 %res.int = ptrtoint ptr addrspace(8) %res to i128
3163 %res.vec = bitcast i128 %res.int to <4 x i32>
3164 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3165 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3166 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3170 ;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
; The body has two image samples with a strict.wqm value in between: the first
; sample's result is doubled, added to the strict.wqm value, and the sum is
; the coordinate of the second sample.
3171 define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3172 ; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3173 ; GFX9-W64: ; %bb.0: ; %main_body
3174 ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
3175 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3176 ; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
3177 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3178 ; GFX9-W64-NEXT: s_mov_b32 s19, s5
3179 ; GFX9-W64-NEXT: s_mov_b32 s18, s4
3180 ; GFX9-W64-NEXT: s_mov_b32 s17, s3
3181 ; GFX9-W64-NEXT: s_mov_b32 s16, s2
3182 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3183 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3184 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3185 ; GFX9-W64-NEXT: s_mov_b32 s11, s9
3186 ; GFX9-W64-NEXT: s_mov_b32 s10, s8
3187 ; GFX9-W64-NEXT: s_mov_b32 s9, s7
3188 ; GFX9-W64-NEXT: s_mov_b32 s8, s6
3189 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3190 ; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
3191 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3192 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3193 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
3194 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
3195 ; GFX9-W64-NEXT: s_nop 0
3196 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3197 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3198 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3199 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3200 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3201 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
3202 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3203 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3204 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3
3205 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
3206 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3207 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3208 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3209 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3210 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3211 ; GFX9-W64-NEXT: ; return to shader part epilog
3213 ; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3214 ; GFX10-W32: ; %bb.0: ; %main_body
3215 ; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
3216 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3217 ; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
3218 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3219 ; GFX10-W32-NEXT: s_mov_b32 s19, s5
3220 ; GFX10-W32-NEXT: s_mov_b32 s18, s4
3221 ; GFX10-W32-NEXT: s_mov_b32 s17, s3
3222 ; GFX10-W32-NEXT: s_mov_b32 s16, s2
3223 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3224 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3225 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3226 ; GFX10-W32-NEXT: s_mov_b32 s11, s9
3227 ; GFX10-W32-NEXT: s_mov_b32 s10, s8
3228 ; GFX10-W32-NEXT: s_mov_b32 s9, s7
3229 ; GFX10-W32-NEXT: s_mov_b32 s8, s6
3230 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
3231 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
3232 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3233 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, s1
3234 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3235 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3236 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3237 ; GFX10-W32-NEXT: s_clause 0x1
3238 ; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
3239 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3240 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3241 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3242 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3243 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
3244 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3245 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3246 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3247 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3
3248 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3249 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3250 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3251 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3252 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3253 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3254 ; GFX10-W32-NEXT: ; return to shader part epilog
3256 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
; First sample: its coordinate is the doubled value loaded from %res at %idx1;
; %res, reinterpreted as <4 x i32>, is reused as the sampler operand.
3257 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3258 %temp = fadd float %reload, %reload
3259 %res.int = ptrtoint ptr addrspace(8) %res to i128
3260 %res.vec = bitcast i128 %res.int to <4 x i32>
3261 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3262 %temp2 = fadd float %tex, %tex
; Strict WQM part: the value loaded from %res at %idx0 is passed through
; strict.wqm.
; NOTE(review): despite its name, %reload_wwm feeds strict.wqm; no WWM
; intrinsic is used in this function.
3263 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3264 %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3265 %temp4 = fadd float %temp2, %temp3
; Second sample: its coordinate combines the first sample's result with the
; strict.wqm value.
3266 %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3267 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3268 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3272 ; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for
3273 ; vector comparisons in Wave32 mode.
; The expected wave32 assembly writes the comparison result to vcc_lo and
; combines it with exec_lo using s_andn2_b32; the wave64 assembly uses vcc,
; exec and s_andn2_b64.
3274 define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
3275 ; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
3276 ; GFX9-W64: ; %bb.0: ; %main_body
3277 ; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac
3278 ; GFX9-W64-NEXT: s_mov_b32 s2, 32
3279 ; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000
3280 ; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
3281 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3282 ; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
3283 ; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
3284 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB54_1
3285 ; GFX9-W64-NEXT: s_endpgm
3286 ; GFX9-W64-NEXT: .LBB54_1:
3287 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3288 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
3289 ; GFX9-W64-NEXT: s_endpgm
3291 ; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
3292 ; GFX10-W32: ; %bb.0: ; %main_body
3293 ; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac
3294 ; GFX10-W32-NEXT: s_mov_b32 s2, 32
3295 ; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000
3296 ; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
3297 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3298 ; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
3299 ; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
3300 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB54_1
3301 ; GFX10-W32-NEXT: s_endpgm
3302 ; GFX10-W32-NEXT: .LBB54_1:
3303 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3304 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
3305 ; GFX10-W32-NEXT: s_endpgm
; Build the <4 x i32> operand of s.buffer.load: element 0 is the incoming
; pointer, elements 1..3 are the constants 32768 (0x8000), 32 and
; 822177708 (0x31016fac) that appear as s_mov_b32 immediates above.
3307 %1 = ptrtoint ptr addrspace(6) %0 to i32
3308 %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
3309 %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
; The loaded scalar is compared against 0.0 (unordered greater-than) and the
; result is the condition passed to llvm.amdgcn.kill.
3310 %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
3311 call void @llvm.amdgcn.kill(i1 %4) #1
; Declarations of the AMDGPU intrinsics called by the test functions.
3315 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3316 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
; Buffer intrinsics that take the resource as <4 x i32>.
; NOTE(review): the buffer stores in this group and the next, and
; interp.p1/p2 further down, are declared with #2, which this file defines as
; { nounwind readonly }. That is surprising for stores; confirm whether #1 was
; intended.
3318 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3319 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3320 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3321 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3322 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3323 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
; Buffer intrinsics that take the resource as ptr addrspace(8).
3325 declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3326 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3327 declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
3328 declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
3329 declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
3330 declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
; Image load and sample intrinsics.
3332 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3333 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3334 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3335 declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
; Kill, the wqm/wwm/strict.* markers, set.inactive and remaining helpers.
3336 declare void @llvm.amdgcn.kill(i1) #1
3337 declare float @llvm.amdgcn.wqm.f32(float) #3
3338 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3339 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3340 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3341 declare float @llvm.amdgcn.wwm.f32(float) #3
3342 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3343 declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3344 declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3345 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3346 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3347 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3348 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3349 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3350 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3351 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
; ds.swizzle is declared without an attribute group.
3352 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3353 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
; Attribute groups referenced by the definitions, declarations and call sites
; in this file.
3355 attributes #1 = { nounwind }
3356 attributes #2 = { nounwind readonly }
3357 attributes #3 = { nounwind readnone }
; #4 differs from #3 only by adding convergent; the set.inactive declaration
; uses it.
3358 attributes #4 = { nounwind readnone convergent }
3359 attributes #5 = { "amdgpu-ps-wqm-outputs" }
3360 attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3361 attributes #7 = { nounwind readnone willreturn }