1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s
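; The two RUN lines exercise the wave64 (GFX9-W64) and wave32 (GFX10-W32)
; encodings of the same exec-mask manipulation. As background for the checks
; below: WQM (whole quad mode) enables all four lanes of any pixel quad that
; has at least one live lane, which is what derivative and implicit-LOD
; sampling computations rely on, while WWM (whole wave mode) enables every
; lane of the wave. The strict variants, as the later tests show, save and
; restore exec around just the marked computation instead of being merged with
; surrounding WQM regions.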
5 ; Check that WQM isn't triggered by image load/store intrinsics.
6 define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
7 ; GFX9-W64-LABEL: test1:
8 ; GFX9-W64: ; %bb.0: ; %main_body
9 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
10 ; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
11 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
12 ; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
13 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
14 ; GFX9-W64-NEXT: ; return to shader part epilog
16 ; GFX10-W32-LABEL: test1:
17 ; GFX10-W32: ; %bb.0: ; %main_body
18 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
19 ; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
20 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
21 ; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
22 ; GFX10-W32-NEXT: ; return to shader part epilog
24 %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
25 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
29 ; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
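; The pattern checked below is: copy exec to a scratch SGPR, enter WQM with
; s_wqm, run the v_interp instructions that produce the sample coordinates
; under WQM, then s_and exec with the saved mask so the remaining code runs
; only on live lanes.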
30 define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
31 ; GFX9-W64-LABEL: test2:
32 ; GFX9-W64: ; %bb.0: ; %main_body
33 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
34 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
35 ; GFX9-W64-NEXT: s_mov_b32 m0, s3
36 ; GFX9-W64-NEXT: s_nop 0
37 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
38 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
39 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
40 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
41 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
42 ; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
43 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
44 ; GFX9-W64-NEXT: ; return to shader part epilog
46 ; GFX10-W32-LABEL: test2:
47 ; GFX10-W32: ; %bb.0: ; %main_body
48 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
49 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
50 ; GFX10-W32-NEXT: s_mov_b32 m0, s3
51 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
52 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
53 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
54 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
55 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
56 ; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
57 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-W32-NEXT: ; return to shader part epilog
60 %inst23 = extractelement <2 x float> %pos, i32 0
61 %inst24 = extractelement <2 x float> %pos, i32 1
62 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
63 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
64 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
65 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
66 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
70 ; ... but disabled for stores (and, in this simple case, not re-enabled) ...
71 define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
72 ; GFX9-W64-LABEL: test3:
73 ; GFX9-W64: ; %bb.0: ; %main_body
74 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
75 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
76 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
77 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
78 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
79 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
80 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
81 ; GFX9-W64-NEXT: ; return to shader part epilog
83 ; GFX10-W32-LABEL: test3:
84 ; GFX10-W32: ; %bb.0: ; %main_body
85 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
86 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
87 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
88 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
89 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
90 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
91 ; GFX10-W32-NEXT: ; return to shader part epilog
93 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
94 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
95 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
97 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
102 define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
103 ; GFX9-W64-LABEL: test3_ptr_buf:
104 ; GFX9-W64: ; %bb.0: ; %main_body
105 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
106 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
107 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
108 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
109 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
110 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
111 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
112 ; GFX9-W64-NEXT: ; return to shader part epilog
114 ; GFX10-W32-LABEL: test3_ptr_buf:
115 ; GFX10-W32: ; %bb.0: ; %main_body
116 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
117 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
118 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
119 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
120 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
122 ; GFX10-W32-NEXT: ; return to shader part epilog
124 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
125 %tex.1 = bitcast <4 x float> %tex to <4 x i32>
126 %tex.2 = extractelement <4 x i32> %tex.1, i32 0
128 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0)
133 ; ... and disabled for export.
134 define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
135 ; GFX9-W64-LABEL: test3x:
136 ; GFX9-W64: ; %bb.0: ; %main_body
137 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
138 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
139 ; GFX9-W64-NEXT: s_mov_b32 m0, s3
140 ; GFX9-W64-NEXT: s_nop 0
141 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
142 ; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
143 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
144 ; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
145 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
146 ; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
147 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
148 ; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm
149 ; GFX9-W64-NEXT: s_endpgm
151 ; GFX10-W32-LABEL: test3x:
152 ; GFX10-W32: ; %bb.0: ; %main_body
153 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
154 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
155 ; GFX10-W32-NEXT: s_mov_b32 m0, s3
156 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
157 ; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
158 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
159 ; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
160 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
161 ; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
162 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
163 ; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm
164 ; GFX10-W32-NEXT: s_endpgm
166 %inst23 = extractelement <2 x float> %pos, i32 0
167 %inst24 = extractelement <2 x float> %pos, i32 1
168 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
169 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
170 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
171 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
172 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
173 %tex.0 = extractelement <4 x float> %tex, i32 0
174 %tex.1 = extractelement <4 x float> %tex, i32 1
175 %tex.2 = extractelement <4 x float> %tex, i32 2
176 %tex.3 = extractelement <4 x float> %tex, i32 3
177 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
181 ; Check that WQM is re-enabled when required.
182 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
183 ; GFX9-W64-LABEL: test4:
184 ; GFX9-W64: ; %bb.0: ; %main_body
185 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
186 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
187 ; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
188 ; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
189 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
190 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
191 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
192 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
193 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
194 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
195 ; GFX9-W64-NEXT: ; return to shader part epilog
197 ; GFX10-W32-LABEL: test4:
198 ; GFX10-W32: ; %bb.0: ; %main_body
199 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
200 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
201 ; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
202 ; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
203 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
204 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
205 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
206 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
207 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
208 ; GFX10-W32-NEXT: ; return to shader part epilog
210 %c.1 = mul i32 %c, %d
212 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
213 %c.1.bc = bitcast i32 %c.1 to float
214 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
215 %tex0 = extractelement <4 x float> %tex, i32 0
216 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
217 ret <4 x float> %dtex
220 define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
221 ; GFX9-W64-LABEL: test4_ptr_buf:
222 ; GFX9-W64: ; %bb.0: ; %main_body
223 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
224 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
225 ; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
226 ; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
227 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
228 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
229 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
230 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
231 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
232 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
233 ; GFX9-W64-NEXT: ; return to shader part epilog
235 ; GFX10-W32-LABEL: test4_ptr_buf:
236 ; GFX10-W32: ; %bb.0: ; %main_body
237 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
238 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
239 ; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
240 ; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
241 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
242 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
243 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
244 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
245 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
246 ; GFX10-W32-NEXT: ; return to shader part epilog
248 %c.1 = mul i32 %c, %d
250 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0)
251 %c.1.bc = bitcast i32 %c.1 to float
252 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
253 %tex0 = extractelement <4 x float> %tex, i32 0
254 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
255 ret <4 x float> %dtex
258 ; Check that WQM is triggered by the wqm intrinsic.
259 ; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
260 ; does not happen - the v_add should write the return reg directly.
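; In the checks below the two buffer loads and the v_add all sit between s_wqm
; and the final s_and of exec, and the "kill" annotation on v0 shows the add
; writing the return register directly, with no trailing v_mov.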
261 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
262 ; GFX9-W64-LABEL: test5:
263 ; GFX9-W64: ; %bb.0: ; %main_body
264 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
265 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
266 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
267 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
268 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
269 ; GFX9-W64-NEXT: s_nop 0
270 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
271 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
272 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
273 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
274 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
275 ; GFX9-W64-NEXT: ; return to shader part epilog
277 ; GFX10-W32-LABEL: test5:
278 ; GFX10-W32: ; %bb.0: ; %main_body
279 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
280 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
281 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
282 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
283 ; GFX10-W32-NEXT: s_clause 0x1
284 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
285 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
286 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
287 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
288 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
289 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
290 ; GFX10-W32-NEXT: ; return to shader part epilog
292 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
293 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
294 %out = fadd float %src0, %src1
295 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
299 define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
300 ; GFX9-W64-LABEL: test5_ptr_buf:
301 ; GFX9-W64: ; %bb.0: ; %main_body
302 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
303 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
304 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
305 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
306 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
307 ; GFX9-W64-NEXT: s_nop 0
308 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
309 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
310 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
311 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
312 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
313 ; GFX9-W64-NEXT: ; return to shader part epilog
315 ; GFX10-W32-LABEL: test5_ptr_buf:
316 ; GFX10-W32: ; %bb.0: ; %main_body
317 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
318 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
319 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
320 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
321 ; GFX10-W32-NEXT: s_clause 0x1
322 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
323 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
324 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
325 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
326 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
327 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
328 ; GFX10-W32-NEXT: ; return to shader part epilog
330 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
331 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
332 %out = fadd float %src0, %src1
333 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
337 ; Check that the wqm intrinsic works correctly for integers.
338 define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
339 ; GFX9-W64-LABEL: test6:
340 ; GFX9-W64: ; %bb.0: ; %main_body
341 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
342 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
343 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
344 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
345 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
346 ; GFX9-W64-NEXT: s_nop 0
347 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
348 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
349 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
350 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
351 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
352 ; GFX9-W64-NEXT: ; return to shader part epilog
354 ; GFX10-W32-LABEL: test6:
355 ; GFX10-W32: ; %bb.0: ; %main_body
356 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
357 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
358 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
359 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
360 ; GFX10-W32-NEXT: s_clause 0x1
361 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
362 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
363 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
364 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
365 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
366 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
367 ; GFX10-W32-NEXT: ; return to shader part epilog
369 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
370 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
371 %out = fadd float %src0, %src1
372 %out.0 = bitcast float %out to i32
373 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
374 %out.2 = bitcast i32 %out.1 to float
378 define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
379 ; GFX9-W64-LABEL: test6_ptr_buf:
380 ; GFX9-W64: ; %bb.0: ; %main_body
381 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
382 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
383 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
384 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
385 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
386 ; GFX9-W64-NEXT: s_nop 0
387 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
388 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
389 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
390 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
391 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
392 ; GFX9-W64-NEXT: ; return to shader part epilog
394 ; GFX10-W32-LABEL: test6_ptr_buf:
395 ; GFX10-W32: ; %bb.0: ; %main_body
396 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
397 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
398 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
399 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
400 ; GFX10-W32-NEXT: s_clause 0x1
401 ; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
402 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
403 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
404 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
405 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
406 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
407 ; GFX10-W32-NEXT: ; return to shader part epilog
409 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
410 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
411 %out = fadd float %src0, %src1
412 %out.0 = bitcast float %out to i32
413 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
414 %out.2 = bitcast i32 %out.1 to float
418 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
420 ; Check that WWM is triggered by the wwm intrinsic.
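; The WWM idiom in the checks: s_or_saveexec with -1 saves the live mask and
; enables every lane, the loads and the add execute with the full wave active,
; exec is restored from the saved mask, and the result is then copied with
; v_mov into the register that is live in exact mode.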
421 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
422 ; GFX9-W64-LABEL: test_wwm1:
423 ; GFX9-W64: ; %bb.0: ; %main_body
424 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
425 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
426 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
427 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
428 ; GFX9-W64-NEXT: s_nop 0
429 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
430 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
431 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
432 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
433 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
434 ; GFX9-W64-NEXT: ; return to shader part epilog
436 ; GFX10-W32-LABEL: test_wwm1:
437 ; GFX10-W32: ; %bb.0: ; %main_body
438 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
439 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
440 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
441 ; GFX10-W32-NEXT: s_clause 0x1
442 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
443 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
444 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
445 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
446 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
447 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
448 ; GFX10-W32-NEXT: ; return to shader part epilog
450 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
451 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
452 %out = fadd float %src0, %src1
453 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
457 ; Same as above, but with an integer type.
458 define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
459 ; GFX9-W64-LABEL: test_wwm2:
460 ; GFX9-W64: ; %bb.0: ; %main_body
461 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
462 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
463 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
464 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
465 ; GFX9-W64-NEXT: s_nop 0
466 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
467 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
469 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
470 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
471 ; GFX9-W64-NEXT: ; return to shader part epilog
473 ; GFX10-W32-LABEL: test_wwm2:
474 ; GFX10-W32: ; %bb.0: ; %main_body
475 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
476 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
477 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
478 ; GFX10-W32-NEXT: s_clause 0x1
479 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
480 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
481 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
482 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
483 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
484 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
485 ; GFX10-W32-NEXT: ; return to shader part epilog
487 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
488 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
489 %src0.0 = bitcast float %src0 to i32
490 %src1.0 = bitcast float %src1 to i32
491 %out = add i32 %src0.0, %src1.0
492 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
493 %out.1 = bitcast i32 %out.0 to float
497 ; Check that we don't leave WWM on for computations that don't require WWM,
498 ; since that will lead to clobbering things that aren't supposed to be clobbered
499 ; in cases like this.
500 ; We enforce this by checking that v_add gets emitted in the same block as the WWM computation.
502 define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
503 ; GFX9-W64-LABEL: test_wwm3:
504 ; GFX9-W64: ; %bb.0: ; %main_body
505 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
506 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
507 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
508 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
509 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
510 ; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2
511 ; GFX9-W64-NEXT: ; %bb.1: ; %if
512 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
513 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
514 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
515 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
516 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
517 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
518 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
519 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
520 ; GFX9-W64-NEXT: .LBB13_2: ; %endif
521 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
522 ; GFX9-W64-NEXT: ; return to shader part epilog
524 ; GFX10-W32-LABEL: test_wwm3:
525 ; GFX10-W32: ; %bb.0: ; %main_body
526 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
527 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
528 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
529 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
530 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
531 ; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2
532 ; GFX10-W32-NEXT: ; %bb.1: ; %if
533 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
534 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
535 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
536 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
537 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
538 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
539 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
540 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
541 ; GFX10-W32-NEXT: .LBB13_2: ; %endif
542 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
543 ; GFX10-W32-NEXT: ; return to shader part epilog
545 ; use mbcnt to make sure the branch is divergent
546 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
547 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
548 %cc = icmp uge i32 %hi, 16
549 br i1 %cc, label %endif, label %if
552 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
553 %out = fadd float %src, %src
554 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
555 %out.1 = fadd float %src, %out.0
559 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
563 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
564 ; write could clobber disabled channels in the non-WWM one.
565 ; We enforce this by checking that v_mov gets emitted in the same block as the WWM computation.
567 define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
568 ; GFX9-W64-LABEL: test_wwm4:
569 ; GFX9-W64: ; %bb.0: ; %main_body
570 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
571 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
572 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
573 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
574 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
575 ; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2
576 ; GFX9-W64-NEXT: ; %bb.1: ; %if
577 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
578 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
579 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
580 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
581 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
582 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
583 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
584 ; GFX9-W64-NEXT: .LBB14_2: ; %endif
585 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
586 ; GFX9-W64-NEXT: ; return to shader part epilog
588 ; GFX10-W32-LABEL: test_wwm4:
589 ; GFX10-W32: ; %bb.0: ; %main_body
590 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
591 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
592 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
593 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
594 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
595 ; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2
596 ; GFX10-W32-NEXT: ; %bb.1: ; %if
597 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
598 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
599 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
600 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
601 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
602 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
603 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
604 ; GFX10-W32-NEXT: .LBB14_2: ; %endif
605 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
606 ; GFX10-W32-NEXT: ; return to shader part epilog
608 ; use mbcnt to make sure the branch is divergent
609 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
610 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
611 %cc = icmp uge i32 %hi, 16
612 br i1 %cc, label %endif, label %if
615 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
616 %out = fadd float %src, %src
617 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
621 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
625 ; Make sure the transition from Exact to WWM then WQM works properly.
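; The checks below follow the sequence: the first load and the store run with
; the original (exact) mask, the second load and its add run under
; s_or_saveexec (WWM), exec is restored, and s_wqm re-enables the quads for the
; final add whose result is returned.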
626 define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
627 ; GFX9-W64-LABEL: test_wwm5:
628 ; GFX9-W64: ; %bb.0: ; %main_body
629 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
630 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
631 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
632 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
633 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
634 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
635 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
636 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
637 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
638 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
639 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
640 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
641 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
642 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
643 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
644 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
645 ; GFX9-W64-NEXT: ; return to shader part epilog
647 ; GFX10-W32-LABEL: test_wwm5:
648 ; GFX10-W32: ; %bb.0: ; %main_body
649 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
650 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
651 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
652 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
653 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
654 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
655 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
656 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
657 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
658 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
659 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
661 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
662 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
663 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
664 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
665 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
666 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
667 ; GFX10-W32-NEXT: ; return to shader part epilog
669 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
670 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
671 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
672 %temp = fadd float %src1, %src1
673 %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
674 %out = fadd float %temp.0, %temp.0
675 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
679 ; Check that WWM is turned on correctly across basic block boundaries.
680 ; if..then..endif version
685 define amdgpu_ps float @test_wwm6_then() {
686 ; GFX9-W64-LABEL: test_wwm6_then:
687 ; GFX9-W64: ; %bb.0: ; %main_body
688 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
689 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
690 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
691 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
692 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
693 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
694 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
695 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
696 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
697 ; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2
698 ; GFX9-W64-NEXT: ; %bb.1: ; %if
699 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
700 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
701 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
702 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
703 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
704 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
705 ; GFX9-W64-NEXT: .LBB16_2: ; %endif
706 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
707 ; GFX9-W64-NEXT: ; return to shader part epilog
709 ; GFX10-W32-LABEL: test_wwm6_then:
710 ; GFX10-W32: ; %bb.0: ; %main_body
711 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
712 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
713 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
714 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
715 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
716 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
717 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
718 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
719 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
720 ; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2
721 ; GFX10-W32-NEXT: ; %bb.1: ; %if
722 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
723 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
724 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
725 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
726 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
727 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
728 ; GFX10-W32-NEXT: .LBB16_2: ; %endif
729 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
730 ; GFX10-W32-NEXT: ; return to shader part epilog
732 %src0 = load volatile float, ptr addrspace(1) undef
733 ; use mbcnt to make sure the branch is divergent
734 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
735 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
736 %cc = icmp uge i32 %hi, 16
737 br i1 %cc, label %endif, label %if
740 %src1 = load volatile float, ptr addrspace(1) undef
741 %out = fadd float %src0, %src1
742 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
746 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
750 ; Check that WWM is turned on correctly across basic block boundaries.
756 define amdgpu_ps float @test_wwm6_loop() {
757 ; GFX9-W64-LABEL: test_wwm6_loop:
758 ; GFX9-W64: ; %bb.0: ; %main_body
759 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
760 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
761 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
762 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
763 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
764 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
765 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
766 ; GFX9-W64-NEXT: .LBB17_1: ; %loop
767 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
768 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
769 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
770 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
771 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
772 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
773 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
774 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
775 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
776 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
777 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
778 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
779 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
780 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1
781 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
782 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
783 ; GFX9-W64-NEXT: ; return to shader part epilog
785 ; GFX10-W32-LABEL: test_wwm6_loop:
786 ; GFX10-W32: ; %bb.0: ; %main_body
787 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
788 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
789 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
790 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
791 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
792 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
793 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
794 ; GFX10-W32-NEXT: .LBB17_1: ; %loop
795 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
796 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
797 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
798 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
799 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
800 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
801 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
802 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
803 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
804 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
805 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
806 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
807 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
808 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1
809 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
810 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
811 ; GFX10-W32-NEXT: ; return to shader part epilog
813 %src0 = load volatile float, ptr addrspace(1) undef
814 ; use mbcnt to make sure the branch is divergent
815 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
816 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
820 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
821 %src1 = load volatile float, ptr addrspace(1) undef
822 %out = fadd float %src0, %src1
823 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
824 %counter.1 = sub i32 %counter, 1
825 %cc = icmp ne i32 %counter.1, 0
826 br i1 %cc, label %loop, label %endloop
832 ; Check that @llvm.amdgcn.set.inactive disables WWM.
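; In the checks, set.inactive becomes a v_cndmask that selects the loaded
; value for lanes that were live at the call and the constant 0 for inactive
; lanes, keyed off the exec mask saved by s_or_saveexec; the add then runs in
; WWM and the store is issued back in exact mode.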
833 define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
834 ; GFX9-W64-LABEL: test_wwm_set_inactive1:
835 ; GFX9-W64: ; %bb.0: ; %main_body
836 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
837 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
838 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
839 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
840 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
841 ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
842 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
843 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
844 ; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
845 ; GFX9-W64-NEXT: s_endpgm
847 ; GFX10-W32-LABEL: test_wwm_set_inactive1:
848 ; GFX10-W32: ; %bb.0: ; %main_body
849 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
850 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
851 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
852 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
853 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
854 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
855 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
856 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
857 ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
858 ; GFX10-W32-NEXT: s_endpgm
860 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
861 %src.0 = bitcast float %src to i32
862 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
863 %out = add i32 %src.1, %src.1
864 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
865 %out.1 = bitcast i32 %out.0 to float
866 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
870 ; Check that Strict WQM is triggered by the strict_wqm intrinsic.
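; Unlike the plain wqm intrinsic in test5, the strict form saves exec, applies
; s_wqm only around the marked computation, and restores the saved mask with
; s_mov rather than narrowing it with s_and, so the surrounding code keeps its
; original exec mask.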
871 define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
872 ; GFX9-W64-LABEL: test_strict_wqm1:
873 ; GFX9-W64: ; %bb.0: ; %main_body
874 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
875 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
876 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
877 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
878 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
879 ; GFX9-W64-NEXT: s_nop 0
880 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
881 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
882 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
883 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
884 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
885 ; GFX9-W64-NEXT: ; return to shader part epilog
887 ; GFX10-W32-LABEL: test_strict_wqm1:
888 ; GFX10-W32: ; %bb.0: ; %main_body
889 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
890 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
891 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
892 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
893 ; GFX10-W32-NEXT: s_clause 0x1
894 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
895 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
896 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
897 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
898 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
899 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
900 ; GFX10-W32-NEXT: ; return to shader part epilog
902 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
903 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
904 %out = fadd float %src0, %src1
905 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
909 ; Same as above, but with an integer type.
910 define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
911 ; GFX9-W64-LABEL: test_strict_wqm2:
912 ; GFX9-W64: ; %bb.0: ; %main_body
913 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
914 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
915 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
916 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
917 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
918 ; GFX9-W64-NEXT: s_nop 0
919 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
920 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
921 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
922 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
923 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
924 ; GFX9-W64-NEXT: ; return to shader part epilog
926 ; GFX10-W32-LABEL: test_strict_wqm2:
927 ; GFX10-W32: ; %bb.0: ; %main_body
928 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
929 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
930 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
931 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
932 ; GFX10-W32-NEXT: s_clause 0x1
933 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
934 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
935 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
936 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
937 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
938 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
939 ; GFX10-W32-NEXT: ; return to shader part epilog
941 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
942 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
943 %src0.0 = bitcast float %src0 to i32
944 %src1.0 = bitcast float %src1 to i32
945 %out = add i32 %src0.0, %src1.0
946 %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
947 %out.1 = bitcast i32 %out.0 to float
951 ; Check that we don't leave Strict WQM on for computations that don't require it,
952 ; since that will lead to clobbering things that aren't supposed to be clobbered
953 ; in cases like this.
954 ; We enforce this by checking that v_add gets emitted in the same block as the Strict WQM computation.
956 define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
957 ; GFX9-W64-LABEL: test_strict_wqm3:
958 ; GFX9-W64: ; %bb.0: ; %main_body
959 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
960 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
961 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
962 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
963 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
964 ; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2
965 ; GFX9-W64-NEXT: ; %bb.1: ; %if
966 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
967 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
968 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
969 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
970 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
971 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
972 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
973 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
974 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
975 ; GFX9-W64-NEXT: .LBB21_2: ; %endif
976 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
977 ; GFX9-W64-NEXT: ; return to shader part epilog
979 ; GFX10-W32-LABEL: test_strict_wqm3:
980 ; GFX10-W32: ; %bb.0: ; %main_body
981 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
982 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
983 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
984 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
985 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
986 ; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2
987 ; GFX10-W32-NEXT: ; %bb.1: ; %if
988 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
989 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
990 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
991 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
992 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
993 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
994 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
995 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
996 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
997 ; GFX10-W32-NEXT: .LBB21_2: ; %endif
998 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
999 ; GFX10-W32-NEXT: ; return to shader part epilog
1001 ; use mbcnt to make sure the branch is divergent
1002 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1003 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1004 %cc = icmp uge i32 %hi, 16
1005 br i1 %cc, label %endif, label %if
1008 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1009 %out = fadd float %src, %src
1010 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1011 %out.1 = fadd float %src, %out.0
1015 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1019 ; Check that Strict WQM writes aren't coalesced with non-strict writes, since
1020 ; the Strict WQM write could clobber disabled channels in the non-strict one.
1021 ; We enforce this by checking that v_mov gets emitted in the same block as the Strict WQM computation.
1023 define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
1024 ; GFX9-W64-LABEL: test_strict_wqm4:
1025 ; GFX9-W64: ; %bb.0: ; %main_body
1026 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1027 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1028 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
1029 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
1030 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
1031 ; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2
1032 ; GFX9-W64-NEXT: ; %bb.1: ; %if
1033 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
1034 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1035 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
1036 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1037 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1038 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
1039 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
1040 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1041 ; GFX9-W64-NEXT: .LBB22_2: ; %endif
1042 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
1043 ; GFX9-W64-NEXT: ; return to shader part epilog
1045 ; GFX10-W32-LABEL: test_strict_wqm4:
1046 ; GFX10-W32: ; %bb.0: ; %main_body
1047 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1048 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1049 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
1050 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
1051 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
1052 ; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2
1053 ; GFX10-W32-NEXT: ; %bb.1: ; %if
1054 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1055 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1056 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
1057 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1058 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1059 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
1060 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
1061 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1062 ; GFX10-W32-NEXT: .LBB22_2: ; %endif
1063 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
1064 ; GFX10-W32-NEXT: ; return to shader part epilog
1066 ; use mbcnt to make sure the branch is divergent
1067 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1068 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1069 %cc = icmp uge i32 %hi, 16
1070 br i1 %cc, label %endif, label %if
1073 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1074 %out = fadd float %src, %src
1075 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1079 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1083 ; Make sure the transition from Exact to Strict WQM then WQM works properly.
1084 define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
1085 ; GFX9-W64-LABEL: test_strict_wqm5:
1086 ; GFX9-W64: ; %bb.0: ; %main_body
1087 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1088 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
1089 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1090 ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
1091 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1092 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1093 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1094 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
1095 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1096 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1097 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
1098 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
1099 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1100 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1101 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
1102 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
1103 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
1104 ; GFX9-W64-NEXT: ; return to shader part epilog
1106 ; GFX10-W32-LABEL: test_strict_wqm5:
1107 ; GFX10-W32: ; %bb.0: ; %main_body
1108 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
1109 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1110 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1111 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1112 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1113 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
1114 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1115 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1116 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1117 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1118 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1119 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1120 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1121 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
1122 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1123 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1124 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1125 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
1126 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
1127 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
1128 ; GFX10-W32-NEXT: ; return to shader part epilog
1130 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1131 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1132 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1133 %temp = fadd float %src1, %src1
1134 %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
1135 %out = fadd float %temp.0, %temp.0
1136 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
1140 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1141 ; if..then..endif version
1146 define amdgpu_ps float @test_strict_wqm6_then() {
1147 ; GFX9-W64-LABEL: test_strict_wqm6_then:
1148 ; GFX9-W64: ; %bb.0: ; %main_body
1149 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1150 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1151 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
1152 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1153 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
1154 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1155 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1156 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
1157 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
1158 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
1159 ; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2
1160 ; GFX9-W64-NEXT: ; %bb.1: ; %if
1161 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1162 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1163 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
1164 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1165 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
1166 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1167 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
1168 ; GFX9-W64-NEXT: .LBB24_2: ; %endif
1169 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1170 ; GFX9-W64-NEXT: ; return to shader part epilog
1172 ; GFX10-W32-LABEL: test_strict_wqm6_then:
1173 ; GFX10-W32: ; %bb.0: ; %main_body
1174 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1175 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1176 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
1177 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1178 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1179 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1180 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
1181 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
1182 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
1183 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
1184 ; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
1185 ; GFX10-W32-NEXT: ; %bb.1: ; %if
1186 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1187 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1188 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
1189 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1190 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
1191 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1192 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
1193 ; GFX10-W32-NEXT: .LBB24_2: ; %endif
1194 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1195 ; GFX10-W32-NEXT: ; return to shader part epilog
1197 %src0 = load volatile float, ptr addrspace(1) undef
1198 ; use mbcnt to make sure the branch is divergent
1199 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1200 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1201 %cc = icmp uge i32 %hi, 16
1202 br i1 %cc, label %endif, label %if
1205 %src1 = load volatile float, ptr addrspace(1) undef
1206 %out = fadd float %src0, %src1
1207 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1211 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1215 ; Check that Strict WQM is turned on correctly across basic block boundaries.
1221 define amdgpu_ps float @test_strict_wqm6_loop() {
1222 ; GFX9-W64-LABEL: test_strict_wqm6_loop:
1223 ; GFX9-W64: ; %bb.0: ; %main_body
1224 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1225 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1226 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
1227 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1228 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
1229 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1230 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
1231 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
1232 ; GFX9-W64-NEXT: .LBB25_1: ; %loop
1233 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
1234 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1235 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1236 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
1237 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1238 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1239 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
1240 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
1241 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1242 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1243 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
1244 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
1245 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1246 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1247 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
1248 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1
1249 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
1250 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1251 ; GFX9-W64-NEXT: ; return to shader part epilog
1253 ; GFX10-W32-LABEL: test_strict_wqm6_loop:
1254 ; GFX10-W32: ; %bb.0: ; %main_body
1255 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1256 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1257 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
1258 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1259 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
1260 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
1261 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
1262 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
1263 ; GFX10-W32-NEXT: .LBB25_1: ; %loop
1264 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
1265 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1266 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1267 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
1268 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1269 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1270 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
1271 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
1272 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1273 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
1274 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
1275 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
1276 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1277 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
1278 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
1279 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1
1280 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
1281 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1282 ; GFX10-W32-NEXT: ; return to shader part epilog
1283 main_body:
1284 %src0 = load volatile float, ptr addrspace(1) undef
1285 ; use mbcnt to make sure the branch is divergent
1286 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1287 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1288 br label %loop
1290 loop:
1291 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
1292 %src1 = load volatile float, ptr addrspace(1) undef
1293 %out = fadd float %src0, %src1
1294 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1295 %counter.1 = sub i32 %counter, 1
1296 %cc = icmp ne i32 %counter.1, 0
1297 br i1 %cc, label %loop, label %endloop
1299 endloop:
1300 ret float %out.0
1301 }
1303 ; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1304 define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1305 ; GFX9-W64-LABEL: test_set_inactive2:
1306 ; GFX9-W64: ; %bb.0: ; %main_body
1307 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
1308 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1309 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
1310 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0
1311 ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
1312 ; GFX9-W64-NEXT: s_nop 0
1313 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
1314 ; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
1315 ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
1316 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
1317 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1318 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1
1319 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1320 ; GFX9-W64-NEXT: s_endpgm
1322 ; GFX10-W32-LABEL: test_set_inactive2:
1323 ; GFX10-W32: ; %bb.0: ; %main_body
1324 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
1325 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1326 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1
1327 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
1328 ; GFX10-W32-NEXT: s_clause 0x1
1329 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
1330 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
1331 ; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
1332 ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
1333 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
1334 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1335 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
1336 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1337 ; GFX10-W32-NEXT: s_endpgm
1338 main_body:
1339 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1340 %src1.0 = bitcast float %src1 to i32
1341 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
1342 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
1343 %src0.0 = bitcast float %src0 to i32
1344 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
1345 %out = add i32 %src0.1, %src1.1
1346 %out.0 = bitcast i32 %out to float
1347 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
1348 ret void
1349 }
1351 ; Check a case of one branch of an if-else requiring WQM, the other requiring
1352 ; exact.
1353 ; Note: In this particular case, the save-and-restore could be avoided if the
1354 ; analysis understood that the two branches of the if-else are mutually
1355 ; exclusive.
1356 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1357 ; GFX9-W64-LABEL: test_control_flow_0:
1358 ; GFX9-W64: ; %bb.0: ; %main_body
1359 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1360 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1361 ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1362 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1363 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1364 ; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2
1365 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
1366 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
1367 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1368 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1369 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
1370 ; GFX9-W64-NEXT: .LBB27_2: ; %Flow
1371 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
1372 ; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4
1373 ; GFX9-W64-NEXT: ; %bb.3: ; %IF
1374 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1375 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1376 ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1377 ; GFX9-W64-NEXT: .LBB27_4: ; %END
1378 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1379 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1380 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1381 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1382 ; GFX9-W64-NEXT: ; return to shader part epilog
1384 ; GFX10-W32-LABEL: test_control_flow_0:
1385 ; GFX10-W32: ; %bb.0: ; %main_body
1386 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1387 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1388 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1389 ; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
1390 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1391 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
1392 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
1393 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
1394 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1395 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1396 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
1397 ; GFX10-W32-NEXT: .LBB27_2: ; %Flow
1398 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
1399 ; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4
1400 ; GFX10-W32-NEXT: ; %bb.3: ; %IF
1401 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1402 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1403 ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1404 ; GFX10-W32-NEXT: .LBB27_4: ; %END
1405 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1406 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1407 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1408 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1409 ; GFX10-W32-NEXT: ; return to shader part epilog
1410 main_body:
1411 %cmp = icmp eq i32 %z, 0
1412 br i1 %cmp, label %IF, label %ELSE
1414 IF:
1415 %c.bc = bitcast i32 %c to float
1416 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1417 %tex0 = extractelement <4 x float> %tex, i32 0
1418 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1419 %data.if = extractelement <4 x float> %dtex, i32 0
1420 br label %END
1422 ELSE:
1423 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1424 br label %END
1426 END:
1427 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1428 ret float %r
1429 }
1431 ; Reverse branch order compared to the previous test.
1432 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1433 ; GFX9-W64-LABEL: test_control_flow_1:
1434 ; GFX9-W64: ; %bb.0: ; %main_body
1435 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1436 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1437 ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1438 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1439 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1440 ; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2
1441 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
1442 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1443 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1444 ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1445 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1446 ; GFX9-W64-NEXT: .LBB28_2: ; %Flow
1447 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15]
1448 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1449 ; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1]
1450 ; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1]
1451 ; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4
1452 ; GFX9-W64-NEXT: ; %bb.3: ; %ELSE
1453 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1454 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1455 ; GFX9-W64-NEXT: .LBB28_4: ; %END
1456 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1457 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1458 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
1459 ; GFX9-W64-NEXT: ; return to shader part epilog
1461 ; GFX10-W32-LABEL: test_control_flow_1:
1462 ; GFX10-W32: ; %bb.0: ; %main_body
1463 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1464 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1465 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1466 ; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
1467 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1468 ; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2
1469 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
1470 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1471 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1472 ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1473 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1474 ; GFX10-W32-NEXT: .LBB28_2: ; %Flow
1475 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13
1476 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1477 ; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0
1478 ; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1479 ; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4
1480 ; GFX10-W32-NEXT: ; %bb.3: ; %ELSE
1481 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1482 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1483 ; GFX10-W32-NEXT: .LBB28_4: ; %END
1484 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1485 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1486 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
1487 ; GFX10-W32-NEXT: ; return to shader part epilog
1488 main_body:
1489 %cmp = icmp eq i32 %z, 0
1490 br i1 %cmp, label %ELSE, label %IF
1492 IF:
1493 %c.bc = bitcast i32 %c to float
1494 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1495 %tex0 = extractelement <4 x float> %tex, i32 0
1496 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1497 %data.if = extractelement <4 x float> %dtex, i32 0
1498 br label %END
1500 ELSE:
1501 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
1502 br label %END
1504 END:
1505 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1506 ret float %r
1507 }
1509 ; Check that branch conditions are properly marked as needing WQM...
1510 define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1511 ; GFX9-W64-LABEL: test_control_flow_2:
1512 ; GFX9-W64: ; %bb.0: ; %main_body
1513 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1514 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1515 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1516 ; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
1517 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1518 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
1519 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1520 ; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
1521 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1522 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
1523 ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
1524 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1525 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1526 ; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
1527 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
1528 ; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5
1529 ; GFX9-W64-NEXT: ; implicit-def: $vgpr5
1530 ; GFX9-W64-NEXT: ; %bb.2: ; %Flow
1531 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
1532 ; GFX9-W64-NEXT: ; %bb.3: ; %IF
1533 ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5
1534 ; GFX9-W64-NEXT: ; %bb.4: ; %END
1535 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1536 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1537 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1538 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1539 ; GFX9-W64-NEXT: ; return to shader part epilog
1541 ; GFX10-W32-LABEL: test_control_flow_2:
1542 ; GFX10-W32: ; %bb.0: ; %main_body
1543 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1544 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1545 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1546 ; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
1547 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1548 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
1549 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1550 ; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1551 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1552 ; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
1553 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1554 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1555 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
1556 ; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
1557 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
1558 ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
1559 ; GFX10-W32-NEXT: ; implicit-def: $vgpr5
1560 ; GFX10-W32-NEXT: ; %bb.2: ; %Flow
1561 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
1562 ; GFX10-W32-NEXT: ; %bb.3: ; %IF
1563 ; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5
1564 ; GFX10-W32-NEXT: ; %bb.4: ; %END
1565 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1566 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1567 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1568 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1569 ; GFX10-W32-NEXT: ; return to shader part epilog
1570 main_body:
1571 %idx.1 = extractelement <3 x i32> %idx, i32 0
1572 %data.1 = extractelement <2 x float> %data, i32 0
1573 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1575 ; The load that determines the branch (and should therefore be WQM) is
1576 ; surrounded by stores that require disabled WQM.
1577 %idx.2 = extractelement <3 x i32> %idx, i32 1
1578 %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0)
1580 %idx.3 = extractelement <3 x i32> %idx, i32 2
1581 %data.3 = extractelement <2 x float> %data, i32 1
1582 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0)
1584 %cc = fcmp ogt float %z, 0.0
1585 br i1 %cc, label %IF, label %ELSE
1587 IF:
1588 %coord.IF = mul i32 %coord, 3
1589 br label %END
1591 ELSE:
1592 %coord.ELSE = mul i32 %coord, 4
1593 br label %END
1595 END:
1596 %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1597 %coord.END.bc = bitcast i32 %coord.END to float
1598 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1599 ret <4 x float> %tex
1600 }
1602 ; ... but only if they really do need it.
1603 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1604 ; GFX9-W64-LABEL: test_control_flow_3:
1605 ; GFX9-W64: ; %bb.0: ; %main_body
1606 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1607 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1608 ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1609 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1610 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1611 ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1612 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1613 ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
1614 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1615 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0
1616 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
1617 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1618 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3
1619 ; GFX9-W64-NEXT: ; %bb.1: ; %Flow
1620 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
1621 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4
1622 ; GFX9-W64-NEXT: .LBB30_2: ; %END
1623 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1624 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1625 ; GFX9-W64-NEXT: s_branch .LBB30_5
1626 ; GFX9-W64-NEXT: .LBB30_3: ; %ELSE
1627 ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
1628 ; GFX9-W64-NEXT: ; implicit-def: $vgpr1
1629 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
1630 ; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2
1631 ; GFX9-W64-NEXT: .LBB30_4: ; %IF
1632 ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
1633 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
1634 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1635 ; GFX9-W64-NEXT: s_branch .LBB30_5
1636 ; GFX9-W64-NEXT: .LBB30_5:
1638 ; GFX10-W32-LABEL: test_control_flow_3:
1639 ; GFX10-W32: ; %bb.0: ; %main_body
1640 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1641 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1642 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1643 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1644 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1645 ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1646 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1647 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
1648 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1649 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0
1650 ; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
1651 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
1652 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3
1653 ; GFX10-W32-NEXT: ; %bb.1: ; %Flow
1654 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
1655 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4
1656 ; GFX10-W32-NEXT: .LBB30_2: ; %END
1657 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1658 ; GFX10-W32-NEXT: s_branch .LBB30_5
1659 ; GFX10-W32-NEXT: .LBB30_3: ; %ELSE
1660 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
1661 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1
1662 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
1663 ; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2
1664 ; GFX10-W32-NEXT: .LBB30_4: ; %IF
1665 ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
1666 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
1667 ; GFX10-W32-NEXT: s_branch .LBB30_5
1668 ; GFX10-W32-NEXT: .LBB30_5:
1669 main_body:
1670 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1671 %tex0 = extractelement <4 x float> %tex, i32 0
1672 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1673 %dtex.1 = extractelement <4 x float> %dtex, i32 0
1674 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1676 %cc = fcmp ogt float %dtex.1, 0.0
1677 br i1 %cc, label %IF, label %ELSE
1679 IF:
1680 %tex.IF = fmul float %dtex.1, 3.0
1681 br label %END
1683 ELSE:
1684 %tex.ELSE = fmul float %dtex.1, 4.0
1685 br label %END
1687 END:
1688 %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1689 ret float %tex.END
1690 }
1692 ; Another test that failed at some point because of terminator handling.
1693 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1694 ; GFX9-W64-LABEL: test_control_flow_4:
1695 ; GFX9-W64: ; %bb.0: ; %main_body
1696 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1697 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1698 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1699 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
1700 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2
1701 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
1702 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
1703 ; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0
1704 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1
1705 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1706 ; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
1707 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
1708 ; GFX9-W64-NEXT: .LBB31_2: ; %END
1709 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
1710 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1711 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1712 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1713 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1714 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1715 ; GFX9-W64-NEXT: ; return to shader part epilog
1717 ; GFX10-W32-LABEL: test_control_flow_4:
1718 ; GFX10-W32: ; %bb.0: ; %main_body
1719 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1720 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1721 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
1722 ; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
1723 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2
1724 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
1725 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
1726 ; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0
1727 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1
1728 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1729 ; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
1730 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
1731 ; GFX10-W32-NEXT: .LBB31_2: ; %END
1732 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
1733 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1734 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1735 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1736 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1737 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1738 ; GFX10-W32-NEXT: ; return to shader part epilog
1739 main_body:
1740 %cond = icmp eq i32 %y, 0
1741 br i1 %cond, label %IF, label %END
1743 IF:
1744 %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1745 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
1746 br label %END
1748 END:
1749 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1750 %tex0 = extractelement <4 x float> %tex, i32 0
1751 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1752 ret <4 x float> %dtex
1753 }
1755 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
1756 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1757 ; GFX9-W64-LABEL: test_kill_0:
1758 ; GFX9-W64: ; %bb.0: ; %main_body
1759 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1760 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1761 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1762 ; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1763 ; GFX9-W64-NEXT: s_nop 0
1764 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1765 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1766 ; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6
1767 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
1768 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB32_2
1769 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
1770 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
1771 ; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1772 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1773 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1774 ; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1775 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1776 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11
1777 ; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
1778 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12
1779 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13
1780 ; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14
1781 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1782 ; GFX9-W64-NEXT: s_branch .LBB32_3
1783 ; GFX9-W64-NEXT: .LBB32_2:
1784 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
1785 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
1786 ; GFX9-W64-NEXT: s_endpgm
1787 ; GFX9-W64-NEXT: .LBB32_3:
1789 ; GFX10-W32-LABEL: test_kill_0:
1790 ; GFX10-W32: ; %bb.0: ; %main_body
1791 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1792 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1793 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1794 ; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1795 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
1796 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1797 ; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1798 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
1799 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB32_2
1800 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
1801 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
1802 ; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1803 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1804 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1805 ; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1806 ; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
1807 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1808 ; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12
1809 ; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14
1810 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11
1811 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13
1812 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4
1813 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5
1814 ; GFX10-W32-NEXT: s_branch .LBB32_3
1815 ; GFX10-W32-NEXT: .LBB32_2:
1816 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
1817 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
1818 ; GFX10-W32-NEXT: s_endpgm
1819 ; GFX10-W32-NEXT: .LBB32_3:
1820 main_body:
1821 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1822 %idx.0 = extractelement <2 x i32> %idx, i32 0
1823 %data.0 = extractelement <2 x float> %data, i32 0
1824 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0)
1826 %z.cmp = fcmp olt float %z, 0.0
1827 call void @llvm.amdgcn.kill(i1 %z.cmp)
1829 %idx.1 = extractelement <2 x i32> %idx, i32 1
1830 %data.1 = extractelement <2 x float> %data, i32 1
1831 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
1832 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1833 %tex2.0 = extractelement <4 x float> %tex2, i32 0
1834 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1835 %out = fadd <4 x float> %tex, %dtex
1837 ret <4 x float> %out
1838 }
1840 ; ... but only if WQM is necessary.
1841 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1842 ; GFX9-W64-LABEL: test_kill_1:
1843 ; GFX9-W64: ; %bb.0: ; %main_body
1844 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1845 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1846 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
1847 ; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1848 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
1849 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
1850 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1851 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1852 ; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4
1853 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
1854 ; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0
1855 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB33_2
1856 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
1857 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
1858 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1859 ; GFX9-W64-NEXT: s_branch .LBB33_3
1860 ; GFX9-W64-NEXT: .LBB33_2:
1861 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
1862 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
1863 ; GFX9-W64-NEXT: s_endpgm
1864 ; GFX9-W64-NEXT: .LBB33_3:
1866 ; GFX10-W32-LABEL: test_kill_1:
1867 ; GFX10-W32: ; %bb.0: ; %main_body
1868 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1869 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1870 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
1871 ; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1872 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
1873 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
1874 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1875 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1876 ; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1877 ; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0
1878 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
1879 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB33_2
1880 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
1881 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
1882 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1883 ; GFX10-W32-NEXT: s_branch .LBB33_3
1884 ; GFX10-W32-NEXT: .LBB33_2:
1885 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
1886 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
1887 ; GFX10-W32-NEXT: s_endpgm
1888 ; GFX10-W32-NEXT: .LBB33_3:
1889 main_body:
1890 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1891 %tex0 = extractelement <4 x float> %tex, i32 0
1892 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1894 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
1896 %z.cmp = fcmp olt float %z, 0.0
1897 call void @llvm.amdgcn.kill(i1 %z.cmp)
1899 ret <4 x float> %dtex
1900 }
1902 ; Check prolog shaders.
1903 define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1904 ; GFX9-W64-LABEL: test_prolog_1:
1905 ; GFX9-W64: ; %bb.0: ; %main_body
1906 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1907 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1908 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
1909 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1910 ; GFX9-W64-NEXT: ; return to shader part epilog
1912 ; GFX10-W32-LABEL: test_prolog_1:
1913 ; GFX10-W32: ; %bb.0: ; %main_body
1914 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1915 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1916 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
1917 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1918 ; GFX10-W32-NEXT: ; return to shader part epilog
1919 main_body:
1920 %s = fadd float %a, %b
1921 ret float %s
1922 }
1924 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1925 ; GFX9-W64-LABEL: test_loop_vcc:
1926 ; GFX9-W64: ; %bb.0: ; %entry
1927 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
1928 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1929 ; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3
1930 ; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2
1931 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1
1932 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
1933 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1934 ; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1935 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
1936 ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0
1937 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000
1938 ; GFX9-W64-NEXT: s_branch .LBB35_2
1939 ; GFX9-W64-NEXT: .LBB35_1: ; %body
1940 ; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1
1941 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1942 ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8
1943 ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4
1944 ; GFX9-W64-NEXT: .LBB35_2: ; %loop
1945 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
1946 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1947 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
1948 ; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
1949 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
1950 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
1951 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
1952 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB35_1
1953 ; GFX9-W64-NEXT: ; %bb.3:
1954 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1955 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8
1956 ; GFX9-W64-NEXT: .LBB35_4: ; %break
1957 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
1958 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
1959 ; GFX9-W64-NEXT: ; return to shader part epilog
1961 ; GFX10-W32-LABEL: test_loop_vcc:
1962 ; GFX10-W32: ; %bb.0: ; %entry
1963 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
1964 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1965 ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0
1966 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1967 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1968 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
1969 ; GFX10-W32-NEXT: s_branch .LBB35_2
1970 ; GFX10-W32-NEXT: .p2align 6
1971 ; GFX10-W32-NEXT: .LBB35_1: ; %body
1972 ; GFX10-W32-NEXT: ; in Loop: Header=BB35_2 Depth=1
1973 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1974 ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
1975 ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_4
1976 ; GFX10-W32-NEXT: .LBB35_2: ; %loop
1977 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
1978 ; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1979 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1980 ; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3
1981 ; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2
1982 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1
1983 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
1984 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB35_1
1985 ; GFX10-W32-NEXT: ; %bb.3:
1986 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1987 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8
1988 ; GFX10-W32-NEXT: .LBB35_4: ; %break
1989 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
1990 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
1991 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4
1992 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5
1993 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6
1994 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7
1995 ; GFX10-W32-NEXT: ; return to shader part epilog
1996 entry:
1997 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
1998 br label %loop
2000 loop:
2001 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
2002 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
2003 %cc = fcmp ogt float %ctr.iv, 7.0
2004 br i1 %cc, label %break, label %body
2006 body:
2007 %c.iv0 = extractelement <4 x float> %c.iv, i32 0
2008 %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2009 %ctr.next = fadd float %ctr.iv, 2.0
2010 br label %loop
2012 break:
2013 ret <4 x float> %c.iv
2014 }
2016 ; Only intrinsic stores need exact execution -- other stores do not have
2017 ; externally visible effects and may require WQM for correctness.
2018 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
2019 ; GFX9-W64-LABEL: test_alloca:
2020 ; GFX9-W64: ; %bb.0: ; %entry
2021 ; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2022 ; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2023 ; GFX9-W64-NEXT: s_mov_b32 s10, -1
2024 ; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000
2025 ; GFX9-W64-NEXT: s_add_u32 s8, s8, s0
2026 ; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0
2027 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2028 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2029 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2030 ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0
2031 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2032 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0
2033 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2035 ; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1
2036 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
2037 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2038 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2039 ; GFX9-W64-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf
2040 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1
2041 ; GFX9-W64-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
2042 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
2043 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2044 ; GFX9-W64-NEXT: s_endpgm
2046 ; GFX10-W32-LABEL: test_alloca:
2047 ; GFX10-W32: ; %bb.0: ; %entry
2048 ; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2049 ; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2050 ; GFX10-W32-NEXT: s_mov_b32 s10, -1
2051 ; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000
2052 ; GFX10-W32-NEXT: s_add_u32 s8, s8, s0
2053 ; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0
2054 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2055 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2056 ; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 0
2057 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2058 ; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0
2059 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2060 ; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0
2061 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
2062 ; GFX10-W32-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2063 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2064 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2065 ; GFX10-W32-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2066 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1
2067 ; GFX10-W32-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
2068 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2069 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
2070 ; GFX10-W32-NEXT: s_endpgm
2071 entry:
2072 %array = alloca [32 x i32], align 4, addrspace(5)
2074 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2076 store volatile i32 %a, ptr addrspace(5) %array, align 4
2078 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
2080 %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
2081 %c = load i32, ptr addrspace(5) %c.gep, align 4
2082 %c.bc = bitcast i32 %c to float
2083 %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2084 call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
2086 ret void
2087 }
2089 ; Must return to exact at the end of a non-void returning shader,
2090 ; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2091 ; even if the shader has no kills, because a kill could have happened in a
2092 ; previous shader fragment.
2093 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2094 ; GFX9-W64-LABEL: test_nonvoid_return:
2095 ; GFX9-W64: ; %bb.0:
2096 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2097 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2098 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2099 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
2100 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2101 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2102 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX9-W64-NEXT: ; return to shader part epilog
2105 ; GFX10-W32-LABEL: test_nonvoid_return:
2106 ; GFX10-W32: ; %bb.0:
2107 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2108 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2109 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2110 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
2111 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2112 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2113 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2114 ; GFX10-W32-NEXT: ; return to shader part epilog
2115 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2116 %tex0 = extractelement <4 x float> %tex, i32 0
2117 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2118 ret <4 x float> %dtex
2119 }
2121 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2122 ; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2123 ; GFX9-W64: ; %bb.0: ; %entry
2124 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2125 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2126 ; GFX9-W64-NEXT: s_and_b64 exec, exec, exec
2127 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2128 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2129 ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
2130 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB38_2
2131 ; GFX9-W64-NEXT: ; %bb.1: ; %else
2132 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2133 ; GFX9-W64-NEXT: s_branch .LBB38_3
2134 ; GFX9-W64-NEXT: .LBB38_2: ; %if
2135 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2136 ; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
2137 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2138 ; GFX9-W64-NEXT: .LBB38_3:
2140 ; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2141 ; GFX10-W32: ; %bb.0: ; %entry
2142 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2143 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2144 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo
2145 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2146 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2147 ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
2148 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB38_2
2149 ; GFX10-W32-NEXT: ; %bb.1: ; %else
2150 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2151 ; GFX10-W32-NEXT: s_branch .LBB38_3
2152 ; GFX10-W32-NEXT: .LBB38_2: ; %if
2153 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2154 ; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
2155 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
2156 ; GFX10-W32-NEXT: .LBB38_3:
2157 entry:
2158 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2159 %tex0 = extractelement <4 x float> %tex, i32 0
2160 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2161 %cc = icmp sgt i32 %c, 0
2162 br i1 %cc, label %if, label %else
2164 if:
2165 store volatile <4 x float> %dtex, ptr addrspace(1) undef
2166 unreachable
2168 else:
2169 ret <4 x float> %dtex
2170 }
2172 ; Test awareness that s_wqm_b64 clobbers SCC.
2173 define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2174 ; GFX9-W64-LABEL: test_scc:
2175 ; GFX9-W64: ; %bb.0: ; %main_body
2176 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2177 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2178 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
2179 ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
2180 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2
2181 ; GFX9-W64-NEXT: ; %bb.1: ; %else
2182 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2183 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1
2184 ; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
2185 ; GFX9-W64-NEXT: s_cbranch_execz .LBB39_3
2186 ; GFX9-W64-NEXT: s_branch .LBB39_4
2187 ; GFX9-W64-NEXT: .LBB39_2:
2188 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2189 ; GFX9-W64-NEXT: .LBB39_3: ; %if
2190 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2191 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2192 ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2193 ; GFX9-W64-NEXT: .LBB39_4: ; %end
2194 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
2195 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0
2196 ; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
2197 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2198 ; GFX9-W64-NEXT: ; return to shader part epilog
2200 ; GFX10-W32-LABEL: test_scc:
2201 ; GFX10-W32: ; %bb.0: ; %main_body
2202 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
2203 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2204 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
2205 ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
2206 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
2207 ; GFX10-W32-NEXT: ; %bb.1: ; %else
2208 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1
2209 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2210 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2211 ; GFX10-W32-NEXT: s_cbranch_execz .LBB39_3
2212 ; GFX10-W32-NEXT: s_branch .LBB39_4
2213 ; GFX10-W32-NEXT: .LBB39_2:
2214 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2215 ; GFX10-W32-NEXT: .LBB39_3: ; %if
2216 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2217 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2218 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2219 ; GFX10-W32-NEXT: .LBB39_4: ; %end
2220 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
2221 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0
2222 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
2223 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2224 ; GFX10-W32-NEXT: ; return to shader part epilog
2225 main_body:
2226 %cc = icmp sgt i32 %sel, 0
2227 br i1 %cc, label %if, label %else
2229 if:
2230 %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2231 br label %end
2233 else:
2234 %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2235 br label %end
2237 end:
2238 %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
2239 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2240 ret <4 x float> %r
2241 }
2243 ; Check a case of a block being entirely WQM except for a bit of WWM.
2244 ; There was a bug where it forgot to enter and leave WWM.
2245 define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2246 ; GFX9-W64-LABEL: test_wwm_within_wqm:
2247 ; GFX9-W64: ; %bb.0: ; %main_body
2248 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2249 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2250 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2251 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2252 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2253 ; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2
2254 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2255 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2256 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2257 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2258 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2259 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
2260 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2261 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2262 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2263 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2264 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2265 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2266 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
2267 ; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF
2268 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2269 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2270 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2271 ; GFX9-W64-NEXT: ; return to shader part epilog
2273 ; GFX10-W32-LABEL: test_wwm_within_wqm:
2274 ; GFX10-W32: ; %bb.0: ; %main_body
2275 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2276 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2277 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2278 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
2279 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
2280 ; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2
2281 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2282 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2283 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2284 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2285 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2286 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
2287 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2288 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
2289 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2290 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2291 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2292 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2293 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
2294 ; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF
2295 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2296 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2297 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2298 ; GFX10-W32-NEXT: ; return to shader part epilog
2299 main_body:
2300 %cmp = icmp eq i32 %z, 0
2301 br i1 %cmp, label %IF, label %ENDIF
2303 IF:
2304 %c.bc = bitcast i32 %c to float
2305 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2306 %tex0 = extractelement <4 x float> %tex, i32 0
2307 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2308 %dataf = extractelement <4 x float> %dtex, i32 0
2309 %data1 = fptosi float %dataf to i32
2310 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2311 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2312 %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2313 %data4f = sitofp i32 %data4 to float
2314 br label %ENDIF
2316 ENDIF:
2317 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2318 ret float %r
2319 }
2321 ; Check that WWM is triggered by the strict_wwm intrinsic.
2322 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2323 ; GFX9-W64-LABEL: test_strict_wwm1:
2324 ; GFX9-W64: ; %bb.0: ; %main_body
2325 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2326 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2327 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
2328 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2329 ; GFX9-W64-NEXT: s_nop 0
2330 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2331 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2332 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
2333 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2334 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2335 ; GFX9-W64-NEXT: ; return to shader part epilog
2337 ; GFX10-W32-LABEL: test_strict_wwm1:
2338 ; GFX10-W32: ; %bb.0: ; %main_body
2339 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2340 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2341 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
2342 ; GFX10-W32-NEXT: s_clause 0x1
2343 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2344 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2345 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2346 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
2347 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2348 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2349 ; GFX10-W32-NEXT: ; return to shader part epilog
2350 main_body:
2351 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2352 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2353 %out = fadd float %src0, %src1
2354 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2355 ret float %out.0
2356 }
2358 ; Same as above, but with an integer type.
2359 define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2360 ; GFX9-W64-LABEL: test_strict_wwm2:
2361 ; GFX9-W64: ; %bb.0: ; %main_body
2362 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2363 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2364 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
2365 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2366 ; GFX9-W64-NEXT: s_nop 0
2367 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2368 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2369 ; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
2370 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2371 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2372 ; GFX9-W64-NEXT: ; return to shader part epilog
2374 ; GFX10-W32-LABEL: test_strict_wwm2:
2375 ; GFX10-W32: ; %bb.0: ; %main_body
2376 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2377 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2378 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
2379 ; GFX10-W32-NEXT: s_clause 0x1
2380 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2381 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
2382 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2383 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
2384 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2385 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2386 ; GFX10-W32-NEXT: ; return to shader part epilog
2387 main_body:
2388 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2389 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2390 %src0.0 = bitcast float %src0 to i32
2391 %src1.0 = bitcast float %src1 to i32
2392 %out = add i32 %src0.0, %src1.0
2393 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2394 %out.1 = bitcast i32 %out.0 to float
2395 ret float %out.1
2396 }
2398 ; Check that we don't leave WWM on for computations that don't require WWM,
2399 ; since that will lead to clobbering things that aren't supposed to be clobbered
2400 ; in cases like this.
2401 ; We enforce this by checking that v_add gets emitted in the same block as
2402 ; the WWM computation.
2403 define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2404 ; GFX9-W64-LABEL: test_strict_wwm3:
2405 ; GFX9-W64: ; %bb.0: ; %main_body
2406 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2407 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2408 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2409 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2410 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
2411 ; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2
2412 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2413 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2414 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2415 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2416 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2417 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
2418 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2419 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2420 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
2421 ; GFX9-W64-NEXT: .LBB43_2: ; %endif
2422 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
2423 ; GFX9-W64-NEXT: ; return to shader part epilog
2425 ; GFX10-W32-LABEL: test_strict_wwm3:
2426 ; GFX10-W32: ; %bb.0: ; %main_body
2427 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2428 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2429 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2430 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2431 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
2432 ; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2
2433 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2434 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2435 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2436 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2437 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2438 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
2439 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2440 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2441 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
2442 ; GFX10-W32-NEXT: .LBB43_2: ; %endif
2443 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
2444 ; GFX10-W32-NEXT: ; return to shader part epilog
2445 main_body:
2446 ; use mbcnt to make sure the branch is divergent
2447 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2448 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2449 %cc = icmp uge i32 %hi, 16
2450 br i1 %cc, label %endif, label %if
2452 if:
2453 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2454 %out = fadd float %src, %src
2455 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2456 %out.1 = fadd float %src, %out.0
2457 br label %endif
2459 endif:
2460 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2461 ret float %out.2
2462 }
2464 ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2465 ; write could clobber disabled channels in the non-WWM one.
2466 ; We enforce this by checking that v_mov gets emitted in the same block as
2467 ; the WWM write.
2468 define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2469 ; GFX9-W64-LABEL: test_strict_wwm4:
2470 ; GFX9-W64: ; %bb.0: ; %main_body
2471 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2472 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2473 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2474 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2475 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
2476 ; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2
2477 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2478 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2479 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2480 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2481 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2482 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
2483 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2484 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2485 ; GFX9-W64-NEXT: .LBB44_2: ; %endif
2486 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
2487 ; GFX9-W64-NEXT: ; return to shader part epilog
2489 ; GFX10-W32-LABEL: test_strict_wwm4:
2490 ; GFX10-W32: ; %bb.0: ; %main_body
2491 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2492 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2493 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2494 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2495 ; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
2496 ; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2
2497 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2498 ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
2499 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2500 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2501 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2502 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
2503 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
2504 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2505 ; GFX10-W32-NEXT: .LBB44_2: ; %endif
2506 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
2507 ; GFX10-W32-NEXT: ; return to shader part epilog
2508 main_body:
2509 ; use mbcnt to make sure the branch is divergent
2510 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2511 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2512 %cc = icmp uge i32 %hi, 16
2513 br i1 %cc, label %endif, label %if
2515 if:
2516 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2517 %out = fadd float %src, %src
2518 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2519 br label %endif
2521 endif:
2522 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2523 ret float %out.1
2524 }
2526 ; Make sure the transition from Exact to WWM and then to WQM works properly.
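; The checks below expect the buffer load/store to run in Exact mode, the strict.wwm
; add to be wrapped in an s_or_saveexec ..., -1 / exec-restore pair, and the wqm'd add
; to run only after s_wqm re-enables the helper lanes; exec is ANDed back to the live
; mask before returning.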
2527 define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2528 ; GFX9-W64-LABEL: test_strict_wwm5:
2529 ; GFX9-W64: ; %bb.0: ; %main_body
2530 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2531 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
2532 ; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
2533 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2534 ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
2535 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
2536 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
2537 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2538 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2539 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
2540 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
2541 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2542 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2543 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
2544 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2545 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
2546 ; GFX9-W64-NEXT: ; return to shader part epilog
2548 ; GFX10-W32-LABEL: test_strict_wwm5:
2549 ; GFX10-W32: ; %bb.0: ; %main_body
2550 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
2551 ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
2552 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
2553 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2554 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
2555 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2556 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
2558 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2559 ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
2560 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2561 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
2562 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2563 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2564 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2565 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
2566 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2567 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
2568 ; GFX10-W32-NEXT: ; return to shader part epilog
2569 main_body:
2570 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2571 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2572 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2573 %temp = fadd float %src1, %src1
2574 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
2575 %out = fadd float %temp.0, %temp.0
2576 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2577 ret float %out.0
2578 }
2580 ; Check that WWM is turned on correctly across basic block boundaries.
2581 ; if..then..endif version
2582 ;SI-CHECK: buffer_load_dword
2583 ;VI-CHECK: flat_load_dword
2584 ;SI-CHECK: buffer_load_dword
2585 ;VI-CHECK: flat_load_dword
2586 define amdgpu_ps float @test_strict_wwm6_then() {
2587 ; GFX9-W64-LABEL: test_strict_wwm6_then:
2588 ; GFX9-W64: ; %bb.0: ; %main_body
2589 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2590 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
2591 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2592 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2593 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2594 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2595 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
2596 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2597 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
2598 ; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
2599 ; GFX9-W64-NEXT: ; %bb.1: ; %if
2600 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2601 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
2602 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2603 ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
2604 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2605 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2606 ; GFX9-W64-NEXT: .LBB46_2: ; %endif
2607 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
2608 ; GFX9-W64-NEXT: ; return to shader part epilog
2610 ; GFX10-W32-LABEL: test_strict_wwm6_then:
2611 ; GFX10-W32: ; %bb.0: ; %main_body
2612 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2613 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
2614 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2615 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2616 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2617 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
2618 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
2619 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2620 ; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
2621 ; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
2622 ; GFX10-W32-NEXT: ; %bb.1: ; %if
2623 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2624 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
2625 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2626 ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
2627 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2628 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2629 ; GFX10-W32-NEXT: .LBB46_2: ; %endif
2630 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
2631 ; GFX10-W32-NEXT: ; return to shader part epilog
2632 main_body:
2633 %src0 = load volatile float, ptr addrspace(1) undef
2634 ; use mbcnt to make sure the branch is divergent
2635 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2636 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2637 %cc = icmp uge i32 %hi, 16
2638 br i1 %cc, label %endif, label %if
2640 if:
2641 %src1 = load volatile float, ptr addrspace(1) undef
2642 %out = fadd float %src0, %src1
2643 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2644 br label %endif
2646 endif:
2647 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2648 ret float %out.1
2649 }
2651 ; Check that WWM is turned on correctly across basic block boundaries.
2652 ; loop version
2653 define amdgpu_ps float @test_strict_wwm6_loop() {
2654 ; GFX9-W64-LABEL: test_strict_wwm6_loop:
2655 ; GFX9-W64: ; %bb.0: ; %main_body
2656 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2657 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
2658 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2659 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2660 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2661 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
2662 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
2663 ; GFX9-W64-NEXT: .LBB47_1: ; %loop
2664 ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
2665 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2666 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
2667 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2668 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2669 ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
2670 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
2671 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
2672 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
2673 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
2674 ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2675 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2676 ; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
2677 ; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1
2678 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop
2679 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
2680 ; GFX9-W64-NEXT: ; return to shader part epilog
2682 ; GFX10-W32-LABEL: test_strict_wwm6_loop:
2683 ; GFX10-W32: ; %bb.0: ; %main_body
2684 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2685 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
2686 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2687 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2688 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
2689 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
2690 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
2691 ; GFX10-W32-NEXT: .LBB47_1: ; %loop
2692 ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
2693 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2694 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
2695 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2696 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2697 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
2698 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
2699 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
2700 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
2701 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
2702 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2703 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
2704 ; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
2705 ; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1
2706 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop
2707 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
2708 ; GFX10-W32-NEXT: ; return to shader part epilog
2709 main_body:
2710 %src0 = load volatile float, ptr addrspace(1) undef
2711 ; use mbcnt to make sure the branch is divergent
2712 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2713 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2714 br label %loop
2716 loop:
2717 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
2718 %src1 = load volatile float, ptr addrspace(1) undef
2719 %out = fadd float %src0, %src1
2720 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2721 %counter.1 = sub i32 %counter, 1
2722 %cc = icmp ne i32 %counter.1, 0
2723 br i1 %cc, label %loop, label %endloop
2725 endloop:
2726 ret float %out.0
2727 }
2729 ; Check that @llvm.amdgcn.set.inactive disables WWM.
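; llvm.amdgcn.set.inactive copies its first operand into active lanes and its second
; operand (0 here) into lanes that are inactive in the original exec mask, so the
; checks below expect a v_cndmask keyed on the saved exec inside the WWM region.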
2730 define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2731 ; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2732 ; GFX9-W64: ; %bb.0: ; %main_body
2733 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
2734 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
2735 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2736 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2737 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
2738 ; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
2739 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2740 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2741 ; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
2742 ; GFX9-W64-NEXT: s_endpgm
2744 ; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2745 ; GFX10-W32: ; %bb.0: ; %main_body
2746 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
2747 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
2748 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2749 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2750 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
2751 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
2752 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2753 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2754 ; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
2755 ; GFX10-W32-NEXT: s_endpgm
2756 main_body:
2757 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2758 %src.0 = bitcast float %src to i32
2759 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
2760 %out = add i32 %src.1, %src.1
2761 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2762 %out.1 = bitcast i32 %out.0 to float
2763 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2764 ret void
2765 }
2767 ; Check a case of a block being entirely WQM except for a bit of WWM.
2768 ; There was a bug where it forgot to enter and leave WWM.
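; The ds_swizzle operates on a set.inactive value and its result is wrapped in
; strict.wwm, so the expected code enters whole wave mode with s_or_saveexec ..., -1
; just around the cndmask/swizzle pair and restores exec immediately afterwards,
; while the rest of the block stays in WQM.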
2769 define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2770 ; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2771 ; GFX9-W64: ; %bb.0: ; %main_body
2772 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2773 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2774 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2775 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
2776 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2777 ; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2
2778 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2779 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2780 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2781 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2782 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2783 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
2784 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
2785 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
2786 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2787 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2788 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2789 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2790 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
2791 ; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF
2792 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2793 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2794 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
2795 ; GFX9-W64-NEXT: ; return to shader part epilog
2797 ; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2798 ; GFX10-W32: ; %bb.0: ; %main_body
2799 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2800 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2801 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2802 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
2803 ; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
2804 ; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2
2805 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2806 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2807 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2808 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2809 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2810 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
2811 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
2812 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
2813 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2814 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2815 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2816 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2817 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
2818 ; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF
2819 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2820 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2821 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
2822 ; GFX10-W32-NEXT: ; return to shader part epilog
2823 main_body:
2824 %cmp = icmp eq i32 %z, 0
2825 br i1 %cmp, label %IF, label %ENDIF
2827 IF:
2828 %c.bc = bitcast i32 %c to float
2829 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2830 %tex0 = extractelement <4 x float> %tex, i32 0
2831 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2832 %dataf = extractelement <4 x float> %dtex, i32 0
2833 %data1 = fptosi float %dataf to i32
2834 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2835 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2836 %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
2837 %data4f = sitofp i32 %data4 to float
2838 br label %ENDIF
2840 ENDIF:
2841 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2842 ret float %r
2843 }
2845 ; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
2846 define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2847 ; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2848 ; GFX9-W64: ; %bb.0: ; %main_body
2849 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2850 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2851 ; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2852 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2853 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
2854 ; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
2855 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2856 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
2857 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
2858 ; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
2859 ; GFX9-W64-NEXT: ; %bb.1: ; %IF
2860 ; GFX9-W64-NEXT: s_mov_b64 s[16:17], exec
2861 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2862 ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2863 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2864 ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2865 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2866 ; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
2867 ; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2868 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
2869 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2870 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
2871 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
2872 ; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF
2873 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
2874 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2875 ; GFX9-W64-NEXT: ; return to shader part epilog
2877 ; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2878 ; GFX10-W32: ; %bb.0: ; %main_body
2879 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2880 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2881 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2882 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2883 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
2884 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
2885 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
2886 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2887 ; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
2888 ; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
2889 ; GFX10-W32-NEXT: ; %bb.1: ; %IF
2890 ; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo
2891 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2892 ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2893 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2894 ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2895 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2896 ; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
2897 ; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2898 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
2899 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2900 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
2901 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
2902 ; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF
2903 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
2904 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2905 ; GFX10-W32-NEXT: ; return to shader part epilog
2906 main_body:
2907 %cmp = icmp eq i32 %z, 0
2908 br i1 %cmp, label %IF, label %ENDIF
2910 IF:
2911 %c.bc = bitcast i32 %c to float
2912 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2913 %tex0 = extractelement <4 x float> %tex, i32 0
2914 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2915 %dataf = extractelement <4 x float> %dtex, i32 0
2916 %data1 = fptosi float %dataf to i32
2917 %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
2918 %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
2919 %data3f = sitofp i32 %data3 to float
2920 br label %ENDIF
2922 ENDIF:
2923 %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
2924 ret float %r
2925 }
2927 ; WQM -> StrictWQM transition must be preserved because kill breaks the WQM mask
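; After the kill the set of live lanes shrinks, so the WQM mask computed on entry is
; stale; the strict WQM region below is expected to recompute its mask with s_wqm
; from the post-kill exec instead of reusing the earlier one.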
2928 define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data, i32 %wqm_data) {
2929 ; GFX9-W64-LABEL: test_strict_wqm_within_wqm_with_kill:
2930 ; GFX9-W64: ; %bb.0: ; %main_body
2931 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2932 ; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2933 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2934 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
2935 ; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
2936 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2937 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2938 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2939 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2940 ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2941 ; GFX9-W64-NEXT: s_andn2_b64 s[0:1], exec, vcc
2942 ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[0:1]
2943 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB51_2
2944 ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
2945 ; GFX9-W64-NEXT: s_and_b64 exec, exec, vcc
2946 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
2947 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2948 ; GFX9-W64-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2949 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
2950 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2951 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v3
2952 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v1
2953 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2954 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
2955 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2956 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2957 ; GFX9-W64-NEXT: s_branch .LBB51_3
2958 ; GFX9-W64-NEXT: .LBB51_2:
2959 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
2960 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
2961 ; GFX9-W64-NEXT: s_endpgm
2962 ; GFX9-W64-NEXT: .LBB51_3:
2964 ; GFX10-W32-LABEL: test_strict_wqm_within_wqm_with_kill:
2965 ; GFX10-W32: ; %bb.0: ; %main_body
2966 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2967 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2968 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2969 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
2970 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
2971 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2972 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2973 ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2974 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2975 ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2976 ; GFX10-W32-NEXT: s_andn2_b32 s0, exec_lo, vcc_lo
2977 ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, s0
2978 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB51_2
2979 ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
2980 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
2981 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
2982 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2983 ; GFX10-W32-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2984 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
2985 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2986 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v3
2987 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v1
2988 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2989 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
2990 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2991 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2992 ; GFX10-W32-NEXT: s_branch .LBB51_3
2993 ; GFX10-W32-NEXT: .LBB51_2:
2994 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
2995 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
2996 ; GFX10-W32-NEXT: s_endpgm
2997 ; GFX10-W32-NEXT: .LBB51_3:
2998 main_body:
2999 %c.bc = bitcast i32 %c to float
3000 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3001 %tex0 = extractelement <4 x float> %tex, i32 0
3002 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3003 %cmp = icmp eq i32 %z, 0
3004 call void @llvm.amdgcn.kill(i1 %cmp)
3005 %dataf = extractelement <4 x float> %dtex, i32 0
3006 %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %wqm_data, i32 2079)
3007 %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
3008 %data3f = sitofp i32 %data3 to float
3009 %result.f = fadd float %dataf, %data3f
3010 %result.i = bitcast float %result.f to i32
3011 %result.wqm = call i32 @llvm.amdgcn.wqm.i32(i32 %result.i)
3012 %result = bitcast i32 %result.wqm to float
3013 ret float %result
3014 }
3016 ;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
3017 define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
3018 ; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
3019 ; GFX9-W64: ; %bb.0: ; %main_body
3020 ; GFX9-W64-NEXT: s_mov_b64 s[28:29], exec
3021 ; GFX9-W64-NEXT: s_mov_b32 s19, s17
3022 ; GFX9-W64-NEXT: s_mov_b64 s[30:31], exec
3023 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3024 ; GFX9-W64-NEXT: s_mov_b32 s23, s5
3025 ; GFX9-W64-NEXT: s_mov_b32 s22, s4
3026 ; GFX9-W64-NEXT: s_mov_b32 s21, s3
3027 ; GFX9-W64-NEXT: s_mov_b32 s20, s2
3028 ; GFX9-W64-NEXT: s_mov_b32 s27, s9
3029 ; GFX9-W64-NEXT: s_mov_b32 s26, s8
3030 ; GFX9-W64-NEXT: s_mov_b32 s25, s7
3031 ; GFX9-W64-NEXT: s_mov_b32 s24, s6
3032 ; GFX9-W64-NEXT: s_mov_b32 s18, s16
3033 ; GFX9-W64-NEXT: s_mov_b32 s17, s15
3034 ; GFX9-W64-NEXT: s_mov_b32 s16, s14
3035 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3036 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3037 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3038 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3039 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
3040 ; GFX9-W64-NEXT: s_mov_b64 exec, s[30:31]
3041 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3042 ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
3043 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3044 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
3045 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3046 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
3047 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, s0
3048 ; GFX9-W64-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
3049 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3050 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3051 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3052 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3053 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
3054 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3055 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3056 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
3057 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3058 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3059 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
3060 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
3061 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[28:29]
3062 ; GFX9-W64-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
3063 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3064 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3065 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
3066 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3067 ; GFX9-W64-NEXT: ; return to shader part epilog
3069 ; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
3070 ; GFX10-W32: ; %bb.0: ; %main_body
3071 ; GFX10-W32-NEXT: s_mov_b32 s28, exec_lo
3072 ; GFX10-W32-NEXT: s_mov_b32 s19, s17
3073 ; GFX10-W32-NEXT: s_mov_b32 s29, exec_lo
3074 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3075 ; GFX10-W32-NEXT: s_mov_b32 s23, s5
3076 ; GFX10-W32-NEXT: s_mov_b32 s22, s4
3077 ; GFX10-W32-NEXT: s_mov_b32 s21, s3
3078 ; GFX10-W32-NEXT: s_mov_b32 s20, s2
3079 ; GFX10-W32-NEXT: s_mov_b32 s27, s9
3080 ; GFX10-W32-NEXT: s_mov_b32 s26, s8
3081 ; GFX10-W32-NEXT: s_mov_b32 s25, s7
3082 ; GFX10-W32-NEXT: s_mov_b32 s24, s6
3083 ; GFX10-W32-NEXT: s_mov_b32 s18, s16
3084 ; GFX10-W32-NEXT: s_mov_b32 s17, s15
3085 ; GFX10-W32-NEXT: s_mov_b32 s16, s14
3086 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3087 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3088 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3089 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3090 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
3091 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s29
3092 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3093 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
3094 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3095 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
3096 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
3097 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
3098 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, s0
3099 ; GFX10-W32-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
3100 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
3101 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3102 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3103 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3104 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
3105 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3106 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3107 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
3108 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3109 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
3110 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3111 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
3112 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s28
3113 ; GFX10-W32-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
3114 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3115 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
3116 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
3117 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3118 ; GFX10-W32-NEXT: ; return to shader part epilog
3119 main_body:
3120 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3121 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3122 %temp = fadd float %reload, %reload
3123 %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3124 %temp3 = fadd float %temp2, %temp2
3125 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0)
3126 %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
3127 %temp5 = fadd float %temp3, %temp4
3128 %res.int = ptrtoint ptr addrspace(8) %res to i128
3129 %res.vec = bitcast i128 %res.int to <4 x i32>
3130 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3131 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3132 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3133 ret float %out
3134 }
3136 define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3137 ; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3138 ; GFX9-W64: ; %bb.0: ; %main_body
3139 ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
3140 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3141 ; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
3142 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3143 ; GFX9-W64-NEXT: s_mov_b32 s19, s5
3144 ; GFX9-W64-NEXT: s_mov_b32 s18, s4
3145 ; GFX9-W64-NEXT: s_mov_b32 s17, s3
3146 ; GFX9-W64-NEXT: s_mov_b32 s16, s2
3147 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3148 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3149 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3150 ; GFX9-W64-NEXT: s_mov_b32 s11, s9
3151 ; GFX9-W64-NEXT: s_mov_b32 s10, s8
3152 ; GFX9-W64-NEXT: s_mov_b32 s9, s7
3153 ; GFX9-W64-NEXT: s_mov_b32 s8, s6
3154 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3155 ; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
3156 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3157 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
3158 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
3159 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
3160 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
3161 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3162 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3163 ; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
3164 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3165 ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
3166 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3167 ; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
3168 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3169 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3170 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
3171 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3172 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3173 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
3174 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
3175 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
3176 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3177 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3178 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3179 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3180 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3181 ; GFX9-W64-NEXT: ; return to shader part epilog
3183 ; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3184 ; GFX10-W32: ; %bb.0: ; %main_body
3185 ; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
3186 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3187 ; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
3188 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3189 ; GFX10-W32-NEXT: s_mov_b32 s19, s5
3190 ; GFX10-W32-NEXT: s_mov_b32 s18, s4
3191 ; GFX10-W32-NEXT: s_mov_b32 s17, s3
3192 ; GFX10-W32-NEXT: s_mov_b32 s16, s2
3193 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3194 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3195 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3196 ; GFX10-W32-NEXT: s_mov_b32 s11, s9
3197 ; GFX10-W32-NEXT: s_mov_b32 s10, s8
3198 ; GFX10-W32-NEXT: s_mov_b32 s9, s7
3199 ; GFX10-W32-NEXT: s_mov_b32 s8, s6
3200 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
3201 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
3202 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3203 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
3204 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3205 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3206 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3207 ; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
3208 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3209 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3210 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3211 ; GFX10-W32-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
3212 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3213 ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
3214 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3215 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
3216 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3217 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3218 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
3219 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3220 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
3221 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3222 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
3223 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3224 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3225 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3226 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3227 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3228 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3229 ; GFX10-W32-NEXT: ; return to shader part epilog
3230 main_body:
3231 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3232 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3233 %temp = fadd float %reload, %reload
3234 %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3235 %temp3 = fadd float %temp2, %temp2
3236 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3237 %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3238 %temp5 = fadd float %temp3, %temp4
3239 %res.int = ptrtoint ptr addrspace(8) %res to i128
3240 %res.vec = bitcast i128 %res.int to <4 x i32>
3241 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3242 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3243 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3244 ret float %out
3245 }
3247 ;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
3248 define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
3249 ; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3250 ; GFX9-W64: ; %bb.0: ; %main_body
3251 ; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
3252 ; GFX9-W64-NEXT: s_mov_b32 s15, s13
3253 ; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
3254 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3255 ; GFX9-W64-NEXT: s_mov_b32 s19, s5
3256 ; GFX9-W64-NEXT: s_mov_b32 s18, s4
3257 ; GFX9-W64-NEXT: s_mov_b32 s17, s3
3258 ; GFX9-W64-NEXT: s_mov_b32 s16, s2
3259 ; GFX9-W64-NEXT: s_mov_b32 s14, s12
3260 ; GFX9-W64-NEXT: s_mov_b32 s13, s11
3261 ; GFX9-W64-NEXT: s_mov_b32 s12, s10
3262 ; GFX9-W64-NEXT: s_mov_b32 s11, s9
3263 ; GFX9-W64-NEXT: s_mov_b32 s10, s8
3264 ; GFX9-W64-NEXT: s_mov_b32 s9, s7
3265 ; GFX9-W64-NEXT: s_mov_b32 s8, s6
3266 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3267 ; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
3268 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3269 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3270 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
3271 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
3272 ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
3273 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3274 ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3275 ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
3276 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3277 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3278 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3279 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3280 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
3281 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3282 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
3283 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3
3284 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
3285 ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3286 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3287 ; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3288 ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3289 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3290 ; GFX9-W64-NEXT: ; return to shader part epilog
3292 ; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3293 ; GFX10-W32: ; %bb.0: ; %main_body
3294 ; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
3295 ; GFX10-W32-NEXT: s_mov_b32 s15, s13
3296 ; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
3297 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3298 ; GFX10-W32-NEXT: s_mov_b32 s19, s5
3299 ; GFX10-W32-NEXT: s_mov_b32 s18, s4
3300 ; GFX10-W32-NEXT: s_mov_b32 s17, s3
3301 ; GFX10-W32-NEXT: s_mov_b32 s16, s2
3302 ; GFX10-W32-NEXT: s_mov_b32 s14, s12
3303 ; GFX10-W32-NEXT: s_mov_b32 s13, s11
3304 ; GFX10-W32-NEXT: s_mov_b32 s12, s10
3305 ; GFX10-W32-NEXT: s_mov_b32 s11, s9
3306 ; GFX10-W32-NEXT: s_mov_b32 s10, s8
3307 ; GFX10-W32-NEXT: s_mov_b32 s9, s7
3308 ; GFX10-W32-NEXT: s_mov_b32 s8, s6
3309 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
3310 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
3311 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3312 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, s1
3313 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3314 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3315 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3316 ; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
3317 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3318 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3319 ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
3320 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
3321 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3322 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3323 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3324 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
3325 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3326 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3327 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
3328 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3
3329 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
3330 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3331 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3332 ; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
3333 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
3334 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3335 ; GFX10-W32-NEXT: ; return to shader part epilog
3336 main_body:
3337 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3338 %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
3339 %temp = fadd float %reload, %reload
3340 %res.int = ptrtoint ptr addrspace(8) %res to i128
3341 %res.vec = bitcast i128 %res.int to <4 x i32>
3342 %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3343 %temp2 = fadd float %tex, %tex
3344 %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3345 %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
3346 %temp4 = fadd float %temp2, %temp3
3347 %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
3348 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3349 %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
3350 ret float %out
3351 }
3353 ; Check that the correct VCC register is selected. The WQM pass incorrectly used VCC for
3354 ; vector comparisons in Wave32 mode.
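; In wave32 the condition mask is 32 bits wide, so the compare must write vcc_lo (or a
; 32-bit SGPR) rather than the 64-bit vcc pair; the GFX10-W32 checks verify that
; v_cmp_le_f32 targets vcc_lo.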
3355 define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
3356 ; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
3357 ; GFX9-W64: ; %bb.0: ; %main_body
3358 ; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac
3359 ; GFX9-W64-NEXT: s_mov_b32 s2, 32
3360 ; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000
3361 ; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
3362 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3363 ; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
3364 ; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
3365 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB55_1
3366 ; GFX9-W64-NEXT: s_endpgm
3367 ; GFX9-W64-NEXT: .LBB55_1:
3368 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3369 ; GFX9-W64-NEXT: exp null off, off, off, off done vm
3370 ; GFX9-W64-NEXT: s_endpgm
3372 ; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
3373 ; GFX10-W32: ; %bb.0: ; %main_body
3374 ; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac
3375 ; GFX10-W32-NEXT: s_mov_b32 s2, 32
3376 ; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000
3377 ; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
3378 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3379 ; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
3380 ; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
3381 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB55_1
3382 ; GFX10-W32-NEXT: s_endpgm
3383 ; GFX10-W32-NEXT: .LBB55_1:
3384 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3385 ; GFX10-W32-NEXT: exp null off, off, off, off done vm
3386 ; GFX10-W32-NEXT: s_endpgm
3387 main_body:
3388 %1 = ptrtoint ptr addrspace(6) %0 to i32
3389 %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
3390 %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
3391 %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
3392 call void @llvm.amdgcn.kill(i1 %4) #1
3393 ret void
3394 }
3396 ; Test the interaction between wqm and llvm.amdgcn.init.exec.
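; llvm.amdgcn.init.exec overwrites exec with an immediate at the start of the shader,
; so the s_wqm for the wqm'd value has to be emitted after that initial
; s_mov exec, -1 rather than being hoisted above it.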
3397 define amdgpu_gs void @wqm_init_exec() {
3398 ; GFX9-W64-LABEL: wqm_init_exec:
3399 ; GFX9-W64: ; %bb.0: ; %bb
3400 ; GFX9-W64-NEXT: s_mov_b64 exec, -1
3401 ; GFX9-W64-NEXT: s_mov_b32 s0, 0
3402 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
3403 ; GFX9-W64-NEXT: s_mov_b32 s1, s0
3404 ; GFX9-W64-NEXT: s_mov_b32 s2, s0
3405 ; GFX9-W64-NEXT: s_mov_b32 s3, s0
3406 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v0
3407 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
3408 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v0
3409 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3410 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3411 ; GFX9-W64-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $exec
3412 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
3413 ; GFX9-W64-NEXT: ds_write_b32 v0, v1
3414 ; GFX9-W64-NEXT: s_endpgm
3416 ; GFX10-W32-LABEL: wqm_init_exec:
3417 ; GFX10-W32: ; %bb.0: ; %bb
3418 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, -1
3419 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
3420 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
3421 ; GFX10-W32-NEXT: s_mov_b32 s0, 0
3422 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3423 ; GFX10-W32-NEXT: s_mov_b32 s2, s0
3424 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
3425 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v0
3426 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
3427 ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v0
3428 ; GFX10-W32-NEXT: v_mov_b32_e32 v4, s0
3429 ; GFX10-W32-NEXT: s_mov_b32 s1, s0
3430 ; GFX10-W32-NEXT: s_mov_b32 s3, s0
3431 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3432 ; GFX10-W32-NEXT: ds_write_b32 v0, v4
3433 ; GFX10-W32-NEXT: s_endpgm
3434 bb:
3435 call void @llvm.amdgcn.init.exec(i64 -1)
3436 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
3437 %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
3438 store i32 %i, ptr addrspace(3) null, align 4
3439 ret void
3440 }
3442 ; Test a case that failed machine verification.
3443 define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) {
3444 ; GFX9-W64-LABEL: wqm_init_exec_switch:
3445 ; GFX9-W64: ; %bb.0:
3446 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3447 ; GFX9-W64-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
3448 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
3449 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3450 ; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
3451 ; GFX9-W64-NEXT: s_endpgm
3453 ; GFX10-W32-LABEL: wqm_init_exec_switch:
3454 ; GFX10-W32: ; %bb.0:
3455 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3456 ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3457 ; GFX10-W32-NEXT: v_cmpx_lt_i32_e32 0, v0
3458 ; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
3459 ; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
3460 ; GFX10-W32-NEXT: s_endpgm
3461 call void @llvm.amdgcn.init.exec(i64 0)
3462 switch i32 %arg, label %bb1 [
3474 define amdgpu_gs void @wqm_init_exec_wwm() {
3475 ; GFX9-W64-LABEL: wqm_init_exec_wwm:
3476 ; GFX9-W64: ; %bb.0:
3477 ; GFX9-W64-NEXT: s_mov_b64 exec, 0
3478 ; GFX9-W64-NEXT: s_mov_b32 s1, 0
3479 ; GFX9-W64-NEXT: s_mov_b32 s0, s1
3480 ; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0
3481 ; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0
3482 ; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0
3483 ; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0
3484 ; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
3485 ; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
3486 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
3487 ; GFX9-W64-NEXT: exp mrt0 off, off, off, off
3488 ; GFX9-W64-NEXT: s_endpgm
3490 ; GFX10-W32-LABEL: wqm_init_exec_wwm:
3491 ; GFX10-W32: ; %bb.0:
3492 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3493 ; GFX10-W32-NEXT: s_mov_b32 s1, 0
3494 ; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0
3495 ; GFX10-W32-NEXT: s_mov_b32 s0, s1
3496 ; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0
3497 ; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0
3498 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
3499 ; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0
3500 ; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0
3501 ; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
3502 ; GFX10-W32-NEXT: exp mrt0 off, off, off, off
3503 ; GFX10-W32-NEXT: s_endpgm
3504 call void @llvm.amdgcn.init.exec(i64 0)
3505 %i = call i64 @llvm.amdgcn.ballot.i64(i1 true)
3506 %i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
3507 %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0
3508 %i3 = bitcast <2 x i32> %i2 to i64
3509 %i4 = icmp ne i64 %i, 0
3510 %i5 = icmp ne i64 %i3, 0
3511 %i6 = xor i1 %i4, %i5
3512 %i7 = uitofp i1 %i6 to float
3513 call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
3514 ret void
3515 }
3517 ; Check that exact regions with execz-affected instructions are as short as possible
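; In the checks below the exact region is limited to the buffer_store inside the if
; block (an s_and_saveexec with the saved live mask followed by an exec restore)
; instead of switching the whole block out of WQM.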
3518 define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3519 ; GFX9-W64-LABEL: short_exact_regions:
3520 ; GFX9-W64: ; %bb.0: ; %main_body
3521 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3522 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3523 ; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
3524 ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3525 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3526 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
3527 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
3528 ; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
3529 ; GFX9-W64-NEXT: ; %bb.1: ; %if
3530 ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3531 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3532 ; GFX9-W64-NEXT: v_readfirstlane_b32 s16, v0
3533 ; GFX9-W64-NEXT: s_buffer_load_dword s16, s[8:11], s16 offset:0x0
3534 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3535 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s16
3536 ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
3537 ; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3538 ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
3539 ; GFX9-W64-NEXT: .LBB59_2: ; %endif
3540 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
3541 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3542 ; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
3543 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3544 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
3545 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3546 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3547 ; GFX9-W64-NEXT: ; return to shader part epilog
3549 ; GFX10-W32-LABEL: short_exact_regions:
3550 ; GFX10-W32: ; %bb.0: ; %main_body
3551 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3552 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3553 ; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
3554 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3555 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
3556 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3557 ; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
3558 ; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
3559 ; GFX10-W32-NEXT: ; %bb.1: ; %if
3560 ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3561 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3562 ; GFX10-W32-NEXT: v_readfirstlane_b32 s14, v0
3563 ; GFX10-W32-NEXT: s_buffer_load_dword s14, s[8:11], s14 offset:0x0
3564 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3565 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s14
3566 ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
3567 ; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3568 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
3569 ; GFX10-W32-NEXT: .LBB59_2: ; %endif
3570 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
3571 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3572 ; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3573 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3574 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
3575 ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3576 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3577 ; GFX10-W32-NEXT: ; return to shader part epilog
3578 main_body:
3579 %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3580 %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
3581 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3582 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3583 %cc = icmp uge i32 %hi, 16
3584 br i1 %cc, label %endif, label %if
3586 if:
3587 %idx1 = extractelement <4 x i32> %idx0, i64 0
3588 %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3589 %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0)
3591 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> undef, i32 %idx3, i32 0, i32 0, i32 0)
3592 br label %endif
3594 endif:
3595 %d = extractelement <4 x float> %tex1, i64 0
3596 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3597 %r0 = extractelement <4 x float> %tex1, i64 1
3598 %r1 = extractelement <4 x float> %tex2, i64 2
3599 %r2 = fadd float %r0, %r1
3600 %out = call float @llvm.amdgcn.wqm.f32(float %r2)
3601 ret float %out
3602 }
3605 ; Check that exact-region shortening doesn't prevent an early WQM exit
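; In the checks below exec is ANDed back to the live mask immediately after the first
; image_sample, so the global_load and the second sample already run in Exact mode
; rather than WQM being kept until the end of the block.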
3606 define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3607 ; GFX9-W64-LABEL: short_exact_regions_2:
3608 ; GFX9-W64: ; %bb.0: ; %main_body
3609 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3610 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3611 ; GFX9-W64-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
3612 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3613 ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3614 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3615 ; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
3616 ; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
3617 ; GFX9-W64-NEXT: ; kill: killed $vgpr3
3618 ; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
3619 ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3620 ; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
3621 ; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3622 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3623 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
3624 ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3625 ; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
3626 ; GFX9-W64-NEXT: ; return to shader part epilog
3628 ; GFX10-W32-LABEL: short_exact_regions_2:
3629 ; GFX10-W32: ; %bb.0: ; %main_body
3630 ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3631 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3632 ; GFX10-W32-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
3633 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3634 ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3635 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3636 ; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3637 ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3638 ; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
3639 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3640 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
3641 ; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3642 ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3643 ; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
3644 ; GFX10-W32-NEXT: ; return to shader part epilog
3645 main_body:
3646 %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3647 %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
3648 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3649 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3650 %idx1 = extractelement <4 x i32> %idx0, i64 0
3651 %d = extractelement <4 x float> %tex1, i64 0
3653 %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3655 %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3656 %idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0)
3658 %r0 = extractelement <4 x float> %tex1, i64 1
3659 %r1 = extractelement <4 x float> %tex2, i64 2
3660 %r2 = fadd float %r0, %r1
3661 %out = fadd float %r2, %idx3
3662 ret float %out
3663 }
3666 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3667 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
3669 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3670 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3671 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3672 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3673 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3674 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
3676 declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3677 declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
3678 declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
3679 declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
3680 declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
3681 declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
3683 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3684 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3685 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3686 declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3687 declare void @llvm.amdgcn.kill(i1) #1
3688 declare float @llvm.amdgcn.wqm.f32(float) #3
3689 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3690 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3691 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3692 declare float @llvm.amdgcn.wwm.f32(float) #3
3693 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3694 declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3695 declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3696 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3697 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3698 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3699 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3700 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3701 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3702 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
3703 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3704 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
3705 declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare void @llvm.amdgcn.init.exec(i64) #1
declare i64 @llvm.amdgcn.ballot.i64(i1) #3
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #7
3707 attributes #1 = { nounwind }
3708 attributes #2 = { nounwind readonly }
3709 attributes #3 = { nounwind readnone }
3710 attributes #4 = { nounwind readnone convergent }
3711 attributes #5 = { "amdgpu-ps-wqm-outputs" }
3712 attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3713 attributes #7 = { nounwind readnone willreturn }