1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
; gather4_2d - pixel shader that calls llvm.amdgcn.image.gather4.2d with the
; f32 coordinates %s and %t and returns the <4 x float> result.
; Both check groups expect s2..s13 to be copied down to s0..s11 (read by the
; gather as s[0:7] and s[8:11]), exec to be saved, widened with s_wqm and then
; and-ed with the saved copy before the gather, and a single image_gather4
; with dmask:0x1 (the call passes i32 1 as its first operand).
6 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
7 ; GFX6-LABEL: gather4_2d:
8 ; GFX6: ; %bb.0: ; %main_body
9 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
10 ; GFX6-NEXT: s_mov_b32 s0, s2
11 ; GFX6-NEXT: s_mov_b32 s1, s3
12 ; GFX6-NEXT: s_mov_b32 s2, s4
13 ; GFX6-NEXT: s_mov_b32 s3, s5
14 ; GFX6-NEXT: s_mov_b32 s4, s6
15 ; GFX6-NEXT: s_mov_b32 s5, s7
16 ; GFX6-NEXT: s_mov_b32 s6, s8
17 ; GFX6-NEXT: s_mov_b32 s7, s9
18 ; GFX6-NEXT: s_mov_b32 s8, s10
19 ; GFX6-NEXT: s_mov_b32 s9, s11
20 ; GFX6-NEXT: s_mov_b32 s10, s12
21 ; GFX6-NEXT: s_mov_b32 s11, s13
22 ; GFX6-NEXT: s_wqm_b64 exec, exec
23 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
24 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
25 ; GFX6-NEXT: s_waitcnt vmcnt(0)
26 ; GFX6-NEXT: ; return to shader part epilog
28 ; GFX10NSA-LABEL: gather4_2d:
29 ; GFX10NSA: ; %bb.0: ; %main_body
30 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
31 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
32 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
33 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
34 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
35 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
36 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
37 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
38 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
39 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
40 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
41 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
42 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
43 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
44 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
45 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
46 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
47 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test; the check lines above are autogenerated, do not edit them by hand.
49 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_2d_tfe - variant of the 2d gather whose intrinsic returns the
; aggregate { <4 x float>, i32 } and passes i32 1 as its second-to-last operand
; (the other gather tests in this file pass 0 there).
; Expected code moves the coordinates to v[5:6], zero-fills v0..v4
; (v0 = 0, v1..v4 copied from v0) and issues image_gather4 with a five-register
; destination v[0:4] and the tfe modifier.
54 define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
55 ; GFX6-LABEL: gather4_2d_tfe:
56 ; GFX6: ; %bb.0: ; %main_body
57 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
58 ; GFX6-NEXT: s_mov_b32 s0, s2
59 ; GFX6-NEXT: s_mov_b32 s1, s3
60 ; GFX6-NEXT: s_mov_b32 s2, s4
61 ; GFX6-NEXT: s_mov_b32 s3, s5
62 ; GFX6-NEXT: s_mov_b32 s4, s6
63 ; GFX6-NEXT: s_mov_b32 s5, s7
64 ; GFX6-NEXT: s_mov_b32 s6, s8
65 ; GFX6-NEXT: s_mov_b32 s7, s9
66 ; GFX6-NEXT: s_mov_b32 s8, s10
67 ; GFX6-NEXT: s_mov_b32 s9, s11
68 ; GFX6-NEXT: s_mov_b32 s10, s12
69 ; GFX6-NEXT: s_mov_b32 s11, s13
70 ; GFX6-NEXT: s_wqm_b64 exec, exec
71 ; GFX6-NEXT: v_mov_b32_e32 v5, v0
72 ; GFX6-NEXT: v_mov_b32_e32 v0, 0
73 ; GFX6-NEXT: v_mov_b32_e32 v6, v1
74 ; GFX6-NEXT: v_mov_b32_e32 v1, v0
75 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
76 ; GFX6-NEXT: v_mov_b32_e32 v3, v0
77 ; GFX6-NEXT: v_mov_b32_e32 v4, v0
78 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
79 ; GFX6-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe
80 ; GFX6-NEXT: s_waitcnt vmcnt(0)
81 ; GFX6-NEXT: ; return to shader part epilog
83 ; GFX10NSA-LABEL: gather4_2d_tfe:
84 ; GFX10NSA: ; %bb.0: ; %main_body
85 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
86 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
87 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
88 ; GFX10NSA-NEXT: v_mov_b32_e32 v5, v0
89 ; GFX10NSA-NEXT: v_mov_b32_e32 v0, 0
90 ; GFX10NSA-NEXT: v_mov_b32_e32 v6, v1
91 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
92 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
93 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
94 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
95 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
96 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
97 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
98 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
99 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
100 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
101 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
102 ; GFX10NSA-NEXT: v_mov_b32_e32 v1, v0
103 ; GFX10NSA-NEXT: v_mov_b32_e32 v2, v0
104 ; GFX10NSA-NEXT: v_mov_b32_e32 v3, v0
105 ; GFX10NSA-NEXT: v_mov_b32_e32 v4, v0
106 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
107 ; GFX10NSA-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
108 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
109 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test, struct-returning form.
110 %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
; Only the <4 x float> member (index 0) is extracted here; the i32 member is not read in the visible code.
111 %r = extractvalue { <4 x float>, i32 } %v, 0
; gather4_cube - calls llvm.amdgcn.image.gather4.cube with three f32
; coordinates (%s, %t, %face), which the expected gather reads as v[0:2].
; The first check group expects the da modifier on image_gather4; the second
; expects dim:SQ_RSRC_IMG_CUBE instead. Both expect the s_wqm / s_and sequence
; on exec ahead of the gather.
116 define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
117 ; GFX6-LABEL: gather4_cube:
118 ; GFX6: ; %bb.0: ; %main_body
119 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
120 ; GFX6-NEXT: s_mov_b32 s0, s2
121 ; GFX6-NEXT: s_mov_b32 s1, s3
122 ; GFX6-NEXT: s_mov_b32 s2, s4
123 ; GFX6-NEXT: s_mov_b32 s3, s5
124 ; GFX6-NEXT: s_mov_b32 s4, s6
125 ; GFX6-NEXT: s_mov_b32 s5, s7
126 ; GFX6-NEXT: s_mov_b32 s6, s8
127 ; GFX6-NEXT: s_mov_b32 s7, s9
128 ; GFX6-NEXT: s_mov_b32 s8, s10
129 ; GFX6-NEXT: s_mov_b32 s9, s11
130 ; GFX6-NEXT: s_mov_b32 s10, s12
131 ; GFX6-NEXT: s_mov_b32 s11, s13
132 ; GFX6-NEXT: s_wqm_b64 exec, exec
133 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
134 ; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
135 ; GFX6-NEXT: s_waitcnt vmcnt(0)
136 ; GFX6-NEXT: ; return to shader part epilog
138 ; GFX10NSA-LABEL: gather4_cube:
139 ; GFX10NSA: ; %bb.0: ; %main_body
140 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
141 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
142 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
143 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
144 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
145 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
146 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
147 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
148 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
149 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
150 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
151 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
152 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
153 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
154 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
155 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
156 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
157 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (cube dimension).
158 %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_2darray - calls llvm.amdgcn.image.gather4.2darray with three f32
; coordinates (%s, %t, %slice), which the expected gather reads as v[0:2].
; The first check group expects the da modifier on image_gather4; the second
; expects dim:SQ_RSRC_IMG_2D_ARRAY instead. Both expect the s_wqm / s_and
; sequence on exec ahead of the gather.
162 define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
163 ; GFX6-LABEL: gather4_2darray:
164 ; GFX6: ; %bb.0: ; %main_body
165 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
166 ; GFX6-NEXT: s_mov_b32 s0, s2
167 ; GFX6-NEXT: s_mov_b32 s1, s3
168 ; GFX6-NEXT: s_mov_b32 s2, s4
169 ; GFX6-NEXT: s_mov_b32 s3, s5
170 ; GFX6-NEXT: s_mov_b32 s4, s6
171 ; GFX6-NEXT: s_mov_b32 s5, s7
172 ; GFX6-NEXT: s_mov_b32 s6, s8
173 ; GFX6-NEXT: s_mov_b32 s7, s9
174 ; GFX6-NEXT: s_mov_b32 s8, s10
175 ; GFX6-NEXT: s_mov_b32 s9, s11
176 ; GFX6-NEXT: s_mov_b32 s10, s12
177 ; GFX6-NEXT: s_mov_b32 s11, s13
178 ; GFX6-NEXT: s_wqm_b64 exec, exec
179 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
180 ; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
181 ; GFX6-NEXT: s_waitcnt vmcnt(0)
182 ; GFX6-NEXT: ; return to shader part epilog
184 ; GFX10NSA-LABEL: gather4_2darray:
185 ; GFX10NSA: ; %bb.0: ; %main_body
186 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
187 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
188 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
189 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
190 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
191 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
192 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
193 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
194 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
195 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
196 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
197 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
198 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
199 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
200 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
201 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
202 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
203 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (2darray dimension).
205 %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_c_2d - calls llvm.amdgcn.image.gather4.c.2d, which takes %zcompare
; ahead of the %s and %t coordinates.
; Expected code selects image_gather4_c with a three-register address v[0:2]
; and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
209 define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
210 ; GFX6-LABEL: gather4_c_2d:
211 ; GFX6: ; %bb.0: ; %main_body
212 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
213 ; GFX6-NEXT: s_mov_b32 s0, s2
214 ; GFX6-NEXT: s_mov_b32 s1, s3
215 ; GFX6-NEXT: s_mov_b32 s2, s4
216 ; GFX6-NEXT: s_mov_b32 s3, s5
217 ; GFX6-NEXT: s_mov_b32 s4, s6
218 ; GFX6-NEXT: s_mov_b32 s5, s7
219 ; GFX6-NEXT: s_mov_b32 s6, s8
220 ; GFX6-NEXT: s_mov_b32 s7, s9
221 ; GFX6-NEXT: s_mov_b32 s8, s10
222 ; GFX6-NEXT: s_mov_b32 s9, s11
223 ; GFX6-NEXT: s_mov_b32 s10, s12
224 ; GFX6-NEXT: s_mov_b32 s11, s13
225 ; GFX6-NEXT: s_wqm_b64 exec, exec
226 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
227 ; GFX6-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
228 ; GFX6-NEXT: s_waitcnt vmcnt(0)
229 ; GFX6-NEXT: ; return to shader part epilog
231 ; GFX10NSA-LABEL: gather4_c_2d:
232 ; GFX10NSA: ; %bb.0: ; %main_body
233 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
234 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
235 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
236 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
237 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
238 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
239 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
240 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
241 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
242 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
243 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
244 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
245 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
246 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
247 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
248 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
249 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
250 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.c variant).
252 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_cl_2d - calls llvm.amdgcn.image.gather4.cl.2d, which takes %clamp
; after the %s and %t coordinates.
; Expected code selects image_gather4_cl with a three-register address v[0:2]
; and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
256 define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
257 ; GFX6-LABEL: gather4_cl_2d:
258 ; GFX6: ; %bb.0: ; %main_body
259 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
260 ; GFX6-NEXT: s_mov_b32 s0, s2
261 ; GFX6-NEXT: s_mov_b32 s1, s3
262 ; GFX6-NEXT: s_mov_b32 s2, s4
263 ; GFX6-NEXT: s_mov_b32 s3, s5
264 ; GFX6-NEXT: s_mov_b32 s4, s6
265 ; GFX6-NEXT: s_mov_b32 s5, s7
266 ; GFX6-NEXT: s_mov_b32 s6, s8
267 ; GFX6-NEXT: s_mov_b32 s7, s9
268 ; GFX6-NEXT: s_mov_b32 s8, s10
269 ; GFX6-NEXT: s_mov_b32 s9, s11
270 ; GFX6-NEXT: s_mov_b32 s10, s12
271 ; GFX6-NEXT: s_mov_b32 s11, s13
272 ; GFX6-NEXT: s_wqm_b64 exec, exec
273 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
274 ; GFX6-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
275 ; GFX6-NEXT: s_waitcnt vmcnt(0)
276 ; GFX6-NEXT: ; return to shader part epilog
278 ; GFX10NSA-LABEL: gather4_cl_2d:
279 ; GFX10NSA: ; %bb.0: ; %main_body
280 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
281 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
282 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
283 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
284 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
285 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
286 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
287 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
288 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
289 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
290 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
291 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
292 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
293 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
294 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
295 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
296 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
297 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.cl variant).
299 %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_c_cl_2d - calls llvm.amdgcn.image.gather4.c.cl.2d with %zcompare,
; the %s and %t coordinates, and %clamp.
; Expected code selects image_gather4_c_cl with a four-register address v[0:3]
; and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
303 define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
304 ; GFX6-LABEL: gather4_c_cl_2d:
305 ; GFX6: ; %bb.0: ; %main_body
306 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
307 ; GFX6-NEXT: s_mov_b32 s0, s2
308 ; GFX6-NEXT: s_mov_b32 s1, s3
309 ; GFX6-NEXT: s_mov_b32 s2, s4
310 ; GFX6-NEXT: s_mov_b32 s3, s5
311 ; GFX6-NEXT: s_mov_b32 s4, s6
312 ; GFX6-NEXT: s_mov_b32 s5, s7
313 ; GFX6-NEXT: s_mov_b32 s6, s8
314 ; GFX6-NEXT: s_mov_b32 s7, s9
315 ; GFX6-NEXT: s_mov_b32 s8, s10
316 ; GFX6-NEXT: s_mov_b32 s9, s11
317 ; GFX6-NEXT: s_mov_b32 s10, s12
318 ; GFX6-NEXT: s_mov_b32 s11, s13
319 ; GFX6-NEXT: s_wqm_b64 exec, exec
320 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
321 ; GFX6-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
322 ; GFX6-NEXT: s_waitcnt vmcnt(0)
323 ; GFX6-NEXT: ; return to shader part epilog
325 ; GFX10NSA-LABEL: gather4_c_cl_2d:
326 ; GFX10NSA: ; %bb.0: ; %main_body
327 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
328 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
329 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
330 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
331 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
332 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
333 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
334 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
335 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
336 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
337 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
338 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
339 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
340 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
341 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
342 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
343 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
344 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.c.cl variant).
346 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_b_2d - calls llvm.amdgcn.image.gather4.b.2d, which takes %bias ahead
; of the %s and %t coordinates; the intrinsic name carries an extra .f32 type
; suffix compared with the non-bias variants.
; Expected code selects image_gather4_b with a three-register address v[0:2]
; and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
350 define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
351 ; GFX6-LABEL: gather4_b_2d:
352 ; GFX6: ; %bb.0: ; %main_body
353 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
354 ; GFX6-NEXT: s_mov_b32 s0, s2
355 ; GFX6-NEXT: s_mov_b32 s1, s3
356 ; GFX6-NEXT: s_mov_b32 s2, s4
357 ; GFX6-NEXT: s_mov_b32 s3, s5
358 ; GFX6-NEXT: s_mov_b32 s4, s6
359 ; GFX6-NEXT: s_mov_b32 s5, s7
360 ; GFX6-NEXT: s_mov_b32 s6, s8
361 ; GFX6-NEXT: s_mov_b32 s7, s9
362 ; GFX6-NEXT: s_mov_b32 s8, s10
363 ; GFX6-NEXT: s_mov_b32 s9, s11
364 ; GFX6-NEXT: s_mov_b32 s10, s12
365 ; GFX6-NEXT: s_mov_b32 s11, s13
366 ; GFX6-NEXT: s_wqm_b64 exec, exec
367 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
368 ; GFX6-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
369 ; GFX6-NEXT: s_waitcnt vmcnt(0)
370 ; GFX6-NEXT: ; return to shader part epilog
372 ; GFX10NSA-LABEL: gather4_b_2d:
373 ; GFX10NSA: ; %bb.0: ; %main_body
374 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
375 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
376 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
377 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
378 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
379 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
380 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
381 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
382 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
383 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
384 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
385 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
386 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
387 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
388 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
389 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
390 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
391 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.b variant).
393 %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_c_b_2d - calls llvm.amdgcn.image.gather4.c.b.2d with %bias,
; %zcompare and the %s and %t coordinates, in that operand order.
; Expected code selects image_gather4_c_b with a four-register address v[0:3]
; and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
397 define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
398 ; GFX6-LABEL: gather4_c_b_2d:
399 ; GFX6: ; %bb.0: ; %main_body
400 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
401 ; GFX6-NEXT: s_mov_b32 s0, s2
402 ; GFX6-NEXT: s_mov_b32 s1, s3
403 ; GFX6-NEXT: s_mov_b32 s2, s4
404 ; GFX6-NEXT: s_mov_b32 s3, s5
405 ; GFX6-NEXT: s_mov_b32 s4, s6
406 ; GFX6-NEXT: s_mov_b32 s5, s7
407 ; GFX6-NEXT: s_mov_b32 s6, s8
408 ; GFX6-NEXT: s_mov_b32 s7, s9
409 ; GFX6-NEXT: s_mov_b32 s8, s10
410 ; GFX6-NEXT: s_mov_b32 s9, s11
411 ; GFX6-NEXT: s_mov_b32 s10, s12
412 ; GFX6-NEXT: s_mov_b32 s11, s13
413 ; GFX6-NEXT: s_wqm_b64 exec, exec
414 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
415 ; GFX6-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
416 ; GFX6-NEXT: s_waitcnt vmcnt(0)
417 ; GFX6-NEXT: ; return to shader part epilog
419 ; GFX10NSA-LABEL: gather4_c_b_2d:
420 ; GFX10NSA: ; %bb.0: ; %main_body
421 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
422 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
423 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
424 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
425 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
426 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
427 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
428 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
429 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
430 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
431 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
432 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
433 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
434 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
435 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
436 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
437 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
438 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.c.b variant).
440 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_b_cl_2d - calls llvm.amdgcn.image.gather4.b.cl.2d with %bias, the
; %s and %t coordinates, and %clamp, in that operand order.
; Expected code selects image_gather4_b_cl with a four-register address v[0:3]
; and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
444 define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
445 ; GFX6-LABEL: gather4_b_cl_2d:
446 ; GFX6: ; %bb.0: ; %main_body
447 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
448 ; GFX6-NEXT: s_mov_b32 s0, s2
449 ; GFX6-NEXT: s_mov_b32 s1, s3
450 ; GFX6-NEXT: s_mov_b32 s2, s4
451 ; GFX6-NEXT: s_mov_b32 s3, s5
452 ; GFX6-NEXT: s_mov_b32 s4, s6
453 ; GFX6-NEXT: s_mov_b32 s5, s7
454 ; GFX6-NEXT: s_mov_b32 s6, s8
455 ; GFX6-NEXT: s_mov_b32 s7, s9
456 ; GFX6-NEXT: s_mov_b32 s8, s10
457 ; GFX6-NEXT: s_mov_b32 s9, s11
458 ; GFX6-NEXT: s_mov_b32 s10, s12
459 ; GFX6-NEXT: s_mov_b32 s11, s13
460 ; GFX6-NEXT: s_wqm_b64 exec, exec
461 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
462 ; GFX6-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
463 ; GFX6-NEXT: s_waitcnt vmcnt(0)
464 ; GFX6-NEXT: ; return to shader part epilog
466 ; GFX10NSA-LABEL: gather4_b_cl_2d:
467 ; GFX10NSA: ; %bb.0: ; %main_body
468 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
469 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
470 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
471 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
472 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
473 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
474 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
475 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
476 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
477 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
478 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
479 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
480 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
481 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
482 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
483 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
484 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
485 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.b.cl variant).
487 %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_c_b_cl_2d - calls llvm.amdgcn.image.gather4.c.b.cl.2d with %bias,
; %zcompare, the %s and %t coordinates, and %clamp, in that operand order.
; Expected code selects image_gather4_c_b_cl with a five-register address
; v[0:4] and dmask:0x1, preceded by the s_wqm / s_and sequence on exec.
491 define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
492 ; GFX6-LABEL: gather4_c_b_cl_2d:
493 ; GFX6: ; %bb.0: ; %main_body
494 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
495 ; GFX6-NEXT: s_mov_b32 s0, s2
496 ; GFX6-NEXT: s_mov_b32 s1, s3
497 ; GFX6-NEXT: s_mov_b32 s2, s4
498 ; GFX6-NEXT: s_mov_b32 s3, s5
499 ; GFX6-NEXT: s_mov_b32 s4, s6
500 ; GFX6-NEXT: s_mov_b32 s5, s7
501 ; GFX6-NEXT: s_mov_b32 s6, s8
502 ; GFX6-NEXT: s_mov_b32 s7, s9
503 ; GFX6-NEXT: s_mov_b32 s8, s10
504 ; GFX6-NEXT: s_mov_b32 s9, s11
505 ; GFX6-NEXT: s_mov_b32 s10, s12
506 ; GFX6-NEXT: s_mov_b32 s11, s13
507 ; GFX6-NEXT: s_wqm_b64 exec, exec
508 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
509 ; GFX6-NEXT: image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
510 ; GFX6-NEXT: s_waitcnt vmcnt(0)
511 ; GFX6-NEXT: ; return to shader part epilog
513 ; GFX10NSA-LABEL: gather4_c_b_cl_2d:
514 ; GFX10NSA: ; %bb.0: ; %main_body
515 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
516 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
517 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
518 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
519 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
520 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
521 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
522 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
523 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
524 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
525 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
526 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
527 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
528 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
529 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
530 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
531 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
532 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.c.b.cl variant).
534 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_l_2d - calls llvm.amdgcn.image.gather4.l.2d, which takes %lod after
; the %s and %t coordinates.
; Expected code selects image_gather4_l with a three-register address v[0:2]
; and dmask:0x1. The expected output for this function contains no s_wqm or
; other write to exec, only the s2..s13 to s0..s11 copies before the gather.
538 define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
539 ; GFX6-LABEL: gather4_l_2d:
540 ; GFX6: ; %bb.0: ; %main_body
541 ; GFX6-NEXT: s_mov_b32 s0, s2
542 ; GFX6-NEXT: s_mov_b32 s1, s3
543 ; GFX6-NEXT: s_mov_b32 s2, s4
544 ; GFX6-NEXT: s_mov_b32 s3, s5
545 ; GFX6-NEXT: s_mov_b32 s4, s6
546 ; GFX6-NEXT: s_mov_b32 s5, s7
547 ; GFX6-NEXT: s_mov_b32 s6, s8
548 ; GFX6-NEXT: s_mov_b32 s7, s9
549 ; GFX6-NEXT: s_mov_b32 s8, s10
550 ; GFX6-NEXT: s_mov_b32 s9, s11
551 ; GFX6-NEXT: s_mov_b32 s10, s12
552 ; GFX6-NEXT: s_mov_b32 s11, s13
553 ; GFX6-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
554 ; GFX6-NEXT: s_waitcnt vmcnt(0)
555 ; GFX6-NEXT: ; return to shader part epilog
557 ; GFX10NSA-LABEL: gather4_l_2d:
558 ; GFX10NSA: ; %bb.0: ; %main_body
559 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
560 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
561 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
562 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
563 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
564 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
565 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
566 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
567 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
568 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
569 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
570 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
571 ; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
572 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
573 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.l variant).
575 %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_c_l_2d - calls llvm.amdgcn.image.gather4.c.l.2d with %zcompare, the
; %s and %t coordinates, and %lod, in that operand order.
; Expected code selects image_gather4_c_l with a four-register address v[0:3]
; and dmask:0x1. The expected output for this function contains no s_wqm or
; other write to exec, only the s2..s13 to s0..s11 copies before the gather.
579 define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
580 ; GFX6-LABEL: gather4_c_l_2d:
581 ; GFX6: ; %bb.0: ; %main_body
582 ; GFX6-NEXT: s_mov_b32 s0, s2
583 ; GFX6-NEXT: s_mov_b32 s1, s3
584 ; GFX6-NEXT: s_mov_b32 s2, s4
585 ; GFX6-NEXT: s_mov_b32 s3, s5
586 ; GFX6-NEXT: s_mov_b32 s4, s6
587 ; GFX6-NEXT: s_mov_b32 s5, s7
588 ; GFX6-NEXT: s_mov_b32 s6, s8
589 ; GFX6-NEXT: s_mov_b32 s7, s9
590 ; GFX6-NEXT: s_mov_b32 s8, s10
591 ; GFX6-NEXT: s_mov_b32 s9, s11
592 ; GFX6-NEXT: s_mov_b32 s10, s12
593 ; GFX6-NEXT: s_mov_b32 s11, s13
594 ; GFX6-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
595 ; GFX6-NEXT: s_waitcnt vmcnt(0)
596 ; GFX6-NEXT: ; return to shader part epilog
598 ; GFX10NSA-LABEL: gather4_c_l_2d:
599 ; GFX10NSA: ; %bb.0: ; %main_body
600 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
601 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
602 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
603 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
604 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
605 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
606 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
607 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
608 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
609 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
610 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
611 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
612 ; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
613 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
614 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.c.l variant).
616 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_lz_2d - calls llvm.amdgcn.image.gather4.lz.2d with only the %s and
; %t coordinates.
; Expected code selects image_gather4_lz with a two-register address v[0:1]
; and dmask:0x1. The expected output for this function contains no s_wqm or
; other write to exec, only the s2..s13 to s0..s11 copies before the gather.
620 define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
621 ; GFX6-LABEL: gather4_lz_2d:
622 ; GFX6: ; %bb.0: ; %main_body
623 ; GFX6-NEXT: s_mov_b32 s0, s2
624 ; GFX6-NEXT: s_mov_b32 s1, s3
625 ; GFX6-NEXT: s_mov_b32 s2, s4
626 ; GFX6-NEXT: s_mov_b32 s3, s5
627 ; GFX6-NEXT: s_mov_b32 s4, s6
628 ; GFX6-NEXT: s_mov_b32 s5, s7
629 ; GFX6-NEXT: s_mov_b32 s6, s8
630 ; GFX6-NEXT: s_mov_b32 s7, s9
631 ; GFX6-NEXT: s_mov_b32 s8, s10
632 ; GFX6-NEXT: s_mov_b32 s9, s11
633 ; GFX6-NEXT: s_mov_b32 s10, s12
634 ; GFX6-NEXT: s_mov_b32 s11, s13
635 ; GFX6-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
636 ; GFX6-NEXT: s_waitcnt vmcnt(0)
637 ; GFX6-NEXT: ; return to shader part epilog
639 ; GFX10NSA-LABEL: gather4_lz_2d:
640 ; GFX10NSA: ; %bb.0: ; %main_body
641 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
642 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
643 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
644 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
645 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
646 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
647 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
648 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
649 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
650 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
651 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
652 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
653 ; GFX10NSA-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
654 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
655 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.lz variant).
657 %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_c_lz_2d - calls llvm.amdgcn.image.gather4.c.lz.2d with %zcompare
; ahead of the %s and %t coordinates.
; Expected code selects image_gather4_c_lz with a three-register address
; v[0:2] and dmask:0x1. The expected output for this function contains no
; s_wqm or other write to exec, only the s2..s13 to s0..s11 copies before the
; gather.
661 define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
662 ; GFX6-LABEL: gather4_c_lz_2d:
663 ; GFX6: ; %bb.0: ; %main_body
664 ; GFX6-NEXT: s_mov_b32 s0, s2
665 ; GFX6-NEXT: s_mov_b32 s1, s3
666 ; GFX6-NEXT: s_mov_b32 s2, s4
667 ; GFX6-NEXT: s_mov_b32 s3, s5
668 ; GFX6-NEXT: s_mov_b32 s4, s6
669 ; GFX6-NEXT: s_mov_b32 s5, s7
670 ; GFX6-NEXT: s_mov_b32 s6, s8
671 ; GFX6-NEXT: s_mov_b32 s7, s9
672 ; GFX6-NEXT: s_mov_b32 s8, s10
673 ; GFX6-NEXT: s_mov_b32 s9, s11
674 ; GFX6-NEXT: s_mov_b32 s10, s12
675 ; GFX6-NEXT: s_mov_b32 s11, s13
676 ; GFX6-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
677 ; GFX6-NEXT: s_waitcnt vmcnt(0)
678 ; GFX6-NEXT: ; return to shader part epilog
680 ; GFX10NSA-LABEL: gather4_c_lz_2d:
681 ; GFX10NSA: ; %bb.0: ; %main_body
682 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
683 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
684 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
685 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
686 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
687 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
688 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
689 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
690 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
691 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
692 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
693 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
694 ; GFX10NSA-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
695 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
696 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test (.c.lz variant).
698 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_2d_dmask_2 - calls llvm.amdgcn.image.gather4.2d with i32 2 as the
; first operand and the %s and %t coordinates.
; Expected code is image_gather4 with a two-register address v[0:1] and
; dmask:0x2, preceded by the s_wqm / s_and sequence on exec.
702 define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
703 ; GFX6-LABEL: gather4_2d_dmask_2:
704 ; GFX6: ; %bb.0: ; %main_body
705 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
706 ; GFX6-NEXT: s_mov_b32 s0, s2
707 ; GFX6-NEXT: s_mov_b32 s1, s3
708 ; GFX6-NEXT: s_mov_b32 s2, s4
709 ; GFX6-NEXT: s_mov_b32 s3, s5
710 ; GFX6-NEXT: s_mov_b32 s4, s6
711 ; GFX6-NEXT: s_mov_b32 s5, s7
712 ; GFX6-NEXT: s_mov_b32 s6, s8
713 ; GFX6-NEXT: s_mov_b32 s7, s9
714 ; GFX6-NEXT: s_mov_b32 s8, s10
715 ; GFX6-NEXT: s_mov_b32 s9, s11
716 ; GFX6-NEXT: s_mov_b32 s10, s12
717 ; GFX6-NEXT: s_mov_b32 s11, s13
718 ; GFX6-NEXT: s_wqm_b64 exec, exec
719 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
720 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2
721 ; GFX6-NEXT: s_waitcnt vmcnt(0)
722 ; GFX6-NEXT: ; return to shader part epilog
724 ; GFX10NSA-LABEL: gather4_2d_dmask_2:
725 ; GFX10NSA: ; %bb.0: ; %main_body
726 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
727 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
728 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
729 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
730 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
731 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
732 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
733 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
734 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
735 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
736 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
737 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
738 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
739 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
740 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
741 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D
742 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
743 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test; the leading i32 2 shows up as dmask:0x2 in the expected output.
745 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_2d_dmask_4 - calls llvm.amdgcn.image.gather4.2d with i32 4 as the
; first operand and the %s and %t coordinates.
; Expected code is image_gather4 with a two-register address v[0:1] and
; dmask:0x4, preceded by the s_wqm / s_and sequence on exec.
749 define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
750 ; GFX6-LABEL: gather4_2d_dmask_4:
751 ; GFX6: ; %bb.0: ; %main_body
752 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
753 ; GFX6-NEXT: s_mov_b32 s0, s2
754 ; GFX6-NEXT: s_mov_b32 s1, s3
755 ; GFX6-NEXT: s_mov_b32 s2, s4
756 ; GFX6-NEXT: s_mov_b32 s3, s5
757 ; GFX6-NEXT: s_mov_b32 s4, s6
758 ; GFX6-NEXT: s_mov_b32 s5, s7
759 ; GFX6-NEXT: s_mov_b32 s6, s8
760 ; GFX6-NEXT: s_mov_b32 s7, s9
761 ; GFX6-NEXT: s_mov_b32 s8, s10
762 ; GFX6-NEXT: s_mov_b32 s9, s11
763 ; GFX6-NEXT: s_mov_b32 s10, s12
764 ; GFX6-NEXT: s_mov_b32 s11, s13
765 ; GFX6-NEXT: s_wqm_b64 exec, exec
766 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
767 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4
768 ; GFX6-NEXT: s_waitcnt vmcnt(0)
769 ; GFX6-NEXT: ; return to shader part epilog
771 ; GFX10NSA-LABEL: gather4_2d_dmask_4:
772 ; GFX10NSA: ; %bb.0: ; %main_body
773 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
774 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
775 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
776 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
777 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
778 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
779 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
780 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
781 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
782 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
783 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
784 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
785 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
786 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
787 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
788 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D
789 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
790 ; GFX10NSA-NEXT: ; return to shader part epilog
; Intrinsic under test; the leading i32 4 shows up as dmask:0x4 in the expected output.
792 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; Test: 2D gather4 whose first intrinsic operand is the immediate 8.
; Both check prefixes expect a single image_gather4 carrying dmask:0x8, with
; s2..s13 copied down into s0..s11 to form its two SGPR operand ranges.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
796 define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
797 ; GFX6-LABEL: gather4_2d_dmask_8:
798 ; GFX6: ; %bb.0: ; %main_body
799 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
800 ; GFX6-NEXT: s_mov_b32 s0, s2
801 ; GFX6-NEXT: s_mov_b32 s1, s3
802 ; GFX6-NEXT: s_mov_b32 s2, s4
803 ; GFX6-NEXT: s_mov_b32 s3, s5
804 ; GFX6-NEXT: s_mov_b32 s4, s6
805 ; GFX6-NEXT: s_mov_b32 s5, s7
806 ; GFX6-NEXT: s_mov_b32 s6, s8
807 ; GFX6-NEXT: s_mov_b32 s7, s9
808 ; GFX6-NEXT: s_mov_b32 s8, s10
809 ; GFX6-NEXT: s_mov_b32 s9, s11
810 ; GFX6-NEXT: s_mov_b32 s10, s12
811 ; GFX6-NEXT: s_mov_b32 s11, s13
812 ; GFX6-NEXT: s_wqm_b64 exec, exec
813 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
814 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8
815 ; GFX6-NEXT: s_waitcnt vmcnt(0)
816 ; GFX6-NEXT: ; return to shader part epilog
818 ; GFX10NSA-LABEL: gather4_2d_dmask_8:
819 ; GFX10NSA: ; %bb.0: ; %main_body
820 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
821 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
822 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
823 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
824 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
825 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
826 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
827 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
828 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
829 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
830 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
831 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
832 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
833 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
834 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
835 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D
836 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
837 ; GFX10NSA-NEXT: ; return to shader part epilog
839 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; Declarations of the image.gather4 intrinsic forms. Each returns <4 x float>,
; except the sl_v4f32i32s form, which returns { <4 x float>, i32 }.
; Every declaration refers to attribute group #0, defined on the last line.
; Only the plain 2d.v4f32.f32 form is called by the nearby dmask tests; the
; remaining forms are presumably exercised earlier in the file -- TODO confirm.
843 declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
844 declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
845 declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
846 declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
847 declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
848 declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
849 declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
850 declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
851 declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
852 declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
853 declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
854 declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
855 declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
856 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
857 declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
; Attribute group referenced as #0 by every declaration above.
859 attributes #0 = { nounwind readonly }