1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
; Basic gather4 on a 2D image: %rsrc and %samp are passed inreg, and the two
; float coordinates (%s, %t) are ordinary arguments.
; Both check prefixes expect exec to be saved, s_wqm applied, and exec and-ed
; with the saved mask before a single image_gather4 that reads v[0:1] with
; dmask:0x1. The gfx1010 run additionally prints dim:SQ_RSRC_IMG_2D.
5 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
6 ; GFX6-LABEL: gather4_2d:
7 ; GFX6: ; %bb.0: ; %main_body
8 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
9 ; GFX6-NEXT: s_mov_b32 s0, s2
10 ; GFX6-NEXT: s_mov_b32 s1, s3
11 ; GFX6-NEXT: s_mov_b32 s2, s4
12 ; GFX6-NEXT: s_mov_b32 s3, s5
13 ; GFX6-NEXT: s_mov_b32 s4, s6
14 ; GFX6-NEXT: s_mov_b32 s5, s7
15 ; GFX6-NEXT: s_mov_b32 s6, s8
16 ; GFX6-NEXT: s_mov_b32 s7, s9
17 ; GFX6-NEXT: s_mov_b32 s8, s10
18 ; GFX6-NEXT: s_mov_b32 s9, s11
19 ; GFX6-NEXT: s_mov_b32 s10, s12
20 ; GFX6-NEXT: s_mov_b32 s11, s13
21 ; GFX6-NEXT: s_wqm_b64 exec, exec
22 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
23 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
24 ; GFX6-NEXT: s_waitcnt vmcnt(0)
25 ; GFX6-NEXT: ; return to shader part epilog
27 ; GFX10NSA-LABEL: gather4_2d:
28 ; GFX10NSA: ; %bb.0: ; %main_body
29 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
30 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
31 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
32 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
33 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
34 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
35 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
36 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
37 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
38 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
39 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
40 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
41 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
42 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
43 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
44 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
45 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
46 ; GFX10NSA-NEXT: ; return to shader part epilog
; The leading i32 1 lines up with the dmask:0x1 in the expected instruction.
48 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; 2D gather4 whose intrinsic returns { <4 x float>, i32 } instead of a plain
; vector. The checks expect the instruction to carry the tfe modifier and to
; write five registers v[0:4], all of which are set to 0 with v_mov_b32 first.
; The coordinates are copied out of v0/v1 into v[5:6] before v0/v1 are zeroed.
52 define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
53 ; GFX6-LABEL: gather4_2d_tfe:
54 ; GFX6: ; %bb.0: ; %main_body
55 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
56 ; GFX6-NEXT: s_mov_b32 s0, s2
57 ; GFX6-NEXT: s_mov_b32 s1, s3
58 ; GFX6-NEXT: s_mov_b32 s2, s4
59 ; GFX6-NEXT: s_mov_b32 s3, s5
60 ; GFX6-NEXT: s_mov_b32 s4, s6
61 ; GFX6-NEXT: s_mov_b32 s5, s7
62 ; GFX6-NEXT: s_mov_b32 s6, s8
63 ; GFX6-NEXT: s_mov_b32 s7, s9
64 ; GFX6-NEXT: s_mov_b32 s8, s10
65 ; GFX6-NEXT: s_mov_b32 s9, s11
66 ; GFX6-NEXT: s_mov_b32 s10, s12
67 ; GFX6-NEXT: s_mov_b32 s11, s13
68 ; GFX6-NEXT: s_wqm_b64 exec, exec
69 ; GFX6-NEXT: v_mov_b32_e32 v5, v0
70 ; GFX6-NEXT: v_mov_b32_e32 v0, 0
71 ; GFX6-NEXT: v_mov_b32_e32 v6, v1
72 ; GFX6-NEXT: v_mov_b32_e32 v1, v0
73 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
74 ; GFX6-NEXT: v_mov_b32_e32 v3, v0
75 ; GFX6-NEXT: v_mov_b32_e32 v4, v0
76 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
77 ; GFX6-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe
78 ; GFX6-NEXT: s_waitcnt vmcnt(0)
79 ; GFX6-NEXT: ; return to shader part epilog
81 ; GFX10NSA-LABEL: gather4_2d_tfe:
82 ; GFX10NSA: ; %bb.0: ; %main_body
83 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
84 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
85 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
86 ; GFX10NSA-NEXT: v_mov_b32_e32 v5, v0
87 ; GFX10NSA-NEXT: v_mov_b32_e32 v0, 0
88 ; GFX10NSA-NEXT: v_mov_b32_e32 v6, v1
89 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
90 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
91 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
92 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
93 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
94 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
95 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
96 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
97 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
98 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
99 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
100 ; GFX10NSA-NEXT: v_mov_b32_e32 v1, v0
101 ; GFX10NSA-NEXT: v_mov_b32_e32 v2, v0
102 ; GFX10NSA-NEXT: v_mov_b32_e32 v3, v0
103 ; GFX10NSA-NEXT: v_mov_b32_e32 v4, v0
104 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
105 ; GFX10NSA-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
106 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
107 ; GFX10NSA-NEXT: ; return to shader part epilog
; The penultimate i32 operand is 1 in this call; that appears to be what
; selects the tfe form checked above (NOTE(review): confirm operand meaning).
109 %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
; Only element 0 of the returned struct (the <4 x float>) is extracted.
110 %r = extractvalue { <4 x float>, i32 } %v, 0
; gather4 on a cube image with three float coordinates (%s, %t, %face).
; The checks expect the address in v[0:2]; the tahiti run prints the da
; modifier, while the gfx1010 run prints dim:SQ_RSRC_IMG_CUBE instead.
114 define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
115 ; GFX6-LABEL: gather4_cube:
116 ; GFX6: ; %bb.0: ; %main_body
117 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
118 ; GFX6-NEXT: s_mov_b32 s0, s2
119 ; GFX6-NEXT: s_mov_b32 s1, s3
120 ; GFX6-NEXT: s_mov_b32 s2, s4
121 ; GFX6-NEXT: s_mov_b32 s3, s5
122 ; GFX6-NEXT: s_mov_b32 s4, s6
123 ; GFX6-NEXT: s_mov_b32 s5, s7
124 ; GFX6-NEXT: s_mov_b32 s6, s8
125 ; GFX6-NEXT: s_mov_b32 s7, s9
126 ; GFX6-NEXT: s_mov_b32 s8, s10
127 ; GFX6-NEXT: s_mov_b32 s9, s11
128 ; GFX6-NEXT: s_mov_b32 s10, s12
129 ; GFX6-NEXT: s_mov_b32 s11, s13
130 ; GFX6-NEXT: s_wqm_b64 exec, exec
131 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
132 ; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
133 ; GFX6-NEXT: s_waitcnt vmcnt(0)
134 ; GFX6-NEXT: ; return to shader part epilog
136 ; GFX10NSA-LABEL: gather4_cube:
137 ; GFX10NSA: ; %bb.0: ; %main_body
138 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
139 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
140 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
141 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
142 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
143 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
144 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
145 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
146 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
147 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
148 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
149 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
150 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
151 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
152 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
153 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
154 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
155 ; GFX10NSA-NEXT: ; return to shader part epilog
; Coordinates are passed in the order %s, %t, %face, ahead of %rsrc and %samp.
157 %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4 on a 2D array image with three float coordinates (%s, %t, %slice).
; The checks expect the address in v[0:2]; the tahiti run prints the da
; modifier, while the gfx1010 run prints dim:SQ_RSRC_IMG_2D_ARRAY instead.
161 define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
162 ; GFX6-LABEL: gather4_2darray:
163 ; GFX6: ; %bb.0: ; %main_body
164 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
165 ; GFX6-NEXT: s_mov_b32 s0, s2
166 ; GFX6-NEXT: s_mov_b32 s1, s3
167 ; GFX6-NEXT: s_mov_b32 s2, s4
168 ; GFX6-NEXT: s_mov_b32 s3, s5
169 ; GFX6-NEXT: s_mov_b32 s4, s6
170 ; GFX6-NEXT: s_mov_b32 s5, s7
171 ; GFX6-NEXT: s_mov_b32 s6, s8
172 ; GFX6-NEXT: s_mov_b32 s7, s9
173 ; GFX6-NEXT: s_mov_b32 s8, s10
174 ; GFX6-NEXT: s_mov_b32 s9, s11
175 ; GFX6-NEXT: s_mov_b32 s10, s12
176 ; GFX6-NEXT: s_mov_b32 s11, s13
177 ; GFX6-NEXT: s_wqm_b64 exec, exec
178 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
179 ; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
180 ; GFX6-NEXT: s_waitcnt vmcnt(0)
181 ; GFX6-NEXT: ; return to shader part epilog
183 ; GFX10NSA-LABEL: gather4_2darray:
184 ; GFX10NSA: ; %bb.0: ; %main_body
185 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
186 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
187 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
188 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
189 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
190 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
191 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
192 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
193 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
194 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
195 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
196 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
197 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
198 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
199 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
200 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
201 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
202 ; GFX10NSA-NEXT: ; return to shader part epilog
; Coordinates are passed in the order %s, %t, %slice, ahead of %rsrc and %samp.
204 %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .c variant on a 2D image: an extra float %zcompare precedes the two
; coordinates. The checks expect image_gather4_c with a three-register
; address v[0:2], preceded by the s_wqm / exec and-with-saved-mask sequence.
208 define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
209 ; GFX6-LABEL: gather4_c_2d:
210 ; GFX6: ; %bb.0: ; %main_body
211 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
212 ; GFX6-NEXT: s_mov_b32 s0, s2
213 ; GFX6-NEXT: s_mov_b32 s1, s3
214 ; GFX6-NEXT: s_mov_b32 s2, s4
215 ; GFX6-NEXT: s_mov_b32 s3, s5
216 ; GFX6-NEXT: s_mov_b32 s4, s6
217 ; GFX6-NEXT: s_mov_b32 s5, s7
218 ; GFX6-NEXT: s_mov_b32 s6, s8
219 ; GFX6-NEXT: s_mov_b32 s7, s9
220 ; GFX6-NEXT: s_mov_b32 s8, s10
221 ; GFX6-NEXT: s_mov_b32 s9, s11
222 ; GFX6-NEXT: s_mov_b32 s10, s12
223 ; GFX6-NEXT: s_mov_b32 s11, s13
224 ; GFX6-NEXT: s_wqm_b64 exec, exec
225 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
226 ; GFX6-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
227 ; GFX6-NEXT: s_waitcnt vmcnt(0)
228 ; GFX6-NEXT: ; return to shader part epilog
230 ; GFX10NSA-LABEL: gather4_c_2d:
231 ; GFX10NSA: ; %bb.0: ; %main_body
232 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
233 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
234 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
235 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
236 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
237 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
238 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
239 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
240 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
241 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
242 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
243 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
244 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
245 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
246 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
247 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
248 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
249 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %zcompare, %s, %t.
251 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .cl variant on a 2D image: an extra float %clamp follows the two
; coordinates. The checks expect image_gather4_cl with a three-register
; address v[0:2], preceded by the s_wqm / exec and-with-saved-mask sequence.
255 define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
256 ; GFX6-LABEL: gather4_cl_2d:
257 ; GFX6: ; %bb.0: ; %main_body
258 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
259 ; GFX6-NEXT: s_mov_b32 s0, s2
260 ; GFX6-NEXT: s_mov_b32 s1, s3
261 ; GFX6-NEXT: s_mov_b32 s2, s4
262 ; GFX6-NEXT: s_mov_b32 s3, s5
263 ; GFX6-NEXT: s_mov_b32 s4, s6
264 ; GFX6-NEXT: s_mov_b32 s5, s7
265 ; GFX6-NEXT: s_mov_b32 s6, s8
266 ; GFX6-NEXT: s_mov_b32 s7, s9
267 ; GFX6-NEXT: s_mov_b32 s8, s10
268 ; GFX6-NEXT: s_mov_b32 s9, s11
269 ; GFX6-NEXT: s_mov_b32 s10, s12
270 ; GFX6-NEXT: s_mov_b32 s11, s13
271 ; GFX6-NEXT: s_wqm_b64 exec, exec
272 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
273 ; GFX6-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
274 ; GFX6-NEXT: s_waitcnt vmcnt(0)
275 ; GFX6-NEXT: ; return to shader part epilog
277 ; GFX10NSA-LABEL: gather4_cl_2d:
278 ; GFX10NSA: ; %bb.0: ; %main_body
279 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
280 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
281 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
282 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
283 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
284 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
285 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
286 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
287 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
288 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
289 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
290 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
291 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
292 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
293 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
294 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
295 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
296 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %s, %t, %clamp.
298 %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .c.cl variant on a 2D image: %zcompare precedes the coordinates and
; %clamp follows them. The checks expect image_gather4_c_cl with a
; four-register address v[0:3], preceded by the s_wqm / exec sequence.
302 define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
303 ; GFX6-LABEL: gather4_c_cl_2d:
304 ; GFX6: ; %bb.0: ; %main_body
305 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
306 ; GFX6-NEXT: s_mov_b32 s0, s2
307 ; GFX6-NEXT: s_mov_b32 s1, s3
308 ; GFX6-NEXT: s_mov_b32 s2, s4
309 ; GFX6-NEXT: s_mov_b32 s3, s5
310 ; GFX6-NEXT: s_mov_b32 s4, s6
311 ; GFX6-NEXT: s_mov_b32 s5, s7
312 ; GFX6-NEXT: s_mov_b32 s6, s8
313 ; GFX6-NEXT: s_mov_b32 s7, s9
314 ; GFX6-NEXT: s_mov_b32 s8, s10
315 ; GFX6-NEXT: s_mov_b32 s9, s11
316 ; GFX6-NEXT: s_mov_b32 s10, s12
317 ; GFX6-NEXT: s_mov_b32 s11, s13
318 ; GFX6-NEXT: s_wqm_b64 exec, exec
319 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
320 ; GFX6-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
321 ; GFX6-NEXT: s_waitcnt vmcnt(0)
322 ; GFX6-NEXT: ; return to shader part epilog
324 ; GFX10NSA-LABEL: gather4_c_cl_2d:
325 ; GFX10NSA: ; %bb.0: ; %main_body
326 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
327 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
328 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
329 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
330 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
331 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
332 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
333 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
334 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
335 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
336 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
337 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
338 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
339 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
340 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
341 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
342 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
343 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %zcompare, %s, %t, %clamp.
345 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .b variant on a 2D image: an extra float %bias precedes the two
; coordinates, and the intrinsic name carries an additional .f32 type suffix.
; The checks expect image_gather4_b with a three-register address v[0:2],
; preceded by the s_wqm / exec and-with-saved-mask sequence.
349 define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
350 ; GFX6-LABEL: gather4_b_2d:
351 ; GFX6: ; %bb.0: ; %main_body
352 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
353 ; GFX6-NEXT: s_mov_b32 s0, s2
354 ; GFX6-NEXT: s_mov_b32 s1, s3
355 ; GFX6-NEXT: s_mov_b32 s2, s4
356 ; GFX6-NEXT: s_mov_b32 s3, s5
357 ; GFX6-NEXT: s_mov_b32 s4, s6
358 ; GFX6-NEXT: s_mov_b32 s5, s7
359 ; GFX6-NEXT: s_mov_b32 s6, s8
360 ; GFX6-NEXT: s_mov_b32 s7, s9
361 ; GFX6-NEXT: s_mov_b32 s8, s10
362 ; GFX6-NEXT: s_mov_b32 s9, s11
363 ; GFX6-NEXT: s_mov_b32 s10, s12
364 ; GFX6-NEXT: s_mov_b32 s11, s13
365 ; GFX6-NEXT: s_wqm_b64 exec, exec
366 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
367 ; GFX6-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
368 ; GFX6-NEXT: s_waitcnt vmcnt(0)
369 ; GFX6-NEXT: ; return to shader part epilog
371 ; GFX10NSA-LABEL: gather4_b_2d:
372 ; GFX10NSA: ; %bb.0: ; %main_body
373 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
374 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
375 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
376 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
377 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
378 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
379 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
380 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
381 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
382 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
383 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
384 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
385 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
386 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
387 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
388 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
389 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
390 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %bias, %s, %t.
392 %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .c.b variant on a 2D image: %bias and %zcompare precede the two
; coordinates. The checks expect image_gather4_c_b with a four-register
; address v[0:3], preceded by the s_wqm / exec and-with-saved-mask sequence.
396 define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
397 ; GFX6-LABEL: gather4_c_b_2d:
398 ; GFX6: ; %bb.0: ; %main_body
399 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
400 ; GFX6-NEXT: s_mov_b32 s0, s2
401 ; GFX6-NEXT: s_mov_b32 s1, s3
402 ; GFX6-NEXT: s_mov_b32 s2, s4
403 ; GFX6-NEXT: s_mov_b32 s3, s5
404 ; GFX6-NEXT: s_mov_b32 s4, s6
405 ; GFX6-NEXT: s_mov_b32 s5, s7
406 ; GFX6-NEXT: s_mov_b32 s6, s8
407 ; GFX6-NEXT: s_mov_b32 s7, s9
408 ; GFX6-NEXT: s_mov_b32 s8, s10
409 ; GFX6-NEXT: s_mov_b32 s9, s11
410 ; GFX6-NEXT: s_mov_b32 s10, s12
411 ; GFX6-NEXT: s_mov_b32 s11, s13
412 ; GFX6-NEXT: s_wqm_b64 exec, exec
413 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
414 ; GFX6-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
415 ; GFX6-NEXT: s_waitcnt vmcnt(0)
416 ; GFX6-NEXT: ; return to shader part epilog
418 ; GFX10NSA-LABEL: gather4_c_b_2d:
419 ; GFX10NSA: ; %bb.0: ; %main_body
420 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
421 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
422 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
423 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
424 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
425 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
426 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
427 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
428 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
429 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
430 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
431 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
432 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
433 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
434 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
435 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
436 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
437 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %bias, %zcompare, %s, %t.
439 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .b.cl variant on a 2D image: %bias precedes the coordinates and %clamp
; follows them. The checks expect image_gather4_b_cl with a four-register
; address v[0:3], preceded by the s_wqm / exec and-with-saved-mask sequence.
443 define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
444 ; GFX6-LABEL: gather4_b_cl_2d:
445 ; GFX6: ; %bb.0: ; %main_body
446 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
447 ; GFX6-NEXT: s_mov_b32 s0, s2
448 ; GFX6-NEXT: s_mov_b32 s1, s3
449 ; GFX6-NEXT: s_mov_b32 s2, s4
450 ; GFX6-NEXT: s_mov_b32 s3, s5
451 ; GFX6-NEXT: s_mov_b32 s4, s6
452 ; GFX6-NEXT: s_mov_b32 s5, s7
453 ; GFX6-NEXT: s_mov_b32 s6, s8
454 ; GFX6-NEXT: s_mov_b32 s7, s9
455 ; GFX6-NEXT: s_mov_b32 s8, s10
456 ; GFX6-NEXT: s_mov_b32 s9, s11
457 ; GFX6-NEXT: s_mov_b32 s10, s12
458 ; GFX6-NEXT: s_mov_b32 s11, s13
459 ; GFX6-NEXT: s_wqm_b64 exec, exec
460 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
461 ; GFX6-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
462 ; GFX6-NEXT: s_waitcnt vmcnt(0)
463 ; GFX6-NEXT: ; return to shader part epilog
465 ; GFX10NSA-LABEL: gather4_b_cl_2d:
466 ; GFX10NSA: ; %bb.0: ; %main_body
467 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
468 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
469 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
470 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
471 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
472 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
473 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
474 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
475 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
476 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
477 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
478 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
479 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
480 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
481 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
482 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
483 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
484 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %bias, %s, %t, %clamp.
486 %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .c.b.cl variant on a 2D image: %bias and %zcompare precede the
; coordinates and %clamp follows them. The checks expect image_gather4_c_b_cl
; with a five-register address v[0:4], preceded by the s_wqm / exec sequence.
490 define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
491 ; GFX6-LABEL: gather4_c_b_cl_2d:
492 ; GFX6: ; %bb.0: ; %main_body
493 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
494 ; GFX6-NEXT: s_mov_b32 s0, s2
495 ; GFX6-NEXT: s_mov_b32 s1, s3
496 ; GFX6-NEXT: s_mov_b32 s2, s4
497 ; GFX6-NEXT: s_mov_b32 s3, s5
498 ; GFX6-NEXT: s_mov_b32 s4, s6
499 ; GFX6-NEXT: s_mov_b32 s5, s7
500 ; GFX6-NEXT: s_mov_b32 s6, s8
501 ; GFX6-NEXT: s_mov_b32 s7, s9
502 ; GFX6-NEXT: s_mov_b32 s8, s10
503 ; GFX6-NEXT: s_mov_b32 s9, s11
504 ; GFX6-NEXT: s_mov_b32 s10, s12
505 ; GFX6-NEXT: s_mov_b32 s11, s13
506 ; GFX6-NEXT: s_wqm_b64 exec, exec
507 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
508 ; GFX6-NEXT: image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
509 ; GFX6-NEXT: s_waitcnt vmcnt(0)
510 ; GFX6-NEXT: ; return to shader part epilog
512 ; GFX10NSA-LABEL: gather4_c_b_cl_2d:
513 ; GFX10NSA: ; %bb.0: ; %main_body
514 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
515 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
516 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
517 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
518 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
519 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
520 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
521 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
522 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
523 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
524 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
525 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
526 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
527 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
528 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
529 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
530 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
531 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %bias, %zcompare, %s, %t, %clamp.
533 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .l variant on a 2D image: an extra float %lod follows the two
; coordinates. The checks expect image_gather4_l with a three-register
; address v[0:2]. The expected output contains no s_wqm or exec manipulation;
; only the s_mov_b32 copies into s[0:11] precede the image instruction.
537 define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
538 ; GFX6-LABEL: gather4_l_2d:
539 ; GFX6: ; %bb.0: ; %main_body
540 ; GFX6-NEXT: s_mov_b32 s0, s2
541 ; GFX6-NEXT: s_mov_b32 s1, s3
542 ; GFX6-NEXT: s_mov_b32 s2, s4
543 ; GFX6-NEXT: s_mov_b32 s3, s5
544 ; GFX6-NEXT: s_mov_b32 s4, s6
545 ; GFX6-NEXT: s_mov_b32 s5, s7
546 ; GFX6-NEXT: s_mov_b32 s6, s8
547 ; GFX6-NEXT: s_mov_b32 s7, s9
548 ; GFX6-NEXT: s_mov_b32 s8, s10
549 ; GFX6-NEXT: s_mov_b32 s9, s11
550 ; GFX6-NEXT: s_mov_b32 s10, s12
551 ; GFX6-NEXT: s_mov_b32 s11, s13
552 ; GFX6-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
553 ; GFX6-NEXT: s_waitcnt vmcnt(0)
554 ; GFX6-NEXT: ; return to shader part epilog
556 ; GFX10NSA-LABEL: gather4_l_2d:
557 ; GFX10NSA: ; %bb.0: ; %main_body
558 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
559 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
560 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
561 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
562 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
563 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
564 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
565 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
566 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
567 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
568 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
569 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
570 ; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
571 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
572 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %s, %t, %lod.
574 %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .c.l variant on a 2D image: %zcompare precedes the coordinates and %lod
; follows them. The checks expect image_gather4_c_l with a four-register
; address v[0:3]. The expected output contains no s_wqm or exec manipulation;
; only the s_mov_b32 copies into s[0:11] precede the image instruction.
578 define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
579 ; GFX6-LABEL: gather4_c_l_2d:
580 ; GFX6: ; %bb.0: ; %main_body
581 ; GFX6-NEXT: s_mov_b32 s0, s2
582 ; GFX6-NEXT: s_mov_b32 s1, s3
583 ; GFX6-NEXT: s_mov_b32 s2, s4
584 ; GFX6-NEXT: s_mov_b32 s3, s5
585 ; GFX6-NEXT: s_mov_b32 s4, s6
586 ; GFX6-NEXT: s_mov_b32 s5, s7
587 ; GFX6-NEXT: s_mov_b32 s6, s8
588 ; GFX6-NEXT: s_mov_b32 s7, s9
589 ; GFX6-NEXT: s_mov_b32 s8, s10
590 ; GFX6-NEXT: s_mov_b32 s9, s11
591 ; GFX6-NEXT: s_mov_b32 s10, s12
592 ; GFX6-NEXT: s_mov_b32 s11, s13
593 ; GFX6-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
594 ; GFX6-NEXT: s_waitcnt vmcnt(0)
595 ; GFX6-NEXT: ; return to shader part epilog
597 ; GFX10NSA-LABEL: gather4_c_l_2d:
598 ; GFX10NSA: ; %bb.0: ; %main_body
599 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
600 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
601 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
602 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
603 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
604 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
605 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
606 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
607 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
608 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
609 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
610 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
611 ; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
612 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
613 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %zcompare, %s, %t, %lod.
615 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .lz variant on a 2D image: only the two coordinates (%s, %t) are passed.
; The checks expect image_gather4_lz with a two-register address v[0:1].
; The expected output contains no s_wqm or exec manipulation; only the
; s_mov_b32 copies into s[0:11] precede the image instruction.
619 define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
620 ; GFX6-LABEL: gather4_lz_2d:
621 ; GFX6: ; %bb.0: ; %main_body
622 ; GFX6-NEXT: s_mov_b32 s0, s2
623 ; GFX6-NEXT: s_mov_b32 s1, s3
624 ; GFX6-NEXT: s_mov_b32 s2, s4
625 ; GFX6-NEXT: s_mov_b32 s3, s5
626 ; GFX6-NEXT: s_mov_b32 s4, s6
627 ; GFX6-NEXT: s_mov_b32 s5, s7
628 ; GFX6-NEXT: s_mov_b32 s6, s8
629 ; GFX6-NEXT: s_mov_b32 s7, s9
630 ; GFX6-NEXT: s_mov_b32 s8, s10
631 ; GFX6-NEXT: s_mov_b32 s9, s11
632 ; GFX6-NEXT: s_mov_b32 s10, s12
633 ; GFX6-NEXT: s_mov_b32 s11, s13
634 ; GFX6-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
635 ; GFX6-NEXT: s_waitcnt vmcnt(0)
636 ; GFX6-NEXT: ; return to shader part epilog
638 ; GFX10NSA-LABEL: gather4_lz_2d:
639 ; GFX10NSA: ; %bb.0: ; %main_body
640 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
641 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
642 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
643 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
644 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
645 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
646 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
647 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
648 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
649 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
650 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
651 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
652 ; GFX10NSA-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
653 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
654 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %s, %t.
656 %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; The .c.lz variant on a 2D image: %zcompare precedes the two coordinates.
; The checks expect image_gather4_c_lz with a three-register address v[0:2].
; The expected output contains no s_wqm or exec manipulation; only the
; s_mov_b32 copies into s[0:11] precede the image instruction.
660 define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
661 ; GFX6-LABEL: gather4_c_lz_2d:
662 ; GFX6: ; %bb.0: ; %main_body
663 ; GFX6-NEXT: s_mov_b32 s0, s2
664 ; GFX6-NEXT: s_mov_b32 s1, s3
665 ; GFX6-NEXT: s_mov_b32 s2, s4
666 ; GFX6-NEXT: s_mov_b32 s3, s5
667 ; GFX6-NEXT: s_mov_b32 s4, s6
668 ; GFX6-NEXT: s_mov_b32 s5, s7
669 ; GFX6-NEXT: s_mov_b32 s6, s8
670 ; GFX6-NEXT: s_mov_b32 s7, s9
671 ; GFX6-NEXT: s_mov_b32 s8, s10
672 ; GFX6-NEXT: s_mov_b32 s9, s11
673 ; GFX6-NEXT: s_mov_b32 s10, s12
674 ; GFX6-NEXT: s_mov_b32 s11, s13
675 ; GFX6-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
676 ; GFX6-NEXT: s_waitcnt vmcnt(0)
677 ; GFX6-NEXT: ; return to shader part epilog
679 ; GFX10NSA-LABEL: gather4_c_lz_2d:
680 ; GFX10NSA: ; %bb.0: ; %main_body
681 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
682 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
683 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
684 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
685 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
686 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
687 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
688 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
689 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
690 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
691 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
692 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
693 ; GFX10NSA-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
694 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
695 ; GFX10NSA-NEXT: ; return to shader part epilog
; Float operands are passed in the order %zcompare, %s, %t.
697 %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; Same 2D gather4 shape as the basic test, but the intrinsic's leading i32
; operand is 2. The checks expect dmask:0x2 on the image_gather4 instruction,
; still writing four result registers v[0:3].
701 define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
702 ; GFX6-LABEL: gather4_2d_dmask_2:
703 ; GFX6: ; %bb.0: ; %main_body
704 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
705 ; GFX6-NEXT: s_mov_b32 s0, s2
706 ; GFX6-NEXT: s_mov_b32 s1, s3
707 ; GFX6-NEXT: s_mov_b32 s2, s4
708 ; GFX6-NEXT: s_mov_b32 s3, s5
709 ; GFX6-NEXT: s_mov_b32 s4, s6
710 ; GFX6-NEXT: s_mov_b32 s5, s7
711 ; GFX6-NEXT: s_mov_b32 s6, s8
712 ; GFX6-NEXT: s_mov_b32 s7, s9
713 ; GFX6-NEXT: s_mov_b32 s8, s10
714 ; GFX6-NEXT: s_mov_b32 s9, s11
715 ; GFX6-NEXT: s_mov_b32 s10, s12
716 ; GFX6-NEXT: s_mov_b32 s11, s13
717 ; GFX6-NEXT: s_wqm_b64 exec, exec
718 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
719 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2
720 ; GFX6-NEXT: s_waitcnt vmcnt(0)
721 ; GFX6-NEXT: ; return to shader part epilog
723 ; GFX10NSA-LABEL: gather4_2d_dmask_2:
724 ; GFX10NSA: ; %bb.0: ; %main_body
725 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
726 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
727 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
728 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
729 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
730 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
731 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
732 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
733 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
734 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
735 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
736 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
737 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
738 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
739 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
740 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D
741 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
742 ; GFX10NSA-NEXT: ; return to shader part epilog
; The leading i32 2 lines up with the dmask:0x2 in the expected instruction.
744 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; Same 2D gather4 shape as the basic test, but the intrinsic's leading i32
; operand is 4. The checks expect dmask:0x4 on the image_gather4 instruction,
; still writing four result registers v[0:3].
748 define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
749 ; GFX6-LABEL: gather4_2d_dmask_4:
750 ; GFX6: ; %bb.0: ; %main_body
751 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
752 ; GFX6-NEXT: s_mov_b32 s0, s2
753 ; GFX6-NEXT: s_mov_b32 s1, s3
754 ; GFX6-NEXT: s_mov_b32 s2, s4
755 ; GFX6-NEXT: s_mov_b32 s3, s5
756 ; GFX6-NEXT: s_mov_b32 s4, s6
757 ; GFX6-NEXT: s_mov_b32 s5, s7
758 ; GFX6-NEXT: s_mov_b32 s6, s8
759 ; GFX6-NEXT: s_mov_b32 s7, s9
760 ; GFX6-NEXT: s_mov_b32 s8, s10
761 ; GFX6-NEXT: s_mov_b32 s9, s11
762 ; GFX6-NEXT: s_mov_b32 s10, s12
763 ; GFX6-NEXT: s_mov_b32 s11, s13
764 ; GFX6-NEXT: s_wqm_b64 exec, exec
765 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
766 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4
767 ; GFX6-NEXT: s_waitcnt vmcnt(0)
768 ; GFX6-NEXT: ; return to shader part epilog
770 ; GFX10NSA-LABEL: gather4_2d_dmask_4:
771 ; GFX10NSA: ; %bb.0: ; %main_body
772 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
773 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
774 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
775 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
776 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
777 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
778 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
779 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
780 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
781 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
782 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
783 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
784 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
785 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
786 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
787 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D
788 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
789 ; GFX10NSA-NEXT: ; return to shader part epilog
; The leading i32 4 lines up with the dmask:0x4 in the expected instruction.
791 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; gather4_2d_dmask_8: amdgpu_ps test of the 2D image gather4 intrinsic with the
; leading i32 immarg operand set to 8. The expected output for both RUN
; configurations (the tahiti and gfx1010 check prefixes) selects image_gather4
; with dmask 0x8; the gfx1010 form additionally carries dim:SQ_RSRC_IMG_2D.
; The assertion lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
795 define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
796 ; GFX6-LABEL: gather4_2d_dmask_8:
797 ; GFX6: ; %bb.0: ; %main_body
798 ; GFX6-NEXT: s_mov_b64 s[14:15], exec
799 ; GFX6-NEXT: s_mov_b32 s0, s2
800 ; GFX6-NEXT: s_mov_b32 s1, s3
801 ; GFX6-NEXT: s_mov_b32 s2, s4
802 ; GFX6-NEXT: s_mov_b32 s3, s5
803 ; GFX6-NEXT: s_mov_b32 s4, s6
804 ; GFX6-NEXT: s_mov_b32 s5, s7
805 ; GFX6-NEXT: s_mov_b32 s6, s8
806 ; GFX6-NEXT: s_mov_b32 s7, s9
807 ; GFX6-NEXT: s_mov_b32 s8, s10
808 ; GFX6-NEXT: s_mov_b32 s9, s11
809 ; GFX6-NEXT: s_mov_b32 s10, s12
810 ; GFX6-NEXT: s_mov_b32 s11, s13
811 ; GFX6-NEXT: s_wqm_b64 exec, exec
812 ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
813 ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8
814 ; GFX6-NEXT: s_waitcnt vmcnt(0)
815 ; GFX6-NEXT: ; return to shader part epilog
817 ; GFX10NSA-LABEL: gather4_2d_dmask_8:
818 ; GFX10NSA: ; %bb.0: ; %main_body
819 ; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo
820 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
821 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
822 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1
823 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
824 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
825 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
826 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
827 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
828 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
829 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
830 ; GFX10NSA-NEXT: s_mov_b32 s8, s10
831 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
832 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
833 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
834 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D
835 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
836 ; GFX10NSA-NEXT: ; return to shader part epilog
838 %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; Declarations of the llvm.amdgcn.image.gather4.* intrinsic variants (2d, cube,
; 2darray, plus the c / cl / b / l / lz forms on 2d). Every signature begins
; with an i32 immarg, is followed by the float operands, then an <8 x i32> and
; a <4 x i32> operand, and ends with i1, i32, i32 immargs. All return
; <4 x float> except the sl_v4f32i32s variant, which returns
; { <4 x float>, i32 }. All declarations use attribute group #0.
842 declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
843 declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
844 declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
845 declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
846 declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
847 declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
848 declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
849 declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
850 declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
851 declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
852 declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
853 declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
854 declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
855 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
856 declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
; Attribute group #0, referenced by each intrinsic declaration above.
858 attributes #0 = { nounwind readonly }