1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,GFX9
6 ; XXX - Why the packing?
define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: scalar_to_vector_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
  %tmp1 = load i32, ptr addrspace(1) %in, align 4
  %bc = bitcast i32 %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8
  ret void
}
define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: scalar_to_vector_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v2f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
  %tmp1 = load float, ptr addrspace(1) %in, align 4
  %bc = bitcast float %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8
  ret void
}
define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-LABEL: scalar_to_vector_v4i16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v0, v3
; SI-NEXT:    v_or_b32_e32 v0, v2, v3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4i16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v0
; VI-NEXT:    s_lshl_b32 s1, s0, 8
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    s_lshl_b32 s1, s0, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v4i16:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    s_lshl_b32 s1, s0, 8
; GFX9-NEXT:    s_or_b32 s0, s0, s1
; GFX9-NEXT:    s_and_b32 s1, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
; GFX9-NEXT:    s_or_b32 s0, s1, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
bb:
  %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, ptr addrspace(1) undef, align 8
  ret void
}
define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-LABEL: scalar_to_vector_v4f16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v0, v3
; SI-NEXT:    v_or_b32_e32 v0, v2, v3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4f16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v0
; VI-NEXT:    s_lshl_b32 s1, s0, 8
; VI-NEXT:    s_or_b32 s0, s1, s0
; VI-NEXT:    s_and_b32 s1, s0, 0xff00
; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
; VI-NEXT:    s_or_b32 s1, s4, s1
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_lshl_b32 s4, s1, 16
; VI-NEXT:    s_and_b32 s1, s1, 0xffff
; VI-NEXT:    s_or_b32 s1, s1, s4
; VI-NEXT:    s_or_b32 s0, s0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v4f16:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    s_lshl_b32 s1, s0, 8
; GFX9-NEXT:    s_or_b32 s0, s1, s0
; GFX9-NEXT:    s_and_b32 s1, s0, 0xff00
; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x80008
; GFX9-NEXT:    s_or_b32 s1, s4, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s4, s1, 0xffff
; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
; GFX9-NEXT:    s_or_b32 s4, s4, s1
; GFX9-NEXT:    s_or_b32 s0, s0, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
bb:
  %load = load half, ptr addrspace(1) undef, align 1
  %tmp = bitcast half %load to <2 x i8>
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, ptr addrspace(1) undef, align 8
  ret void
}
270 ; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
271 ; to produce one, but for some reason never made it to selection.
274 ; define amdgpu_kernel void @scalar_to_vector_test2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
275 ; %tmp1 = load i32, ptr addrspace(1) %in, align 4
276 ; %bc = bitcast i32 %tmp1 to <4 x i8>
278 ; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;   store <8 x i8> %tmp2, ptr addrspace(1) %out, align 4
;   ret void
; }
283 ; define amdgpu_kernel void @scalar_to_vector_test3(ptr addrspace(1) %out) nounwind {
284 ; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
285 ; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
286 ; %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
287 ; %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
;   store <4 x i32> %add, ptr addrspace(1) %out, align 16
;   ret void
; }
292 ; define amdgpu_kernel void @scalar_to_vector_test4(ptr addrspace(1) %out) nounwind {
293 ; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
294 ; %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
295 ; %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
;   store <8 x i16> %add, ptr addrspace(1) %out, align 16
;   ret void
; }
300 ; define amdgpu_kernel void @scalar_to_vector_test5(ptr addrspace(1) %out) nounwind {
301 ; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
302 ; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
303 ; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
;   store <4 x i16> %add, ptr addrspace(1) %out, align 16
;   ret void
; }
define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind {
; SI-LABEL: scalar_to_vector_test6:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: scalar_to_vector_test6:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s3, 0xf000
; GFX89-NEXT:    s_mov_b32 s2, -1
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    v_mov_b32_e32 v0, s6
; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT:    s_endpgm
  %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
  %bc = bitcast <4 x i8> %newvec0 to <2 x half>
  store <2 x half> %bc, ptr addrspace(1) %out
  ret void
}
336 ; bitcast (scalar_to_vector x) -> any_extend x
337 define i64 @bitcast_combine_scalar_to_vector_v4i16(i16 %arg) {
338 ; SI-LABEL: bitcast_combine_scalar_to_vector_v4i16:
340 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
342 ; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
343 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
344 ; SI-NEXT: v_or_b32_e32 v2, v0, v2
345 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
346 ; SI-NEXT: v_or_b32_e32 v0, v1, v3
347 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
348 ; SI-NEXT: s_setpc_b64 s[30:31]
350 ; GFX89-LABEL: bitcast_combine_scalar_to_vector_v4i16:
352 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353 ; GFX89-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
354 ; GFX89-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
355 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 16, v1
356 ; GFX89-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
357 ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
358 ; GFX89-NEXT: s_setpc_b64 s[30:31]
359 %arg.cast = bitcast i16 %arg to <2 x i8>
360 %tmp1 = shufflevector <2 x i8> %arg.cast, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
361 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
362 %cast = bitcast <8 x i8> %tmp2 to i64