; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI
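
; The check lines below can be regenerated by rerunning the script named in
; the NOTE above on this file, e.g.:
;   llvm/utils/update_llc_test_checks.py <path/to/this/test.ll>
; Note that sext i1 yields 0 or -1 (all ones), which is why the i1 tests
; below select v_cndmask_b32 with the source operands 0 and -1.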
define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s1, s2, s3
; SI-NEXT:    s_add_i32 s1, s1, s0
; SI-NEXT:    s_ashr_i32 s0, s1, 31
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s1, s2, s3
; VI-NEXT:    s_add_i32 s1, s1, s0
; VI-NEXT:    s_ashr_i32 s0, s1, 31
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s1, s0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s1, s0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so that we can
; check it is selected correctly.  In s_sext_i1_to_i16, the sign_extend
; node is instead optimized to a select very early.
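; A rough sketch of the difference (node numbers are hypothetical):
;   s_sext_i1_to_i16:           t5: i16 = select t4, Constant:i16<-1>, Constant:i16<0>
;   s_sext_i1_to_i16_with_and:  t8: i16 = sign_extend t7
;                               t7: i1 = and t3, t6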
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register.  To fix this we need to be able to
; optimize the following sequence:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
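; Ideally VI would match the scalar form that SI selects for this element
; (see the SI checks below):
;   s_bfe_i32 s3, s0, 0x80008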
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s1, s0, 24
; SI-NEXT:    s_bfe_i32 s2, s0, 0x80010
; SI-NEXT:    s_bfe_i32 s3, s0, 0x80008
; SI-NEXT:    s_sext_i32_i8 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s0
; VI-NEXT:    s_ashr_i32 s1, s0, 24
; VI-NEXT:    s_bfe_i32 s2, s0, 0x80010
; VI-NEXT:    s_sext_i32_i8 s0, s0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: need to optimize the same sequence as in the above test to avoid
; selecting the v_lshrrev_b16/v_bfe pair here as well.
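; The sequence in question, from the VI checks below:
;   v_lshrrev_b16_e32 v1, 8, v0
;   v_bfe_i32 v1, v1, 0, 8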
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
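; For comparison, s_sext_i16_to_i64 above already selects the scalar form:
;   s_bfe_i64 s[0:1], s[0:1], 0x100000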
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 48
; SI-NEXT:    s_ashr_i32 s5, s6, 16
; SI-NEXT:    s_sext_i32_i16 s6, s6
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    s_sext_i32_i16 s7, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_ashr_i32 s5, s6, 16
; VI-NEXT:    s_sext_i32_i16 s6, s6
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_ashr_i32 s4, s7, 16
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_sext_i32_i16 s7, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }