; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI

define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mul_i32 s4, s6, s7
; SI-NEXT:    s_add_i32 s4, s4, s8
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s8, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mul_i32 s4, s6, s7
; VI-NEXT:    s_add_i32 s4, s4, s8
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer to make sure
; we select this correctly.  In the s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s4, s5
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_cmp_eq_u32 s6, s7
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, s5
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s6, s7
; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_cmp_eq_u32 s7, s8
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v0
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s8, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_cmp_eq_u32 s7, s8
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v0
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 24
; SI-NEXT:    s_bfe_i32 s6, s4, 0x80010
; SI-NEXT:    s_bfe_i32 s7, s4, 0x80008
; SI-NEXT:    s_sext_i32_i8 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s4
; VI-NEXT:    s_ashr_i32 s5, s4, 24
; VI-NEXT:    s_bfe_i32 s6, s4, 0x80010
; VI-NEXT:    s_sext_i32_i8 s4, s4
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: need to optimize same sequence as above test to avoid
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
; SI-NEXT:    s_ashr_i32 s1, s2, 16
; SI-NEXT:    s_sext_i32_i16 s2, s2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_sext_i32_i16 s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_ashr_i32 s1, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_ashr_i32 s0, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, ptr addrspace(1) %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }