; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI

define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s0, s0, s1
; SI-NEXT: s_add_i32 s0, s0, s2
; SI-NEXT: s_ashr_i32 s1, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, s2
; VI-NEXT: s_ashr_i32 s1, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ashr_i32 s1, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s1, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so that we can
; verify it is selected correctly. In s_sext_i1_to_i16, the sign_extend
; node is optimized to a select very early.
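; For reference, a sketch of that early fold (an editorial illustration, not
; compiler output):
;   %sext = sext i1 %cmp to i16
; is equivalent to, and is combined into,
;   %sext = select i1 %cmp, i16 -1, i16 0
; which selects directly to the v_cndmask_b32_e64 v0, 0, -1, vcc seen in
; s_sext_i1_to_i16 above. The extra 'and' below keeps the sign_extend node
; alive until instruction selection.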
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize this:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
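; A possible folded form (an editorial sketch, not verified compiler output)
; would shift and sign-extend at i32, keeping the value in a scalar register:
;   t55': i32 = srl t10, Constant:i32<8>
;   t64': i32 = sign_extend_inreg t55', ValueType:ch:i8
; which could then select to s_bfe_i32, as the SI output of the function
; below does, instead of v_bfe_i32.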
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ashr_i32 s1, s0, 24
; SI-NEXT: s_bfe_i32 s2, s0, 0x80010
; SI-NEXT: s_bfe_i32 s3, s0, 0x80008
; SI-NEXT: s_sext_i32_i8 s0, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0
; VI-NEXT: s_ashr_i32 s1, s0, 24
; VI-NEXT: s_bfe_i32 s2, s0, 0x80010
; VI-NEXT: s_sext_i32_i8 s0, s0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: need to optimize same sequence as above test to avoid
; this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
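; (Presumably the suggested form: s_bfe_i64 with operand 0x100000, i.e.
; offset 0 and width 16, sign-extends the low 16 bits of a scalar pair in a
; single instruction, as s_sext_i16_to_i64 above already does.)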
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 48
; SI-NEXT: s_ashr_i32 s5, s6, 16
; SI-NEXT: s_sext_i32_i16 s6, s6
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: s_sext_i32_i16 s7, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_ashr_i32 s5, s6, 16
; VI-NEXT: s_sext_i32_i16 s6, s6
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_ashr_i32 s4, s7, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: s_sext_i32_i16 s7, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }