; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI

define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, ptr addrspace(1) %out, align 4
  ret void
}

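; sext i32 -> i64 of a computed value: per the checks below, the high dword
; should come from an s_ashr_i32 of the low dword by 31.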
define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s0, s0, s1
; SI-NEXT:    s_add_i32 s0, s0, s2
; SI-NEXT:    s_ashr_i32 s1, s0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s0, s0, s1
; VI-NEXT:    s_add_i32 s0, s0, s2
; VI-NEXT:    s_ashr_i32 s1, s0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

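; sext i1 -> i64: both halves of the result are the same all-ones/zero mask,
; so the low dword is simply copied into the high dword (v_mov_b32 v1, v0).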
define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s4, s6, 31
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s4, s6, 31
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

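; Same extension as s_sext_i32_to_i64, but of a divergent (loaded) value, so
; the high dword should come from a VALU v_ashrrev_i32 instead.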
define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

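; sext i16 -> i64 selects to a single s_bfe_i64. In the 0x100000 src1
; operand, bits [22:16] encode the field width (16) and bits [5:0] the
; offset (0).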
define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

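; sext i1 -> i16 should use the same 0/-1 v_cndmask as the i32 case; only
; the store changes, to a buffer_store_short.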
define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer so we can verify
; that it selects correctly. In s_sext_i1_to_i16, the sign_extend node is
; optimized to a select very early.
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s0, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s0, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

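; Here one operand of the and is divergent, so that compare has to go
; through a VALU v_cmp into vcc before being ANDed with the scalar
; s_cselect result.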
define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s1, s2
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s1, s2
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize the following sequence:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s4, s6, 24
; SI-NEXT:    s_bfe_i32 s5, s6, 0x80010
; SI-NEXT:    s_bfe_i32 s7, s6, 0x80008
; SI-NEXT:    s_sext_i32_i8 s6, s6
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s4, s6, 24
; VI-NEXT:    s_bfe_i32 s5, s6, 0x80010
; VI-NEXT:    s_bfe_i32 s7, s6, 0x80008
; VI-NEXT:    s_sext_i32_i8 s6, s6
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: need to optimize the same sequence as in the above test to avoid the same issue.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; VI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
; SI-NEXT:    s_ashr_i32 s1, s2, 16
; SI-NEXT:    s_sext_i32_i16 s2, s2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_sext_i32_i16 s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_ashr_i32 s1, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_ashr_i32 s0, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

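; Note the checks below: SI extracts the high element with a 64-bit
; v_ashr_i64 by 48, while VI uses 32-bit v_ashrrev_i32 shifts.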
define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, ptr addrspace(1) %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }