1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s
4 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
6 ; RUN: llc < %s -mtriple=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
; Kernel taking a plain i8 argument (no zeroext/signext attribute).
; The IR zero-extends %in to i32 and stores the result to %out.
; The amdgcn expectations below load the argument as a dword and mask it
; with 0xff; the r600 expectations fetch it with VTX_READ_8.
8 define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
11 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
12 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
13 ; SI-NEXT: s_mov_b32 s3, 0xf000
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_and_b32 s4, s2, 0xff
16 ; SI-NEXT: s_mov_b32 s2, -1
17 ; SI-NEXT: v_mov_b32_e32 v0, s4
18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
23 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
25 ; VI-NEXT: s_waitcnt lgkmcnt(0)
26 ; VI-NEXT: s_and_b32 s2, s2, 0xff
27 ; VI-NEXT: v_mov_b32_e32 v0, s0
28 ; VI-NEXT: v_mov_b32_e32 v1, s1
29 ; VI-NEXT: v_mov_b32_e32 v2, s2
30 ; VI-NEXT: flat_store_dword v[0:1], v2
35 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
36 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
37 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
38 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
40 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
41 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
46 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
48 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
49 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
52 ; EG-NEXT: Fetch clause starting at 6:
53 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
54 ; EG-NEXT: ALU clause starting at 8:
55 ; EG-NEXT: MOV * T0.X, 0.0,
56 ; EG-NEXT: ALU clause starting at 9:
57 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
58 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
62 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
64 ; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
65 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
68 ; CM-NEXT: Fetch clause starting at 6:
69 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
70 ; CM-NEXT: ALU clause starting at 8:
71 ; CM-NEXT: MOV * T0.X, 0.0,
72 ; CM-NEXT: ALU clause starting at 9:
73 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
74 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- widen the byte argument to 32 bits, then store it.
75 %ext = zext i8 %in to i32
76 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel whose i8 argument carries the zeroext attribute.
; The IR zero-extends %in to i32 and stores the result to %out.
; The amdgcn expectations mask the loaded dword with 0xff; the r600
; expectations (redwood and cayman runs) contain a BFE_INT using the literal 8.
80 define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind {
81 ; SI-LABEL: i8_zext_arg:
83 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
84 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
85 ; SI-NEXT: s_mov_b32 s3, 0xf000
86 ; SI-NEXT: s_waitcnt lgkmcnt(0)
87 ; SI-NEXT: s_and_b32 s4, s2, 0xff
88 ; SI-NEXT: s_mov_b32 s2, -1
89 ; SI-NEXT: v_mov_b32_e32 v0, s4
90 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
93 ; VI-LABEL: i8_zext_arg:
95 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
96 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
97 ; VI-NEXT: s_waitcnt lgkmcnt(0)
98 ; VI-NEXT: s_and_b32 s2, s2, 0xff
99 ; VI-NEXT: v_mov_b32_e32 v0, s0
100 ; VI-NEXT: v_mov_b32_e32 v1, s1
101 ; VI-NEXT: v_mov_b32_e32 v2, s2
102 ; VI-NEXT: flat_store_dword v[0:1], v2
105 ; GFX9-LABEL: i8_zext_arg:
107 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
108 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
109 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
112 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
113 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
114 ; GFX9-NEXT: s_endpgm
116 ; EG-LABEL: i8_zext_arg:
118 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
120 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
121 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
124 ; EG-NEXT: Fetch clause starting at 6:
125 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
126 ; EG-NEXT: ALU clause starting at 8:
127 ; EG-NEXT: MOV * T0.X, 0.0,
128 ; EG-NEXT: ALU clause starting at 9:
129 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
130 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
131 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
133 ; CM-LABEL: i8_zext_arg:
135 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
137 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
138 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
141 ; CM-NEXT: Fetch clause starting at 6:
142 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
143 ; CM-NEXT: ALU clause starting at 8:
144 ; CM-NEXT: MOV * T0.X, 0.0,
145 ; CM-NEXT: ALU clause starting at 9:
146 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
147 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
148 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
149 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- zero-extend the zeroext byte argument and store it.
150 %ext = zext i8 %in to i32
151 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel whose i8 argument carries the signext attribute.
; The IR sign-extends %in to i32 and stores the result to %out.
; The amdgcn expectations use s_sext_i32_i8 on the loaded dword; the r600
; expectations contain a BFE_INT using the literal 8.
155 define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind {
156 ; SI-LABEL: i8_sext_arg:
158 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
159 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
160 ; SI-NEXT: s_mov_b32 s3, 0xf000
161 ; SI-NEXT: s_waitcnt lgkmcnt(0)
162 ; SI-NEXT: s_sext_i32_i8 s4, s2
163 ; SI-NEXT: s_mov_b32 s2, -1
164 ; SI-NEXT: v_mov_b32_e32 v0, s4
165 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
168 ; VI-LABEL: i8_sext_arg:
170 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
171 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
173 ; VI-NEXT: s_sext_i32_i8 s2, s2
174 ; VI-NEXT: v_mov_b32_e32 v0, s0
175 ; VI-NEXT: v_mov_b32_e32 v1, s1
176 ; VI-NEXT: v_mov_b32_e32 v2, s2
177 ; VI-NEXT: flat_store_dword v[0:1], v2
180 ; GFX9-LABEL: i8_sext_arg:
182 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
183 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
184 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
185 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
186 ; GFX9-NEXT: s_sext_i32_i8 s2, s2
187 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
188 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
189 ; GFX9-NEXT: s_endpgm
191 ; EG-LABEL: i8_sext_arg:
193 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
195 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
196 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
199 ; EG-NEXT: Fetch clause starting at 6:
200 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
201 ; EG-NEXT: ALU clause starting at 8:
202 ; EG-NEXT: MOV * T0.X, 0.0,
203 ; EG-NEXT: ALU clause starting at 9:
204 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
205 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
206 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
208 ; CM-LABEL: i8_sext_arg:
210 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
212 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
213 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
216 ; CM-NEXT: Fetch clause starting at 6:
217 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
218 ; CM-NEXT: ALU clause starting at 8:
219 ; CM-NEXT: MOV * T0.X, 0.0,
220 ; CM-NEXT: ALU clause starting at 9:
221 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
222 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
223 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
224 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- sign-extend the signext byte argument and store it.
225 %ext = sext i8 %in to i32
226 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking a plain i16 argument (no zeroext/signext attribute).
; The IR zero-extends %in to i32 and stores the result to %out.
; The amdgcn expectations mask the loaded dword with 0xffff; the r600
; expectations fetch the argument with VTX_READ_16.
230 define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind {
233 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
234 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
235 ; SI-NEXT: s_mov_b32 s3, 0xf000
236 ; SI-NEXT: s_waitcnt lgkmcnt(0)
237 ; SI-NEXT: s_and_b32 s4, s2, 0xffff
238 ; SI-NEXT: s_mov_b32 s2, -1
239 ; SI-NEXT: v_mov_b32_e32 v0, s4
240 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
245 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
246 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
247 ; VI-NEXT: s_waitcnt lgkmcnt(0)
248 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
249 ; VI-NEXT: v_mov_b32_e32 v0, s0
250 ; VI-NEXT: v_mov_b32_e32 v1, s1
251 ; VI-NEXT: v_mov_b32_e32 v2, s2
252 ; VI-NEXT: flat_store_dword v[0:1], v2
255 ; GFX9-LABEL: i16_arg:
257 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
258 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
259 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
262 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
263 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
264 ; GFX9-NEXT: s_endpgm
268 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
270 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
271 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
274 ; EG-NEXT: Fetch clause starting at 6:
275 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
276 ; EG-NEXT: ALU clause starting at 8:
277 ; EG-NEXT: MOV * T0.X, 0.0,
278 ; EG-NEXT: ALU clause starting at 9:
279 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
280 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
284 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
286 ; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
287 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
290 ; CM-NEXT: Fetch clause starting at 6:
291 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
292 ; CM-NEXT: ALU clause starting at 8:
293 ; CM-NEXT: MOV * T0.X, 0.0,
294 ; CM-NEXT: ALU clause starting at 9:
295 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
296 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- widen the 16-bit argument to 32 bits, then store it.
297 %ext = zext i16 %in to i32
298 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel whose i16 argument carries the zeroext attribute.
; The IR zero-extends %in to i32 and stores the result to %out.
; The amdgcn expectations mask the loaded dword with 0xffff; the r600
; expectations contain a BFE_INT using the literal 16.
302 define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind {
303 ; SI-LABEL: i16_zext_arg:
305 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
306 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
307 ; SI-NEXT: s_mov_b32 s3, 0xf000
308 ; SI-NEXT: s_waitcnt lgkmcnt(0)
309 ; SI-NEXT: s_and_b32 s4, s2, 0xffff
310 ; SI-NEXT: s_mov_b32 s2, -1
311 ; SI-NEXT: v_mov_b32_e32 v0, s4
312 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
315 ; VI-LABEL: i16_zext_arg:
317 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
318 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
319 ; VI-NEXT: s_waitcnt lgkmcnt(0)
320 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
321 ; VI-NEXT: v_mov_b32_e32 v0, s0
322 ; VI-NEXT: v_mov_b32_e32 v1, s1
323 ; VI-NEXT: v_mov_b32_e32 v2, s2
324 ; VI-NEXT: flat_store_dword v[0:1], v2
327 ; GFX9-LABEL: i16_zext_arg:
329 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
330 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
331 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
332 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
334 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
335 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
336 ; GFX9-NEXT: s_endpgm
338 ; EG-LABEL: i16_zext_arg:
340 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
342 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
343 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
346 ; EG-NEXT: Fetch clause starting at 6:
347 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
348 ; EG-NEXT: ALU clause starting at 8:
349 ; EG-NEXT: MOV * T0.X, 0.0,
350 ; EG-NEXT: ALU clause starting at 9:
351 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
352 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
353 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
355 ; CM-LABEL: i16_zext_arg:
357 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
359 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
360 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
363 ; CM-NEXT: Fetch clause starting at 6:
364 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
365 ; CM-NEXT: ALU clause starting at 8:
366 ; CM-NEXT: MOV * T0.X, 0.0,
367 ; CM-NEXT: ALU clause starting at 9:
368 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
369 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
370 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
371 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- zero-extend the zeroext 16-bit argument and store it.
372 %ext = zext i16 %in to i32
373 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel whose i16 argument carries the signext attribute.
; The IR sign-extends %in to i32 and stores the result to %out.
; The amdgcn expectations use s_sext_i32_i16 on the loaded dword; the r600
; expectations contain a BFE_INT using the literal 16.
377 define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind {
378 ; SI-LABEL: i16_sext_arg:
380 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
381 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
382 ; SI-NEXT: s_mov_b32 s3, 0xf000
383 ; SI-NEXT: s_waitcnt lgkmcnt(0)
384 ; SI-NEXT: s_sext_i32_i16 s4, s2
385 ; SI-NEXT: s_mov_b32 s2, -1
386 ; SI-NEXT: v_mov_b32_e32 v0, s4
387 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
390 ; VI-LABEL: i16_sext_arg:
392 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
393 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
394 ; VI-NEXT: s_waitcnt lgkmcnt(0)
395 ; VI-NEXT: s_sext_i32_i16 s2, s2
396 ; VI-NEXT: v_mov_b32_e32 v0, s0
397 ; VI-NEXT: v_mov_b32_e32 v1, s1
398 ; VI-NEXT: v_mov_b32_e32 v2, s2
399 ; VI-NEXT: flat_store_dword v[0:1], v2
402 ; GFX9-LABEL: i16_sext_arg:
404 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
405 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
406 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
407 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
408 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
409 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
410 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
411 ; GFX9-NEXT: s_endpgm
413 ; EG-LABEL: i16_sext_arg:
415 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
417 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
418 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
421 ; EG-NEXT: Fetch clause starting at 6:
422 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
423 ; EG-NEXT: ALU clause starting at 8:
424 ; EG-NEXT: MOV * T0.X, 0.0,
425 ; EG-NEXT: ALU clause starting at 9:
426 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
427 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
428 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
430 ; CM-LABEL: i16_sext_arg:
432 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
434 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
435 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
438 ; CM-NEXT: Fetch clause starting at 6:
439 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
440 ; CM-NEXT: ALU clause starting at 8:
441 ; CM-NEXT: MOV * T0.X, 0.0,
442 ; CM-NEXT: ALU clause starting at 9:
443 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
444 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
445 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
446 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- sign-extend the signext 16-bit argument and store it.
447 %ext = sext i16 %in to i32
448 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking an i32 argument and storing it unchanged to %out.
; No extension or masking appears in the expectations -- the amdgcn output
; copies the loaded scalar into a vector register and emits one dword store,
; and the r600 output moves KC0[2].Z into the stored register.
452 define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind {
454 ; SI: ; %bb.0: ; %entry
455 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
456 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
457 ; SI-NEXT: s_mov_b32 s3, 0xf000
458 ; SI-NEXT: s_mov_b32 s2, -1
459 ; SI-NEXT: s_waitcnt lgkmcnt(0)
460 ; SI-NEXT: v_mov_b32_e32 v0, s6
461 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
465 ; VI: ; %bb.0: ; %entry
466 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
467 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
468 ; VI-NEXT: s_waitcnt lgkmcnt(0)
469 ; VI-NEXT: v_mov_b32_e32 v0, s0
470 ; VI-NEXT: v_mov_b32_e32 v1, s1
471 ; VI-NEXT: v_mov_b32_e32 v2, s2
472 ; VI-NEXT: flat_store_dword v[0:1], v2
475 ; GFX9-LABEL: i32_arg:
476 ; GFX9: ; %bb.0: ; %entry
477 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
478 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
479 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
480 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
482 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
483 ; GFX9-NEXT: s_endpgm
486 ; EG: ; %bb.0: ; %entry
487 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
488 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
491 ; EG-NEXT: ALU clause starting at 4:
492 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
493 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
494 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
497 ; CM: ; %bb.0: ; %entry
498 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
499 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
502 ; CM-NEXT: ALU clause starting at 4:
503 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
504 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
505 ; CM-NEXT: MOV * T1.X, KC0[2].Z,
; IR under test -- store the 32-bit integer argument as-is.
507 store i32 %in, ptr addrspace(1) %out, align 4
; Kernel taking a float argument and storing it unchanged to %out.
; The expected instruction sequences are the same as those listed for the
; i32 argument kernel -- a scalar load, a move and a single dword store.
511 define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind {
513 ; SI: ; %bb.0: ; %entry
514 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
515 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
516 ; SI-NEXT: s_mov_b32 s3, 0xf000
517 ; SI-NEXT: s_mov_b32 s2, -1
518 ; SI-NEXT: s_waitcnt lgkmcnt(0)
519 ; SI-NEXT: v_mov_b32_e32 v0, s6
520 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
524 ; VI: ; %bb.0: ; %entry
525 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
526 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
527 ; VI-NEXT: s_waitcnt lgkmcnt(0)
528 ; VI-NEXT: v_mov_b32_e32 v0, s0
529 ; VI-NEXT: v_mov_b32_e32 v1, s1
530 ; VI-NEXT: v_mov_b32_e32 v2, s2
531 ; VI-NEXT: flat_store_dword v[0:1], v2
534 ; GFX9-LABEL: f32_arg:
535 ; GFX9: ; %bb.0: ; %entry
536 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
537 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
538 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
539 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
540 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
541 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
542 ; GFX9-NEXT: s_endpgm
545 ; EG: ; %bb.0: ; %entry
546 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
547 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
550 ; EG-NEXT: ALU clause starting at 4:
551 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
552 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
553 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
556 ; CM: ; %bb.0: ; %entry
557 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
558 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
561 ; CM-NEXT: ALU clause starting at 4:
562 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
563 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
564 ; CM-NEXT: MOV * T1.X, KC0[2].Z,
; IR under test -- store the float argument as-is.
566 store float %in, ptr addrspace(1) %out, align 4
; Kernel taking a <2 x i8> argument and storing it to %out (the store has
; no explicit alignment, and the signature has no nocapture/nounwind).
; The amdgcn expectations write the vector with one 16-bit (short) store;
; the r600 expectations fetch 16 bits and write through MEM_RAT MSKOR.
570 define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
571 ; SI-LABEL: v2i8_arg:
572 ; SI: ; %bb.0: ; %entry
573 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
574 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
575 ; SI-NEXT: s_mov_b32 s3, 0xf000
576 ; SI-NEXT: s_mov_b32 s2, -1
577 ; SI-NEXT: s_waitcnt lgkmcnt(0)
578 ; SI-NEXT: v_mov_b32_e32 v0, s6
579 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
582 ; VI-LABEL: v2i8_arg:
583 ; VI: ; %bb.0: ; %entry
584 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
585 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
586 ; VI-NEXT: s_waitcnt lgkmcnt(0)
587 ; VI-NEXT: v_mov_b32_e32 v0, s0
588 ; VI-NEXT: v_mov_b32_e32 v1, s1
589 ; VI-NEXT: v_mov_b32_e32 v2, s2
590 ; VI-NEXT: flat_store_short v[0:1], v2
593 ; GFX9-LABEL: v2i8_arg:
594 ; GFX9: ; %bb.0: ; %entry
595 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
596 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
597 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
598 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
600 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
601 ; GFX9-NEXT: s_endpgm
603 ; EG-LABEL: v2i8_arg:
604 ; EG: ; %bb.0: ; %entry
605 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
607 ; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
608 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
611 ; EG-NEXT: Fetch clause starting at 6:
612 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
613 ; EG-NEXT: ALU clause starting at 8:
614 ; EG-NEXT: MOV * T0.X, 0.0,
615 ; EG-NEXT: ALU clause starting at 9:
616 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
617 ; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
618 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
619 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
620 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
621 ; EG-NEXT: LSHL T0.X, T1.W, PV.W,
622 ; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
623 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
624 ; EG-NEXT: MOV T0.Y, 0.0,
625 ; EG-NEXT: MOV * T0.Z, 0.0,
626 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
627 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
629 ; CM-LABEL: v2i8_arg:
630 ; CM: ; %bb.0: ; %entry
631 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
633 ; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
634 ; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
637 ; CM-NEXT: Fetch clause starting at 6:
638 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
639 ; CM-NEXT: ALU clause starting at 8:
640 ; CM-NEXT: MOV * T0.X, 0.0,
641 ; CM-NEXT: ALU clause starting at 9:
642 ; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
643 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
644 ; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
645 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
646 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
647 ; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
648 ; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
649 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
650 ; CM-NEXT: MOV T0.Y, 0.0,
651 ; CM-NEXT: MOV * T0.Z, 0.0,
652 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
653 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- store the two-byte vector argument as-is.
655 store <2 x i8> %in, ptr addrspace(1) %out
; Kernel taking a <2 x i16> argument and storing it to %out (the store has
; no explicit alignment).
; Every run line expects the 32-bit vector to be written with one dword
; store, with no masking or shifting of the argument.
659 define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
660 ; SI-LABEL: v2i16_arg:
661 ; SI: ; %bb.0: ; %entry
662 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
663 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
664 ; SI-NEXT: s_mov_b32 s3, 0xf000
665 ; SI-NEXT: s_mov_b32 s2, -1
666 ; SI-NEXT: s_waitcnt lgkmcnt(0)
667 ; SI-NEXT: v_mov_b32_e32 v0, s6
668 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
671 ; VI-LABEL: v2i16_arg:
672 ; VI: ; %bb.0: ; %entry
673 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
674 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
675 ; VI-NEXT: s_waitcnt lgkmcnt(0)
676 ; VI-NEXT: v_mov_b32_e32 v0, s0
677 ; VI-NEXT: v_mov_b32_e32 v1, s1
678 ; VI-NEXT: v_mov_b32_e32 v2, s2
679 ; VI-NEXT: flat_store_dword v[0:1], v2
682 ; GFX9-LABEL: v2i16_arg:
683 ; GFX9: ; %bb.0: ; %entry
684 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
685 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
686 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
687 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
688 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
689 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
690 ; GFX9-NEXT: s_endpgm
692 ; EG-LABEL: v2i16_arg:
693 ; EG: ; %bb.0: ; %entry
694 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
695 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
698 ; EG-NEXT: ALU clause starting at 4:
699 ; EG-NEXT: MOV T0.X, KC0[2].Z,
700 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
701 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
703 ; CM-LABEL: v2i16_arg:
704 ; CM: ; %bb.0: ; %entry
705 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
706 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
709 ; CM-NEXT: ALU clause starting at 4:
710 ; CM-NEXT: MOV * T0.X, KC0[2].Z,
711 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
712 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- store the two-halfword vector argument as-is.
714 store <2 x i16> %in, ptr addrspace(1) %out
; Kernel taking a <2 x i32> argument and storing it to %out with align 4.
; The amdgcn expectations fetch the pointer and both vector elements with a
; single s_load_dwordx4 and write them with one dwordx2 store; the r600
; expectations read the elements from KC0[2].W and KC0[3].X.
718 define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind {
719 ; SI-LABEL: v2i32_arg:
720 ; SI: ; %bb.0: ; %entry
721 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
722 ; SI-NEXT: s_mov_b32 s7, 0xf000
723 ; SI-NEXT: s_mov_b32 s6, -1
724 ; SI-NEXT: s_waitcnt lgkmcnt(0)
725 ; SI-NEXT: s_mov_b32 s4, s0
726 ; SI-NEXT: s_mov_b32 s5, s1
727 ; SI-NEXT: v_mov_b32_e32 v0, s2
728 ; SI-NEXT: v_mov_b32_e32 v1, s3
729 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
732 ; VI-LABEL: v2i32_arg:
733 ; VI: ; %bb.0: ; %entry
734 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
735 ; VI-NEXT: s_waitcnt lgkmcnt(0)
736 ; VI-NEXT: v_mov_b32_e32 v0, s0
737 ; VI-NEXT: v_mov_b32_e32 v2, s2
738 ; VI-NEXT: v_mov_b32_e32 v1, s1
739 ; VI-NEXT: v_mov_b32_e32 v3, s3
740 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
743 ; GFX9-LABEL: v2i32_arg:
744 ; GFX9: ; %bb.0: ; %entry
745 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
746 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
747 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
749 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
750 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
751 ; GFX9-NEXT: s_endpgm
753 ; EG-LABEL: v2i32_arg:
754 ; EG: ; %bb.0: ; %entry
755 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
756 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
759 ; EG-NEXT: ALU clause starting at 4:
760 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
761 ; EG-NEXT: MOV T0.X, KC0[2].W,
762 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
763 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
765 ; CM-LABEL: v2i32_arg:
766 ; CM: ; %bb.0: ; %entry
767 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
768 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
771 ; CM-NEXT: ALU clause starting at 4:
772 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
773 ; CM-NEXT: MOV * T0.X, KC0[2].W,
774 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
775 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- store the two-element i32 vector argument as-is.
777 store <2 x i32> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <2 x float> argument and storing it to %out with align 4.
; The expected instruction sequences are the same as those listed for the
; <2 x i32> kernel -- one s_load_dwordx4 plus one dwordx2 store on amdgcn,
; and moves from KC0[2].W and KC0[3].X on r600.
781 define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind {
782 ; SI-LABEL: v2f32_arg:
783 ; SI: ; %bb.0: ; %entry
784 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
785 ; SI-NEXT: s_mov_b32 s7, 0xf000
786 ; SI-NEXT: s_mov_b32 s6, -1
787 ; SI-NEXT: s_waitcnt lgkmcnt(0)
788 ; SI-NEXT: s_mov_b32 s4, s0
789 ; SI-NEXT: s_mov_b32 s5, s1
790 ; SI-NEXT: v_mov_b32_e32 v0, s2
791 ; SI-NEXT: v_mov_b32_e32 v1, s3
792 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
795 ; VI-LABEL: v2f32_arg:
796 ; VI: ; %bb.0: ; %entry
797 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
798 ; VI-NEXT: s_waitcnt lgkmcnt(0)
799 ; VI-NEXT: v_mov_b32_e32 v0, s0
800 ; VI-NEXT: v_mov_b32_e32 v2, s2
801 ; VI-NEXT: v_mov_b32_e32 v1, s1
802 ; VI-NEXT: v_mov_b32_e32 v3, s3
803 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
806 ; GFX9-LABEL: v2f32_arg:
807 ; GFX9: ; %bb.0: ; %entry
808 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
809 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
810 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
811 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
812 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
813 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
814 ; GFX9-NEXT: s_endpgm
816 ; EG-LABEL: v2f32_arg:
817 ; EG: ; %bb.0: ; %entry
818 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
819 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
822 ; EG-NEXT: ALU clause starting at 4:
823 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
824 ; EG-NEXT: MOV T0.X, KC0[2].W,
825 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
826 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
828 ; CM-LABEL: v2f32_arg:
829 ; CM: ; %bb.0: ; %entry
830 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
831 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
834 ; CM-NEXT: ALU clause starting at 4:
835 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
836 ; CM-NEXT: MOV * T0.X, KC0[2].W,
837 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
838 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- store the two-element float vector argument as-is.
840 store <2 x float> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <3 x i8> argument and storing it to %out with align 4.
; The amdgcn expectations split the three-byte store into a byte store at
; byte offset 2 followed by a short store at the base address. The r600
; expectations read the three bytes with separate VTX_READ_8 fetches and
; emit two MEM_RAT MSKOR writes.
844 define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
845 ; SI-LABEL: v3i8_arg:
846 ; SI: ; %bb.0: ; %entry
847 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
848 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
849 ; SI-NEXT: s_mov_b32 s3, 0xf000
850 ; SI-NEXT: s_waitcnt lgkmcnt(0)
851 ; SI-NEXT: s_lshr_b32 s4, s6, 16
852 ; SI-NEXT: s_mov_b32 s2, -1
853 ; SI-NEXT: v_mov_b32_e32 v0, s6
854 ; SI-NEXT: v_mov_b32_e32 v1, s4
855 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
856 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
859 ; VI-LABEL: v3i8_arg:
860 ; VI: ; %bb.0: ; %entry
861 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
862 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
863 ; VI-NEXT: s_waitcnt lgkmcnt(0)
864 ; VI-NEXT: s_lshr_b32 s3, s2, 16
865 ; VI-NEXT: v_mov_b32_e32 v0, s0
866 ; VI-NEXT: v_mov_b32_e32 v1, s1
867 ; VI-NEXT: s_add_u32 s0, s0, 2
868 ; VI-NEXT: s_addc_u32 s1, s1, 0
869 ; VI-NEXT: v_mov_b32_e32 v3, s1
870 ; VI-NEXT: v_mov_b32_e32 v5, s3
871 ; VI-NEXT: v_mov_b32_e32 v2, s0
872 ; VI-NEXT: v_mov_b32_e32 v4, s2
873 ; VI-NEXT: flat_store_byte v[2:3], v5
874 ; VI-NEXT: flat_store_short v[0:1], v4
877 ; GFX9-LABEL: v3i8_arg:
878 ; GFX9: ; %bb.0: ; %entry
879 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
880 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
881 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
882 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
883 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
884 ; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2
885 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
886 ; GFX9-NEXT: s_endpgm
888 ; EG-LABEL: v3i8_arg:
889 ; EG: ; %bb.0: ; %entry
890 ; EG-NEXT: ALU 0, @12, KC0[], KC1[]
892 ; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[]
893 ; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X
894 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
896 ; EG-NEXT: Fetch clause starting at 6:
897 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
898 ; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3
899 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
900 ; EG-NEXT: ALU clause starting at 12:
901 ; EG-NEXT: MOV * T4.X, 0.0,
902 ; EG-NEXT: ALU clause starting at 13:
903 ; EG-NEXT: LSHL T0.W, T5.X, literal.x,
904 ; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
905 ; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
906 ; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
907 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
908 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
909 ; EG-NEXT: AND_INT T0.W, PS, literal.x,
910 ; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
911 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
912 ; EG-NEXT: LSHL T4.X, PV.W, PS,
913 ; EG-NEXT: LSHL * T4.W, literal.x, PS,
914 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
915 ; EG-NEXT: MOV T4.Y, 0.0,
916 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
917 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
918 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
919 ; EG-NEXT: AND_INT * T2.W, T6.X, literal.y,
920 ; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
921 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
922 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
923 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
924 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
925 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
926 ; EG-NEXT: MOV T5.Y, 0.0,
927 ; EG-NEXT: MOV T4.Z, 0.0,
928 ; EG-NEXT: MOV * T5.Z, 0.0,
929 ; EG-NEXT: LSHR T6.X, T0.W, literal.x,
930 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
931 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
933 ; CM-LABEL: v3i8_arg:
934 ; CM: ; %bb.0: ; %entry
935 ; CM-NEXT: ALU 0, @12, KC0[], KC1[]
937 ; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[]
938 ; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X
939 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X
941 ; CM-NEXT: Fetch clause starting at 6:
942 ; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
943 ; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3
944 ; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
945 ; CM-NEXT: ALU clause starting at 12:
946 ; CM-NEXT: MOV * T4.X, 0.0,
947 ; CM-NEXT: ALU clause starting at 13:
948 ; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
949 ; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
950 ; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
951 ; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x,
952 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
953 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
954 ; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
955 ; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
956 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
957 ; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
958 ; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
959 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
960 ; CM-NEXT: MOV T4.Y, 0.0,
961 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
962 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
963 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
964 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
965 ; CM-NEXT: AND_INT T0.Z, T6.X, literal.x,
966 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
967 ; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
968 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
969 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
970 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
971 ; CM-NEXT: MOV T5.Y, 0.0,
972 ; CM-NEXT: MOV * T4.Z, 0.0,
973 ; CM-NEXT: MOV * T5.Z, 0.0,
974 ; CM-NEXT: LSHR * T6.X, T0.W, literal.x,
975 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
976 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
977 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test -- store the three-byte vector argument as-is.
979 store <3 x i8> %in, ptr addrspace(1) %out, align 4
; v3i16_arg passes a <3 x i16> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out with 4-byte alignment.
; The SI, VI and GFX9 checks expect a 16-bit store at offset 4 from %out
; followed by a dword store at %out. The EG and CM checks expect three
; VTX_READ_16 fetches, one MEM_RAT MSKOR write and one cacheless dword store.
983 define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
984 ; SI-LABEL: v3i16_arg:
985 ; SI: ; %bb.0: ; %entry
986 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
987 ; SI-NEXT: s_mov_b32 s7, 0xf000
988 ; SI-NEXT: s_mov_b32 s6, -1
989 ; SI-NEXT: s_waitcnt lgkmcnt(0)
990 ; SI-NEXT: s_mov_b32 s4, s0
991 ; SI-NEXT: s_mov_b32 s5, s1
992 ; SI-NEXT: v_mov_b32_e32 v0, s3
993 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
994 ; SI-NEXT: s_waitcnt expcnt(0)
995 ; SI-NEXT: v_mov_b32_e32 v0, s2
996 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
999 ; VI-LABEL: v3i16_arg:
1000 ; VI: ; %bb.0: ; %entry
1001 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1002 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1003 ; VI-NEXT: s_add_u32 s4, s0, 4
1004 ; VI-NEXT: s_addc_u32 s5, s1, 0
1005 ; VI-NEXT: v_mov_b32_e32 v2, s4
1006 ; VI-NEXT: v_mov_b32_e32 v4, s3
1007 ; VI-NEXT: v_mov_b32_e32 v0, s0
1008 ; VI-NEXT: v_mov_b32_e32 v3, s5
1009 ; VI-NEXT: v_mov_b32_e32 v1, s1
1010 ; VI-NEXT: v_mov_b32_e32 v5, s2
1011 ; VI-NEXT: flat_store_short v[2:3], v4
1012 ; VI-NEXT: flat_store_dword v[0:1], v5
1015 ; GFX9-LABEL: v3i16_arg:
1016 ; GFX9: ; %bb.0: ; %entry
1017 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1018 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1019 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1020 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1021 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1022 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4
1023 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
1024 ; GFX9-NEXT: s_endpgm
1026 ; EG-LABEL: v3i16_arg:
1027 ; EG: ; %bb.0: ; %entry
1028 ; EG-NEXT: ALU 0, @12, KC0[], KC1[]
1030 ; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
1031 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
1032 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
1034 ; EG-NEXT: Fetch clause starting at 6:
1035 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
1036 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
1037 ; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
1038 ; EG-NEXT: ALU clause starting at 12:
1039 ; EG-NEXT: MOV * T5.X, 0.0,
1040 ; EG-NEXT: ALU clause starting at 13:
1041 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1042 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1043 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
1044 ; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
1045 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1046 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1047 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1048 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
1049 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
1050 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1051 ; EG-NEXT: MOV T5.Y, 0.0,
1052 ; EG-NEXT: MOV * T5.Z, 0.0,
1053 ; EG-NEXT: LSHR T8.X, T0.W, literal.x,
1054 ; EG-NEXT: LSHL T0.W, T7.X, literal.y,
1055 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
1056 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1057 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1058 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1059 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1060 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1062 ; CM-LABEL: v3i16_arg:
1063 ; CM: ; %bb.0: ; %entry
1064 ; CM-NEXT: ALU 0, @12, KC0[], KC1[]
1066 ; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
1067 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
1068 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
1070 ; CM-NEXT: Fetch clause starting at 6:
1071 ; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
1072 ; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
1073 ; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
1074 ; CM-NEXT: ALU clause starting at 12:
1075 ; CM-NEXT: MOV * T5.X, 0.0,
1076 ; CM-NEXT: ALU clause starting at 13:
1077 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1078 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1079 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
1080 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1081 ; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
1082 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1083 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1084 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
1085 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
1086 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1087 ; CM-NEXT: MOV T5.Y, 0.0,
1088 ; CM-NEXT: MOV * T5.Z, 0.0,
1089 ; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
1090 ; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
1091 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
1092 ; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
1093 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1094 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1095 ; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
1096 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1098 store <3 x i16> %in, ptr addrspace(1) %out, align 4
; v3i32_arg passes a <3 x i32> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out with 4-byte alignment.
; The SI checks expect a dword store at offset 8 followed by a dwordx2 store.
; The VI and GFX9 checks expect a single dwordx3 store. The EG and CM checks
; expect two cacheless stores fed directly from constant-buffer reads.
1102 define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
1103 ; SI-LABEL: v3i32_arg:
1104 ; SI: ; %bb.0: ; %entry
1105 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
1106 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1107 ; SI-NEXT: s_mov_b32 s7, 0xf000
1108 ; SI-NEXT: s_mov_b32 s6, -1
1109 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1110 ; SI-NEXT: v_mov_b32_e32 v0, s2
1111 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
1112 ; SI-NEXT: s_waitcnt expcnt(0)
1113 ; SI-NEXT: v_mov_b32_e32 v0, s0
1114 ; SI-NEXT: v_mov_b32_e32 v1, s1
1115 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1118 ; VI-LABEL: v3i32_arg:
1119 ; VI: ; %bb.0: ; %entry
1120 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
1121 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
1122 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1123 ; VI-NEXT: v_mov_b32_e32 v0, s0
1124 ; VI-NEXT: v_mov_b32_e32 v3, s4
1125 ; VI-NEXT: v_mov_b32_e32 v1, s1
1126 ; VI-NEXT: v_mov_b32_e32 v2, s2
1127 ; VI-NEXT: v_mov_b32_e32 v4, s5
1128 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
1131 ; GFX9-LABEL: v3i32_arg:
1132 ; GFX9: ; %bb.0: ; %entry
1133 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
1134 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1135 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1136 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1137 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1138 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1139 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1140 ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
1141 ; GFX9-NEXT: s_endpgm
1143 ; EG-LABEL: v3i32_arg:
1144 ; EG: ; %bb.0: ; %entry
1145 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1146 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1147 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1149 ; EG-NEXT: ALU clause starting at 4:
1150 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1151 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1152 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1153 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1154 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1155 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1156 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1157 ; EG-NEXT: MOV * T3.X, KC0[3].W,
1158 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1160 ; CM-LABEL: v3i32_arg:
1161 ; CM: ; %bb.0: ; %entry
1162 ; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1163 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1164 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1166 ; CM-NEXT: ALU clause starting at 4:
1167 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1168 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1169 ; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
1170 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1171 ; CM-NEXT: MOV T1.X, KC0[3].W,
1172 ; CM-NEXT: MOV * T2.Y, KC0[3].Z,
1173 ; CM-NEXT: MOV * T2.X, KC0[3].Y,
1174 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
1175 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1177 store <3 x i32> %in, ptr addrspace(1) %out, align 4
; v3f32_arg passes a <3 x float> kernel argument and stores it unchanged
; through the addrspace(1) pointer %out with 4-byte alignment.
; The SI checks expect a dword store at offset 8 followed by a dwordx2 store.
; The VI and GFX9 checks expect a single dwordx3 store. The EG and CM checks
; expect two cacheless stores fed directly from constant-buffer reads.
1181 define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
1182 ; SI-LABEL: v3f32_arg:
1183 ; SI: ; %bb.0: ; %entry
1184 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
1185 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1186 ; SI-NEXT: s_mov_b32 s7, 0xf000
1187 ; SI-NEXT: s_mov_b32 s6, -1
1188 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1189 ; SI-NEXT: v_mov_b32_e32 v0, s2
1190 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
1191 ; SI-NEXT: s_waitcnt expcnt(0)
1192 ; SI-NEXT: v_mov_b32_e32 v0, s0
1193 ; SI-NEXT: v_mov_b32_e32 v1, s1
1194 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1197 ; VI-LABEL: v3f32_arg:
1198 ; VI: ; %bb.0: ; %entry
1199 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
1200 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
1201 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1202 ; VI-NEXT: v_mov_b32_e32 v0, s0
1203 ; VI-NEXT: v_mov_b32_e32 v3, s4
1204 ; VI-NEXT: v_mov_b32_e32 v1, s1
1205 ; VI-NEXT: v_mov_b32_e32 v2, s2
1206 ; VI-NEXT: v_mov_b32_e32 v4, s5
1207 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
1210 ; GFX9-LABEL: v3f32_arg:
1211 ; GFX9: ; %bb.0: ; %entry
1212 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
1213 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1214 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1215 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1216 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1217 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1218 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1219 ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
1220 ; GFX9-NEXT: s_endpgm
1222 ; EG-LABEL: v3f32_arg:
1223 ; EG: ; %bb.0: ; %entry
1224 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1225 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1226 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1228 ; EG-NEXT: ALU clause starting at 4:
1229 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1230 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1231 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1232 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1233 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1234 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1235 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1236 ; EG-NEXT: MOV * T3.X, KC0[3].W,
1237 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1239 ; CM-LABEL: v3f32_arg:
1240 ; CM: ; %bb.0: ; %entry
1241 ; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1242 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1243 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1245 ; CM-NEXT: ALU clause starting at 4:
1246 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1247 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1248 ; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
1249 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1250 ; CM-NEXT: MOV T1.X, KC0[3].W,
1251 ; CM-NEXT: MOV * T2.Y, KC0[3].Z,
1252 ; CM-NEXT: MOV * T2.X, KC0[3].Y,
1253 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
1254 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1256 store <3 x float> %in, ptr addrspace(1) %out, align 4
; v4i8_arg passes a <4 x i8> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out; the store carries no explicit alignment.
; The SI, VI and GFX9 checks expect the vector to be read with one dword load
; and written with one dword store. The EG and CM checks expect one MOV from
; the constant buffer and one cacheless store.
1260 define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
1261 ; SI-LABEL: v4i8_arg:
1262 ; SI: ; %bb.0: ; %entry
1263 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
1264 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1265 ; SI-NEXT: s_mov_b32 s3, 0xf000
1266 ; SI-NEXT: s_mov_b32 s2, -1
1267 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1268 ; SI-NEXT: v_mov_b32_e32 v0, s6
1269 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1272 ; VI-LABEL: v4i8_arg:
1273 ; VI: ; %bb.0: ; %entry
1274 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1275 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
1276 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1277 ; VI-NEXT: v_mov_b32_e32 v0, s0
1278 ; VI-NEXT: v_mov_b32_e32 v1, s1
1279 ; VI-NEXT: v_mov_b32_e32 v2, s2
1280 ; VI-NEXT: flat_store_dword v[0:1], v2
1283 ; GFX9-LABEL: v4i8_arg:
1284 ; GFX9: ; %bb.0: ; %entry
1285 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
1286 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1287 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1288 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1289 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1290 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1291 ; GFX9-NEXT: s_endpgm
1293 ; EG-LABEL: v4i8_arg:
1294 ; EG: ; %bb.0: ; %entry
1295 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1296 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1299 ; EG-NEXT: ALU clause starting at 4:
1300 ; EG-NEXT: MOV T0.X, KC0[2].Z,
1301 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1302 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1304 ; CM-LABEL: v4i8_arg:
1305 ; CM: ; %bb.0: ; %entry
1306 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1307 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1310 ; CM-NEXT: ALU clause starting at 4:
1311 ; CM-NEXT: MOV * T0.X, KC0[2].Z,
1312 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1313 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1315 store <4 x i8> %in, ptr addrspace(1) %out
; v4i16_arg passes a <4 x i16> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out; the store carries no explicit alignment.
; The SI, VI and GFX9 checks expect a single dwordx4 scalar load whose low two
; dwords are used as the store address and whose high two dwords are the data,
; written with one dwordx2 store. The EG and CM checks expect two MOVs from the
; constant buffer and one cacheless store.
1319 define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
1320 ; SI-LABEL: v4i16_arg:
1321 ; SI: ; %bb.0: ; %entry
1322 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1323 ; SI-NEXT: s_mov_b32 s7, 0xf000
1324 ; SI-NEXT: s_mov_b32 s6, -1
1325 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1326 ; SI-NEXT: s_mov_b32 s4, s0
1327 ; SI-NEXT: s_mov_b32 s5, s1
1328 ; SI-NEXT: v_mov_b32_e32 v0, s2
1329 ; SI-NEXT: v_mov_b32_e32 v1, s3
1330 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1333 ; VI-LABEL: v4i16_arg:
1334 ; VI: ; %bb.0: ; %entry
1335 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1336 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1337 ; VI-NEXT: v_mov_b32_e32 v0, s0
1338 ; VI-NEXT: v_mov_b32_e32 v2, s2
1339 ; VI-NEXT: v_mov_b32_e32 v1, s1
1340 ; VI-NEXT: v_mov_b32_e32 v3, s3
1341 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1344 ; GFX9-LABEL: v4i16_arg:
1345 ; GFX9: ; %bb.0: ; %entry
1346 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1347 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1348 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1349 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1350 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1351 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1352 ; GFX9-NEXT: s_endpgm
1354 ; EG-LABEL: v4i16_arg:
1355 ; EG: ; %bb.0: ; %entry
1356 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1357 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1360 ; EG-NEXT: ALU clause starting at 4:
1361 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
1362 ; EG-NEXT: MOV T0.X, KC0[2].W,
1363 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1364 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1366 ; CM-LABEL: v4i16_arg:
1367 ; CM: ; %bb.0: ; %entry
1368 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1369 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1372 ; CM-NEXT: ALU clause starting at 4:
1373 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
1374 ; CM-NEXT: MOV * T0.X, KC0[2].W,
1375 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1376 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1378 store <4 x i16> %in, ptr addrspace(1) %out
; v4i32_arg passes a <4 x i32> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out with 4-byte alignment.
; The SI, VI and GFX9 checks expect a dwordx2 scalar load for the pointer, a
; dwordx4 scalar load for the vector, and a single dwordx4 store. The EG and CM
; checks expect four MOVs from the constant buffer and one cacheless store.
1382 define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind {
1383 ; SI-LABEL: v4i32_arg:
1384 ; SI: ; %bb.0: ; %entry
1385 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
1386 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1387 ; SI-NEXT: s_mov_b32 s7, 0xf000
1388 ; SI-NEXT: s_mov_b32 s6, -1
1389 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1390 ; SI-NEXT: v_mov_b32_e32 v0, s0
1391 ; SI-NEXT: v_mov_b32_e32 v1, s1
1392 ; SI-NEXT: v_mov_b32_e32 v2, s2
1393 ; SI-NEXT: v_mov_b32_e32 v3, s3
1394 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1397 ; VI-LABEL: v4i32_arg:
1398 ; VI: ; %bb.0: ; %entry
1399 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1400 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
1401 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1402 ; VI-NEXT: v_mov_b32_e32 v4, s6
1403 ; VI-NEXT: v_mov_b32_e32 v0, s0
1404 ; VI-NEXT: v_mov_b32_e32 v5, s7
1405 ; VI-NEXT: v_mov_b32_e32 v1, s1
1406 ; VI-NEXT: v_mov_b32_e32 v2, s2
1407 ; VI-NEXT: v_mov_b32_e32 v3, s3
1408 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1411 ; GFX9-LABEL: v4i32_arg:
1412 ; GFX9: ; %bb.0: ; %entry
1413 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
1414 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1415 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1416 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1417 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1418 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1419 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1420 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1421 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
1422 ; GFX9-NEXT: s_endpgm
1424 ; EG-LABEL: v4i32_arg:
1425 ; EG: ; %bb.0: ; %entry
1426 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1427 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1430 ; EG-NEXT: ALU clause starting at 4:
1431 ; EG-NEXT: MOV * T0.W, KC0[4].X,
1432 ; EG-NEXT: MOV * T0.Z, KC0[3].W,
1433 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1434 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1435 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1436 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1438 ; CM-LABEL: v4i32_arg:
1439 ; CM: ; %bb.0: ; %entry
1440 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1441 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1444 ; CM-NEXT: ALU clause starting at 4:
1445 ; CM-NEXT: MOV * T0.W, KC0[4].X,
1446 ; CM-NEXT: MOV * T0.Z, KC0[3].W,
1447 ; CM-NEXT: MOV * T0.Y, KC0[3].Z,
1448 ; CM-NEXT: MOV * T0.X, KC0[3].Y,
1449 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1450 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1452 store <4 x i32> %in, ptr addrspace(1) %out, align 4
; v4f32_arg passes a <4 x float> kernel argument and stores it unchanged
; through the addrspace(1) pointer %out with 4-byte alignment.
; The SI, VI and GFX9 checks expect a dwordx2 scalar load for the pointer, a
; dwordx4 scalar load for the vector, and a single dwordx4 store. The EG and CM
; checks expect four MOVs from the constant buffer and one cacheless store.
1456 define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind {
1457 ; SI-LABEL: v4f32_arg:
1458 ; SI: ; %bb.0: ; %entry
1459 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
1460 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1461 ; SI-NEXT: s_mov_b32 s7, 0xf000
1462 ; SI-NEXT: s_mov_b32 s6, -1
1463 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1464 ; SI-NEXT: v_mov_b32_e32 v0, s0
1465 ; SI-NEXT: v_mov_b32_e32 v1, s1
1466 ; SI-NEXT: v_mov_b32_e32 v2, s2
1467 ; SI-NEXT: v_mov_b32_e32 v3, s3
1468 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1471 ; VI-LABEL: v4f32_arg:
1472 ; VI: ; %bb.0: ; %entry
1473 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1474 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
1475 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1476 ; VI-NEXT: v_mov_b32_e32 v4, s6
1477 ; VI-NEXT: v_mov_b32_e32 v0, s0
1478 ; VI-NEXT: v_mov_b32_e32 v5, s7
1479 ; VI-NEXT: v_mov_b32_e32 v1, s1
1480 ; VI-NEXT: v_mov_b32_e32 v2, s2
1481 ; VI-NEXT: v_mov_b32_e32 v3, s3
1482 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1485 ; GFX9-LABEL: v4f32_arg:
1486 ; GFX9: ; %bb.0: ; %entry
1487 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
1488 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1489 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1490 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1491 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1492 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1493 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1494 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1495 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
1496 ; GFX9-NEXT: s_endpgm
1498 ; EG-LABEL: v4f32_arg:
1499 ; EG: ; %bb.0: ; %entry
1500 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1501 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1504 ; EG-NEXT: ALU clause starting at 4:
1505 ; EG-NEXT: MOV * T0.W, KC0[4].X,
1506 ; EG-NEXT: MOV * T0.Z, KC0[3].W,
1507 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1508 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1509 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1510 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1512 ; CM-LABEL: v4f32_arg:
1513 ; CM: ; %bb.0: ; %entry
1514 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1515 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1518 ; CM-NEXT: ALU clause starting at 4:
1519 ; CM-NEXT: MOV * T0.W, KC0[4].X,
1520 ; CM-NEXT: MOV * T0.Z, KC0[3].W,
1521 ; CM-NEXT: MOV * T0.Y, KC0[3].Z,
1522 ; CM-NEXT: MOV * T0.X, KC0[3].Y,
1523 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1524 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1526 store <4 x float> %in, ptr addrspace(1) %out, align 4
; v5i8_arg passes a <5 x i8> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out with 4-byte alignment.
; The SI, VI and GFX9 checks expect a byte store at offset 4 from %out followed
; by a dword store at %out. The EG and CM checks expect five VTX_READ_8
; fetches, one MEM_RAT MSKOR write and one cacheless dword store.
1530 define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
1531 ; SI-LABEL: v5i8_arg:
1532 ; SI: ; %bb.0: ; %entry
1533 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1534 ; SI-NEXT: s_mov_b32 s7, 0xf000
1535 ; SI-NEXT: s_mov_b32 s6, -1
1536 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1537 ; SI-NEXT: s_mov_b32 s4, s0
1538 ; SI-NEXT: s_mov_b32 s5, s1
1539 ; SI-NEXT: v_mov_b32_e32 v0, s3
1540 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4
1541 ; SI-NEXT: s_waitcnt expcnt(0)
1542 ; SI-NEXT: v_mov_b32_e32 v0, s2
1543 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1546 ; VI-LABEL: v5i8_arg:
1547 ; VI: ; %bb.0: ; %entry
1548 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1549 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1550 ; VI-NEXT: s_add_u32 s4, s0, 4
1551 ; VI-NEXT: s_addc_u32 s5, s1, 0
1552 ; VI-NEXT: v_mov_b32_e32 v2, s4
1553 ; VI-NEXT: v_mov_b32_e32 v4, s3
1554 ; VI-NEXT: v_mov_b32_e32 v0, s0
1555 ; VI-NEXT: v_mov_b32_e32 v3, s5
1556 ; VI-NEXT: v_mov_b32_e32 v1, s1
1557 ; VI-NEXT: v_mov_b32_e32 v5, s2
1558 ; VI-NEXT: flat_store_byte v[2:3], v4
1559 ; VI-NEXT: flat_store_dword v[0:1], v5
1562 ; GFX9-LABEL: v5i8_arg:
1563 ; GFX9: ; %bb.0: ; %entry
1564 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1565 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1566 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1567 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1568 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1569 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4
1570 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
1571 ; GFX9-NEXT: s_endpgm
1573 ; EG-LABEL: v5i8_arg:
1574 ; EG: ; %bb.0: ; %entry
1575 ; EG-NEXT: ALU 0, @16, KC0[], KC1[]
1577 ; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
1578 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
1579 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
1581 ; EG-NEXT: Fetch clause starting at 6:
1582 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
1583 ; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
1584 ; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
1585 ; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
1586 ; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
1587 ; EG-NEXT: ALU clause starting at 16:
1588 ; EG-NEXT: MOV * T5.X, 0.0,
1589 ; EG-NEXT: ALU clause starting at 17:
1590 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1591 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1592 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
1593 ; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
1594 ; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
1595 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1596 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1597 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
1598 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
1599 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1600 ; EG-NEXT: MOV T5.Y, 0.0,
1601 ; EG-NEXT: MOV T5.Z, 0.0,
1602 ; EG-NEXT: AND_INT T1.W, T9.X, literal.x,
1603 ; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x,
1604 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1605 ; EG-NEXT: LSHL T1.W, PV.W, literal.x,
1606 ; EG-NEXT: LSHL * T2.W, T7.X, literal.y,
1607 ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
1608 ; EG-NEXT: OR_INT T1.W, PS, PV.W,
1609 ; EG-NEXT: LSHL * T2.W, T0.Z, literal.x,
1610 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1611 ; EG-NEXT: OR_INT T1.W, PV.W, PS,
1612 ; EG-NEXT: AND_INT * T2.W, T6.X, literal.x,
1613 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1614 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1615 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1616 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1617 ; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
1618 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1620 ; CM-LABEL: v5i8_arg:
1621 ; CM: ; %bb.0: ; %entry
1622 ; CM-NEXT: ALU 0, @16, KC0[], KC1[]
1624 ; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
1625 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
1626 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X
1628 ; CM-NEXT: Fetch clause starting at 6:
1629 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
1630 ; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
1631 ; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
1632 ; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
1633 ; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
1634 ; CM-NEXT: ALU clause starting at 16:
1635 ; CM-NEXT: MOV * T5.X, 0.0,
1636 ; CM-NEXT: ALU clause starting at 17:
1637 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1638 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1639 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
1640 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1641 ; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
1642 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1643 ; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1644 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
1645 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
1646 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1647 ; CM-NEXT: MOV T5.Y, 0.0,
1648 ; CM-NEXT: MOV T5.Z, 0.0,
1649 ; CM-NEXT: AND_INT * T1.W, T9.X, literal.x,
1650 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1651 ; CM-NEXT: AND_INT T0.Y, T8.X, literal.x,
1652 ; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
1653 ; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
1654 ; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
1655 ; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1656 ; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
1657 ; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
1658 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1659 ; CM-NEXT: LSHR T7.X, T0.W, literal.x,
1660 ; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
1661 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
1662 ; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43)
1663 ; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
1664 ; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
1665 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1667 store <5 x i8> %in, ptr addrspace(1) %out, align 4
; v5i16_arg passes a <5 x i16> kernel argument and stores it unchanged through
; the addrspace(1) pointer %out with 4-byte alignment.
; The SI, VI and GFX9 checks expect a 16-bit store at offset 8 from %out plus a
; dwordx2 store at %out. The EG and CM checks expect five VTX_READ_16 fetches
; and five MEM_RAT MSKOR writes, at byte offsets 0, 2, 4, 6 and 8 from %out.
1671 define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind {
1672 ; SI-LABEL: v5i16_arg:
1673 ; SI: ; %bb.0: ; %entry
1674 ; SI-NEXT: s_load_dword s6, s[4:5], 0xf
1675 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1676 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1677 ; SI-NEXT: s_mov_b32 s3, 0xf000
1678 ; SI-NEXT: s_mov_b32 s2, -1
1679 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1680 ; SI-NEXT: v_mov_b32_e32 v0, s6
1681 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
1682 ; SI-NEXT: s_waitcnt expcnt(0)
1683 ; SI-NEXT: v_mov_b32_e32 v0, s4
1684 ; SI-NEXT: v_mov_b32_e32 v1, s5
1685 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1688 ; VI-LABEL: v5i16_arg:
1689 ; VI: ; %bb.0: ; %entry
1690 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1691 ; VI-NEXT: s_load_dword s6, s[4:5], 0x3c
1692 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
1693 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1694 ; VI-NEXT: s_add_u32 s4, s0, 8
1695 ; VI-NEXT: s_addc_u32 s5, s1, 0
1696 ; VI-NEXT: v_mov_b32_e32 v2, s4
1697 ; VI-NEXT: v_mov_b32_e32 v4, s6
1698 ; VI-NEXT: v_mov_b32_e32 v3, s5
1699 ; VI-NEXT: v_mov_b32_e32 v0, s0
1700 ; VI-NEXT: flat_store_short v[2:3], v4
1701 ; VI-NEXT: v_mov_b32_e32 v2, s2
1702 ; VI-NEXT: v_mov_b32_e32 v1, s1
1703 ; VI-NEXT: v_mov_b32_e32 v3, s3
1704 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1707 ; GFX9-LABEL: v5i16_arg:
1708 ; GFX9: ; %bb.0: ; %entry
1709 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
1710 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1711 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1712 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1713 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1714 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1715 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1716 ; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8
1717 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1718 ; GFX9-NEXT: s_endpgm
1720 ; EG-LABEL: v5i16_arg:
1721 ; EG: ; %bb.0: ; %entry
1722 ; EG-NEXT: ALU 0, @20, KC0[], KC1[]
1723 ; EG-NEXT: TEX 4 @10
1724 ; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]
1725 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X
1726 ; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X
1727 ; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X
1728 ; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X
1729 ; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X
1732 ; EG-NEXT: Fetch clause starting at 10:
1733 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3
1734 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3
1735 ; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3
1736 ; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3
1737 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
1738 ; EG-NEXT: ALU clause starting at 20:
1739 ; EG-NEXT: MOV * T0.X, 0.0,
1740 ; EG-NEXT: ALU clause starting at 21:
1741 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1742 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1743 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
1744 ; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
1745 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1746 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1747 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1748 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
1749 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
1750 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1751 ; EG-NEXT: MOV T5.Y, 0.0,
1752 ; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
1753 ; EG-NEXT: AND_INT * T2.W, T4.X, literal.y,
1754 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1755 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1756 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1757 ; EG-NEXT: LSHL T4.X, T2.W, PV.W,
1758 ; EG-NEXT: LSHL * T4.W, literal.x, PV.W,
1759 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1760 ; EG-NEXT: MOV T4.Y, 0.0,
1761 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
1762 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1763 ; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
1764 ; EG-NEXT: AND_INT * T3.W, T3.X, literal.y,
1765 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1766 ; EG-NEXT: LSHL * T2.W, PV.W, literal.x,
1767 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1768 ; EG-NEXT: LSHL T3.X, T3.W, PV.W,
1769 ; EG-NEXT: LSHL * T3.W, literal.x, PV.W,
1770 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1771 ; EG-NEXT: MOV T3.Y, 0.0,
1772 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
1773 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1774 ; EG-NEXT: AND_INT T6.W, PV.W, literal.x,
1775 ; EG-NEXT: AND_INT * T7.W, T2.X, literal.y,
1776 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1777 ; EG-NEXT: LSHL * T6.W, PV.W, literal.x,
1778 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1779 ; EG-NEXT: LSHL T6.X, T7.W, PV.W,
1780 ; EG-NEXT: LSHL * T6.W, literal.x, PV.W,
1781 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1782 ; EG-NEXT: MOV T6.Y, 0.0,
1783 ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
1784 ; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
1785 ; EG-NEXT: AND_INT T8.W, PV.W, literal.x,
1786 ; EG-NEXT: AND_INT * T9.W, T1.X, literal.y,
1787 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1788 ; EG-NEXT: LSHL * T8.W, PV.W, literal.x,
1789 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1790 ; EG-NEXT: LSHL T8.X, T9.W, PV.W,
1791 ; EG-NEXT: LSHL * T8.W, literal.x, PV.W,
1792 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1793 ; EG-NEXT: MOV T8.Y, 0.0,
1794 ; EG-NEXT: MOV T5.Z, 0.0,
1795 ; EG-NEXT: MOV * T4.Z, 0.0,
1796 ; EG-NEXT: MOV T3.Z, 0.0,
1797 ; EG-NEXT: MOV * T6.Z, 0.0,
1798 ; EG-NEXT: MOV * T8.Z, 0.0,
1799 ; EG-NEXT: LSHR T0.X, T7.W, literal.x,
1800 ; EG-NEXT: LSHR * T1.X, T2.W, literal.x,
1801 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1802 ; EG-NEXT: LSHR T2.X, T1.W, literal.x,
1803 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1804 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1805 ; EG-NEXT: LSHR * T9.X, T0.W, literal.x,
1806 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1808 ; CM-LABEL: v5i16_arg:
1809 ; CM: ; %bb.0: ; %entry
1810 ; CM-NEXT: ALU 0, @20, KC0[], KC1[]
1811 ; CM-NEXT: TEX 4 @10
1812 ; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[]
1813 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T9.X
1814 ; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X
1815 ; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X
1816 ; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X
1817 ; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X
1820 ; CM-NEXT: Fetch clause starting at 10:
1821 ; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3
1822 ; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3
1823 ; CM-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3
1824 ; CM-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3
1825 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
1826 ; CM-NEXT: ALU clause starting at 20:
1827 ; CM-NEXT: MOV * T0.X, 0.0,
1828 ; CM-NEXT: ALU clause starting at 21:
1829 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1830 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1831 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
1832 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1833 ; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
1834 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1835 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1836 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
1837 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
1838 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1839 ; CM-NEXT: MOV T5.Y, 0.0,
1840 ; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1841 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1842 ; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
1843 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1844 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1845 ; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
1846 ; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
1847 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1848 ; CM-NEXT: MOV T4.Y, 0.0,
1849 ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
1850 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1851 ; CM-NEXT: AND_INT * T2.W, PV.W, literal.x,
1852 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1853 ; CM-NEXT: AND_INT T0.Z, T3.X, literal.x,
1854 ; CM-NEXT: LSHL * T2.W, PV.W, literal.y,
1855 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1856 ; CM-NEXT: LSHL T3.X, PV.Z, PV.W,
1857 ; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
1858 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1859 ; CM-NEXT: MOV T3.Y, 0.0,
1860 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
1861 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1862 ; CM-NEXT: AND_INT * T6.W, PV.W, literal.x,
1863 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1864 ; CM-NEXT: AND_INT T0.Z, T2.X, literal.x,
1865 ; CM-NEXT: LSHL * T6.W, PV.W, literal.y,
1866 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1867 ; CM-NEXT: LSHL T6.X, PV.Z, PV.W,
1868 ; CM-NEXT: LSHL * T6.W, literal.x, PV.W,
1869 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1870 ; CM-NEXT: MOV T6.Y, 0.0,
1871 ; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
1872 ; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00)
1873 ; CM-NEXT: AND_INT * T8.W, PV.W, literal.x,
1874 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1875 ; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
1876 ; CM-NEXT: LSHL * T8.W, PV.W, literal.y,
1877 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1878 ; CM-NEXT: LSHL T8.X, PV.Z, PV.W,
1879 ; CM-NEXT: LSHL * T8.W, literal.x, PV.W,
1880 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1881 ; CM-NEXT: MOV T8.Y, 0.0,
1882 ; CM-NEXT: MOV * T5.Z, 0.0,
1883 ; CM-NEXT: MOV * T4.Z, 0.0,
1884 ; CM-NEXT: MOV * T3.Z, 0.0,
1885 ; CM-NEXT: MOV * T6.Z, 0.0,
1886 ; CM-NEXT: MOV * T8.Z, 0.0,
1887 ; CM-NEXT: LSHR * T0.X, T7.W, literal.x,
1888 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1889 ; CM-NEXT: LSHR * T1.X, T2.W, literal.x,
1890 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1891 ; CM-NEXT: LSHR * T2.X, T1.W, literal.x,
1892 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1893 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1894 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1895 ; CM-NEXT: LSHR * T9.X, T0.W, literal.x,
1896 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1898 store <5 x i16> %in, ptr addrspace(1) %out, align 4
; Kernel taking a by-value <5 x i32> argument and storing it unchanged through
; the global (addrspace(1)) pointer %out with 4-byte alignment.
; The amdgcn checks expect the vector to be fetched with one s_load_dwordx4 plus
; one s_load_dword, and written back as a dwordx4 store at offset 0 and a dword
; store at offset 16. The r600 checks expect the elements to be moved out of KC0
; and written with two MEM_RAT_CACHELESS stores.
1902 define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind {
1903 ; SI-LABEL: v5i32_arg:
1904 ; SI: ; %bb.0: ; %entry
1905 ; SI-NEXT: s_load_dword s8, s[4:5], 0x15
1906 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1907 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11
1908 ; SI-NEXT: s_mov_b32 s3, 0xf000
1909 ; SI-NEXT: s_mov_b32 s2, -1
1910 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1911 ; SI-NEXT: v_mov_b32_e32 v0, s8
1912 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
1913 ; SI-NEXT: s_waitcnt expcnt(0)
1914 ; SI-NEXT: v_mov_b32_e32 v0, s4
1915 ; SI-NEXT: v_mov_b32_e32 v1, s5
1916 ; SI-NEXT: v_mov_b32_e32 v2, s6
1917 ; SI-NEXT: v_mov_b32_e32 v3, s7
1918 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1921 ; VI-LABEL: v5i32_arg:
1922 ; VI: ; %bb.0: ; %entry
1923 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1924 ; VI-NEXT: s_load_dword s8, s[4:5], 0x54
1925 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
1926 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1927 ; VI-NEXT: s_add_u32 s4, s6, 16
1928 ; VI-NEXT: s_addc_u32 s5, s7, 0
1929 ; VI-NEXT: v_mov_b32_e32 v0, s4
1930 ; VI-NEXT: v_mov_b32_e32 v2, s8
1931 ; VI-NEXT: v_mov_b32_e32 v1, s5
1932 ; VI-NEXT: v_mov_b32_e32 v4, s6
1933 ; VI-NEXT: flat_store_dword v[0:1], v2
1934 ; VI-NEXT: v_mov_b32_e32 v0, s0
1935 ; VI-NEXT: v_mov_b32_e32 v5, s7
1936 ; VI-NEXT: v_mov_b32_e32 v1, s1
1937 ; VI-NEXT: v_mov_b32_e32 v2, s2
1938 ; VI-NEXT: v_mov_b32_e32 v3, s3
1939 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1942 ; GFX9-LABEL: v5i32_arg:
1943 ; GFX9: ; %bb.0: ; %entry
1944 ; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30
1945 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
1946 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
1947 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1948 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1949 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
1950 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1951 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1952 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1953 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1954 ; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16
1955 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
1956 ; GFX9-NEXT: s_endpgm
1958 ; EG-LABEL: v5i32_arg:
1959 ; EG: ; %bb.0: ; %entry
1960 ; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
1961 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1962 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1964 ; EG-NEXT: ALU clause starting at 4:
1965 ; EG-NEXT: MOV * T0.W, KC0[5].X,
1966 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
1967 ; EG-NEXT: MOV * T0.Y, KC0[4].Z,
1968 ; EG-NEXT: MOV T0.X, KC0[4].Y,
1969 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1970 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1971 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
1972 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1973 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1974 ; EG-NEXT: MOV * T3.X, KC0[5].Y,
1975 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1977 ; CM-LABEL: v5i32_arg:
1978 ; CM: ; %bb.0: ; %entry
1979 ; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
1980 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
1981 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
1983 ; CM-NEXT: ALU clause starting at 4:
1984 ; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
1985 ; CM-NEXT: MOV * T0.W, KC0[5].X,
1986 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1987 ; CM-NEXT: LSHR T1.X, PV.Z, literal.x,
1988 ; CM-NEXT: MOV * T0.Z, KC0[4].W,
1989 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1990 ; CM-NEXT: MOV T2.X, KC0[5].Y,
1991 ; CM-NEXT: MOV * T0.Y, KC0[4].Z,
1992 ; CM-NEXT: MOV * T0.X, KC0[4].Y,
1993 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
1994 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1996 store <5 x i32> %in, ptr addrspace(1) %out, align 4
; Kernel taking a by-value <5 x float> argument and storing it unchanged through
; the global (addrspace(1)) pointer %out with 4-byte alignment.
; The amdgcn checks expect one s_load_dwordx4 plus one s_load_dword for the
; vector, and a dwordx4 store at offset 0 plus a dword store at offset 16. The
; r600 checks expect the elements to be moved out of KC0 and written with two
; MEM_RAT_CACHELESS stores.
2000 define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind {
2001 ; SI-LABEL: v5f32_arg:
2002 ; SI: ; %bb.0: ; %entry
2003 ; SI-NEXT: s_load_dword s8, s[4:5], 0x15
2004 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2005 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11
2006 ; SI-NEXT: s_mov_b32 s3, 0xf000
2007 ; SI-NEXT: s_mov_b32 s2, -1
2008 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2009 ; SI-NEXT: v_mov_b32_e32 v0, s8
2010 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
2011 ; SI-NEXT: s_waitcnt expcnt(0)
2012 ; SI-NEXT: v_mov_b32_e32 v0, s4
2013 ; SI-NEXT: v_mov_b32_e32 v1, s5
2014 ; SI-NEXT: v_mov_b32_e32 v2, s6
2015 ; SI-NEXT: v_mov_b32_e32 v3, s7
2016 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2019 ; VI-LABEL: v5f32_arg:
2020 ; VI: ; %bb.0: ; %entry
2021 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2022 ; VI-NEXT: s_load_dword s8, s[4:5], 0x54
2023 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
2024 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2025 ; VI-NEXT: s_add_u32 s4, s6, 16
2026 ; VI-NEXT: s_addc_u32 s5, s7, 0
2027 ; VI-NEXT: v_mov_b32_e32 v1, s4
2028 ; VI-NEXT: v_mov_b32_e32 v3, s8
2029 ; VI-NEXT: v_mov_b32_e32 v2, s5
2030 ; VI-NEXT: v_mov_b32_e32 v4, s6
2031 ; VI-NEXT: v_mov_b32_e32 v0, s0
2032 ; VI-NEXT: flat_store_dword v[1:2], v3
2033 ; VI-NEXT: v_mov_b32_e32 v1, s1
2034 ; VI-NEXT: v_mov_b32_e32 v2, s2
2035 ; VI-NEXT: v_mov_b32_e32 v3, s3
2036 ; VI-NEXT: v_mov_b32_e32 v5, s7
2037 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2040 ; GFX9-LABEL: v5f32_arg:
2041 ; GFX9: ; %bb.0: ; %entry
2042 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
2043 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2044 ; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30
2045 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2046 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2047 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2048 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2049 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2050 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2051 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
2052 ; GFX9-NEXT: s_nop 0
2053 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2054 ; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16
2055 ; GFX9-NEXT: s_endpgm
2057 ; EG-LABEL: v5f32_arg:
2058 ; EG: ; %bb.0: ; %entry
2059 ; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
2060 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2061 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2063 ; EG-NEXT: ALU clause starting at 4:
2064 ; EG-NEXT: MOV * T0.W, KC0[5].X,
2065 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
2066 ; EG-NEXT: MOV * T0.Y, KC0[4].Z,
2067 ; EG-NEXT: MOV T0.X, KC0[4].Y,
2068 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2069 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2070 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2071 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2072 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
2073 ; EG-NEXT: MOV * T3.X, KC0[5].Y,
2074 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2076 ; CM-LABEL: v5f32_arg:
2077 ; CM: ; %bb.0: ; %entry
2078 ; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
2079 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2080 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2082 ; CM-NEXT: ALU clause starting at 4:
2083 ; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
2084 ; CM-NEXT: MOV * T0.W, KC0[5].X,
2085 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2086 ; CM-NEXT: LSHR T1.X, PV.Z, literal.x,
2087 ; CM-NEXT: MOV * T0.Z, KC0[4].W,
2088 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2089 ; CM-NEXT: MOV T2.X, KC0[5].Y,
2090 ; CM-NEXT: MOV * T0.Y, KC0[4].Z,
2091 ; CM-NEXT: MOV * T0.X, KC0[4].Y,
2092 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2093 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2095 store <5 x float> %in, ptr addrspace(1) %out, align 4
; Kernel taking a by-value <5 x i64> argument and storing it unchanged through
; the global (addrspace(1)) pointer %out with 8-byte alignment.
; The amdgcn checks expect the vector to be fetched with one s_load_dwordx8 plus
; one s_load_dwordx2, and written back as dwordx4 stores at offsets 0 and 16
; followed or preceded by a dwordx2 store at offset 32. The r600 checks expect
; the elements to be moved out of KC0 and written with three
; MEM_RAT_CACHELESS stores.
2099 define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind {
2100 ; SI-LABEL: v5i64_arg:
2101 ; SI: ; %bb.0: ; %entry
2102 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19
2103 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2104 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21
2105 ; SI-NEXT: s_mov_b32 s3, 0xf000
2106 ; SI-NEXT: s_mov_b32 s2, -1
2107 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2108 ; SI-NEXT: v_mov_b32_e32 v0, s12
2109 ; SI-NEXT: v_mov_b32_e32 v1, s13
2110 ; SI-NEXT: v_mov_b32_e32 v2, s14
2111 ; SI-NEXT: v_mov_b32_e32 v3, s15
2112 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2113 ; SI-NEXT: s_waitcnt expcnt(0)
2114 ; SI-NEXT: v_mov_b32_e32 v0, s8
2115 ; SI-NEXT: v_mov_b32_e32 v1, s9
2116 ; SI-NEXT: v_mov_b32_e32 v2, s10
2117 ; SI-NEXT: v_mov_b32_e32 v3, s11
2118 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2119 ; SI-NEXT: s_waitcnt expcnt(0)
2120 ; SI-NEXT: v_mov_b32_e32 v0, s4
2121 ; SI-NEXT: v_mov_b32_e32 v1, s5
2122 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
2125 ; VI-LABEL: v5i64_arg:
2126 ; VI: ; %bb.0: ; %entry
2127 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
2128 ; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84
2129 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
2130 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2131 ; VI-NEXT: s_add_u32 s12, s8, 32
2132 ; VI-NEXT: v_mov_b32_e32 v1, s10
2133 ; VI-NEXT: s_addc_u32 s13, s9, 0
2134 ; VI-NEXT: v_mov_b32_e32 v3, s12
2135 ; VI-NEXT: v_mov_b32_e32 v2, s11
2136 ; VI-NEXT: v_mov_b32_e32 v0, s4
2137 ; VI-NEXT: v_mov_b32_e32 v4, s13
2138 ; VI-NEXT: s_add_u32 s4, s8, 16
2139 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
2140 ; VI-NEXT: v_mov_b32_e32 v1, s5
2141 ; VI-NEXT: s_addc_u32 s5, s9, 0
2142 ; VI-NEXT: v_mov_b32_e32 v4, s4
2143 ; VI-NEXT: v_mov_b32_e32 v2, s6
2144 ; VI-NEXT: v_mov_b32_e32 v3, s7
2145 ; VI-NEXT: v_mov_b32_e32 v5, s5
2146 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2147 ; VI-NEXT: v_mov_b32_e32 v4, s8
2148 ; VI-NEXT: v_mov_b32_e32 v0, s0
2149 ; VI-NEXT: v_mov_b32_e32 v1, s1
2150 ; VI-NEXT: v_mov_b32_e32 v2, s2
2151 ; VI-NEXT: v_mov_b32_e32 v3, s3
2152 ; VI-NEXT: v_mov_b32_e32 v5, s9
2153 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2156 ; GFX9-LABEL: v5i64_arg:
2157 ; GFX9: ; %bb.0: ; %entry
2158 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60
2159 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40
2160 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
2161 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2162 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2163 ; GFX9-NEXT: v_mov_b32_e32 v1, s10
2164 ; GFX9-NEXT: v_mov_b32_e32 v2, s11
2165 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2166 ; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32
2167 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2168 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
2169 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
2170 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
2171 ; GFX9-NEXT: s_nop 0
2172 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2173 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2174 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2175 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2176 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
2177 ; GFX9-NEXT: s_endpgm
2179 ; EG-LABEL: v5i64_arg:
2180 ; EG: ; %bb.0: ; %entry
2181 ; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2182 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2183 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2184 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2187 ; EG-NEXT: ALU clause starting at 6:
2188 ; EG-NEXT: MOV * T0.W, KC0[7].X,
2189 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
2190 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
2191 ; EG-NEXT: MOV * T1.W, KC0[8].X,
2192 ; EG-NEXT: MOV T0.X, KC0[6].Y,
2193 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
2194 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2195 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
2196 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2197 ; EG-NEXT: MOV T1.X, KC0[7].Y,
2198 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2199 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2200 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2201 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
2202 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2203 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
2204 ; EG-NEXT: MOV T5.Y, KC0[8].Z,
2205 ; EG-NEXT: MOV * T5.X, KC0[8].Y,
2206 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2208 ; CM-LABEL: v5i64_arg:
2209 ; CM: ; %bb.0: ; %entry
2210 ; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2211 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2212 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2213 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2216 ; CM-NEXT: ALU clause starting at 6:
2217 ; CM-NEXT: MOV * T0.W, KC0[8].X,
2218 ; CM-NEXT: MOV T1.Y, KC0[8].Z,
2219 ; CM-NEXT: MOV * T0.Z, KC0[7].W,
2220 ; CM-NEXT: MOV T1.X, KC0[8].Y,
2221 ; CM-NEXT: MOV * T0.Y, KC0[7].Z,
2222 ; CM-NEXT: MOV T0.X, KC0[7].Y,
2223 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
2224 ; CM-NEXT: MOV * T2.W, KC0[7].X,
2225 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
2226 ; CM-NEXT: LSHR T3.X, PV.Z, literal.x,
2227 ; CM-NEXT: MOV T2.Z, KC0[6].W,
2228 ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
2229 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2230 ; CM-NEXT: LSHR T4.X, PV.W, literal.x,
2231 ; CM-NEXT: MOV * T2.Y, KC0[6].Z,
2232 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2233 ; CM-NEXT: MOV * T2.X, KC0[6].Y,
2234 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
2235 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2237 store <5 x i64> %in, ptr addrspace(1) %out, align 8
; Kernel taking a by-value <5 x double> argument and storing it unchanged
; through the global (addrspace(1)) pointer %out with 8-byte alignment.
; The amdgcn checks expect the vector to be fetched with one s_load_dwordx8 plus
; one s_load_dwordx2, and written back as dwordx4 stores at offsets 0 and 16
; plus a dwordx2 store at offset 32. The r600 checks expect the elements to be
; moved out of KC0 and written with three MEM_RAT_CACHELESS stores.
2241 define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
2242 ; SI-LABEL: v5f64_arg:
2243 ; SI: ; %bb.0: ; %entry
2244 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19
2245 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2246 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21
2247 ; SI-NEXT: s_mov_b32 s3, 0xf000
2248 ; SI-NEXT: s_mov_b32 s2, -1
2249 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2250 ; SI-NEXT: v_mov_b32_e32 v0, s12
2251 ; SI-NEXT: v_mov_b32_e32 v1, s13
2252 ; SI-NEXT: v_mov_b32_e32 v2, s14
2253 ; SI-NEXT: v_mov_b32_e32 v3, s15
2254 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2255 ; SI-NEXT: s_waitcnt expcnt(0)
2256 ; SI-NEXT: v_mov_b32_e32 v0, s8
2257 ; SI-NEXT: v_mov_b32_e32 v1, s9
2258 ; SI-NEXT: v_mov_b32_e32 v2, s10
2259 ; SI-NEXT: v_mov_b32_e32 v3, s11
2260 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2261 ; SI-NEXT: s_waitcnt expcnt(0)
2262 ; SI-NEXT: v_mov_b32_e32 v0, s4
2263 ; SI-NEXT: v_mov_b32_e32 v1, s5
2264 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
2267 ; VI-LABEL: v5f64_arg:
2268 ; VI: ; %bb.0: ; %entry
2269 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
2270 ; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84
2271 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
2272 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2273 ; VI-NEXT: s_add_u32 s12, s8, 32
2274 ; VI-NEXT: v_mov_b32_e32 v1, s10
2275 ; VI-NEXT: s_addc_u32 s13, s9, 0
2276 ; VI-NEXT: v_mov_b32_e32 v3, s12
2277 ; VI-NEXT: v_mov_b32_e32 v2, s11
2278 ; VI-NEXT: v_mov_b32_e32 v0, s4
2279 ; VI-NEXT: v_mov_b32_e32 v4, s13
2280 ; VI-NEXT: s_add_u32 s4, s8, 16
2281 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
2282 ; VI-NEXT: v_mov_b32_e32 v1, s5
2283 ; VI-NEXT: s_addc_u32 s5, s9, 0
2284 ; VI-NEXT: v_mov_b32_e32 v4, s4
2285 ; VI-NEXT: v_mov_b32_e32 v2, s6
2286 ; VI-NEXT: v_mov_b32_e32 v3, s7
2287 ; VI-NEXT: v_mov_b32_e32 v5, s5
2288 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2289 ; VI-NEXT: v_mov_b32_e32 v4, s8
2290 ; VI-NEXT: v_mov_b32_e32 v0, s0
2291 ; VI-NEXT: v_mov_b32_e32 v1, s1
2292 ; VI-NEXT: v_mov_b32_e32 v2, s2
2293 ; VI-NEXT: v_mov_b32_e32 v3, s3
2294 ; VI-NEXT: v_mov_b32_e32 v5, s9
2295 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2298 ; GFX9-LABEL: v5f64_arg:
2299 ; GFX9: ; %bb.0: ; %entry
2300 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60
2301 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40
2302 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
2303 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2304 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2305 ; GFX9-NEXT: v_mov_b32_e32 v1, s10
2306 ; GFX9-NEXT: v_mov_b32_e32 v2, s11
2307 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2308 ; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32
2309 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2310 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
2311 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
2312 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
2313 ; GFX9-NEXT: s_nop 0
2314 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2315 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2316 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2317 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2318 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
2319 ; GFX9-NEXT: s_endpgm
2321 ; EG-LABEL: v5f64_arg:
2322 ; EG: ; %bb.0: ; %entry
2323 ; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2324 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2325 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2326 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2329 ; EG-NEXT: ALU clause starting at 6:
2330 ; EG-NEXT: MOV * T0.W, KC0[7].X,
2331 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
2332 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
2333 ; EG-NEXT: MOV * T1.W, KC0[8].X,
2334 ; EG-NEXT: MOV T0.X, KC0[6].Y,
2335 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
2336 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2337 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
2338 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2339 ; EG-NEXT: MOV T1.X, KC0[7].Y,
2340 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2341 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2342 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2343 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
2344 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2345 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
2346 ; EG-NEXT: MOV T5.Y, KC0[8].Z,
2347 ; EG-NEXT: MOV * T5.X, KC0[8].Y,
2348 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2350 ; CM-LABEL: v5f64_arg:
2351 ; CM: ; %bb.0: ; %entry
2352 ; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2353 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2354 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2355 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2358 ; CM-NEXT: ALU clause starting at 6:
2359 ; CM-NEXT: MOV * T0.W, KC0[8].X,
2360 ; CM-NEXT: MOV T1.Y, KC0[8].Z,
2361 ; CM-NEXT: MOV * T0.Z, KC0[7].W,
2362 ; CM-NEXT: MOV T1.X, KC0[8].Y,
2363 ; CM-NEXT: MOV * T0.Y, KC0[7].Z,
2364 ; CM-NEXT: MOV T0.X, KC0[7].Y,
2365 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
2366 ; CM-NEXT: MOV * T2.W, KC0[7].X,
2367 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
2368 ; CM-NEXT: LSHR T3.X, PV.Z, literal.x,
2369 ; CM-NEXT: MOV T2.Z, KC0[6].W,
2370 ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
2371 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2372 ; CM-NEXT: LSHR T4.X, PV.W, literal.x,
2373 ; CM-NEXT: MOV * T2.Y, KC0[6].Z,
2374 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2375 ; CM-NEXT: MOV * T2.X, KC0[6].Y,
2376 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
2377 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2379 store <5 x double> %in, ptr addrspace(1) %out, align 8
2383 ; FIXME: Lots of unpack and re-pack junk on EG and CM (the VI checks below are now a single load and store)
; Kernel taking a by-value <8 x i8> argument and storing it unchanged through
; the global (addrspace(1)) pointer %out; the store carries no explicit
; alignment.
; The amdgcn checks expect a single s_load_dwordx4 whose upper two registers
; hold the vector, followed by one dwordx2 store. The r600 checks expect eight
; separate VTX_READ_8 fetches, each followed by an ALU clause that masks,
; shifts and ORs the byte into place, and then one MEM_RAT_CACHELESS store.
2384 define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
2385 ; SI-LABEL: v8i8_arg:
2386 ; SI: ; %bb.0: ; %entry
2387 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2388 ; SI-NEXT: s_mov_b32 s7, 0xf000
2389 ; SI-NEXT: s_mov_b32 s6, -1
2390 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2391 ; SI-NEXT: s_mov_b32 s4, s0
2392 ; SI-NEXT: s_mov_b32 s5, s1
2393 ; SI-NEXT: v_mov_b32_e32 v0, s2
2394 ; SI-NEXT: v_mov_b32_e32 v1, s3
2395 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2398 ; VI-LABEL: v8i8_arg:
2399 ; VI: ; %bb.0: ; %entry
2400 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2401 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2402 ; VI-NEXT: v_mov_b32_e32 v0, s0
2403 ; VI-NEXT: v_mov_b32_e32 v2, s2
2404 ; VI-NEXT: v_mov_b32_e32 v1, s1
2405 ; VI-NEXT: v_mov_b32_e32 v3, s3
2406 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2409 ; GFX9-LABEL: v8i8_arg:
2410 ; GFX9: ; %bb.0: ; %entry
2411 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2412 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2413 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2414 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2415 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2416 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
2417 ; GFX9-NEXT: s_endpgm
2419 ; EG-LABEL: v8i8_arg:
2420 ; EG: ; %bb.0: ; %entry
2421 ; EG-NEXT: ALU 1, @36, KC0[], KC1[]
2422 ; EG-NEXT: TEX 0 @20
2423 ; EG-NEXT: ALU 5, @38, KC0[], KC1[]
2424 ; EG-NEXT: TEX 0 @22
2425 ; EG-NEXT: ALU 5, @44, KC0[], KC1[]
2426 ; EG-NEXT: TEX 0 @24
2427 ; EG-NEXT: ALU 7, @50, KC0[], KC1[]
2428 ; EG-NEXT: TEX 0 @26
2429 ; EG-NEXT: ALU 7, @58, KC0[], KC1[]
2430 ; EG-NEXT: TEX 0 @28
2431 ; EG-NEXT: ALU 7, @66, KC0[], KC1[]
2432 ; EG-NEXT: TEX 0 @30
2433 ; EG-NEXT: ALU 7, @74, KC0[], KC1[]
2434 ; EG-NEXT: TEX 0 @32
2435 ; EG-NEXT: ALU 5, @82, KC0[], KC1[]
2436 ; EG-NEXT: TEX 0 @34
2437 ; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
2438 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
2441 ; EG-NEXT: Fetch clause starting at 20:
2442 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
2443 ; EG-NEXT: Fetch clause starting at 22:
2444 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
2445 ; EG-NEXT: Fetch clause starting at 24:
2446 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
2447 ; EG-NEXT: Fetch clause starting at 26:
2448 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
2449 ; EG-NEXT: Fetch clause starting at 28:
2450 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
2451 ; EG-NEXT: Fetch clause starting at 30:
2452 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
2453 ; EG-NEXT: Fetch clause starting at 32:
2454 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
2455 ; EG-NEXT: Fetch clause starting at 34:
2456 ; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
2457 ; EG-NEXT: ALU clause starting at 36:
2458 ; EG-NEXT: MOV * T0.Y, T2.X,
2459 ; EG-NEXT: MOV * T5.X, 0.0,
2460 ; EG-NEXT: ALU clause starting at 38:
2461 ; EG-NEXT: LSHL T0.W, T6.X, literal.x,
2462 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2463 ; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
2464 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
2465 ; EG-NEXT: MOV T2.X, PV.W,
2466 ; EG-NEXT: MOV * T0.Y, T3.X,
2467 ; EG-NEXT: ALU clause starting at 44:
2468 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2469 ; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
2470 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
2471 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2472 ; EG-NEXT: MOV T3.X, PV.W,
2473 ; EG-NEXT: MOV * T0.Y, T2.X,
2474 ; EG-NEXT: ALU clause starting at 50:
2475 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2476 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2477 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
2478 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2479 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2480 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2481 ; EG-NEXT: MOV T2.X, PV.W,
2482 ; EG-NEXT: MOV * T0.Y, T3.X,
2483 ; EG-NEXT: ALU clause starting at 58:
2484 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2485 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2486 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
2487 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2488 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2489 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2490 ; EG-NEXT: MOV T3.X, PV.W,
2491 ; EG-NEXT: MOV * T0.Y, T2.X,
2492 ; EG-NEXT: ALU clause starting at 66:
2493 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2494 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2495 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
2496 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2497 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2498 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2499 ; EG-NEXT: MOV T2.X, PV.W,
2500 ; EG-NEXT: MOV * T0.Y, T3.X,
2501 ; EG-NEXT: ALU clause starting at 74:
2502 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2503 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2504 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
2505 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2506 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2507 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2508 ; EG-NEXT: MOV T3.X, PV.W,
2509 ; EG-NEXT: MOV * T0.Y, T2.X,
2510 ; EG-NEXT: ALU clause starting at 82:
2511 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2512 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
2513 ; EG-NEXT: -256(nan), 255(3.573311e-43)
2514 ; EG-NEXT: OR_INT * T5.Y, PV.W, PS,
2515 ; EG-NEXT: MOV T2.X, PV.Y,
2516 ; EG-NEXT: MOV * T0.Y, T3.X,
2517 ; EG-NEXT: ALU clause starting at 88:
2518 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2519 ; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
2520 ; EG-NEXT: -256(nan), 255(3.573311e-43)
2521 ; EG-NEXT: OR_INT T5.X, PV.W, PS,
2522 ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
2523 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2525 ; CM-LABEL: v8i8_arg:
2526 ; CM: ; %bb.0: ; %entry
2527 ; CM-NEXT: ALU 1, @36, KC0[], KC1[]
2528 ; CM-NEXT: TEX 0 @20
2529 ; CM-NEXT: ALU 5, @38, KC0[], KC1[]
2530 ; CM-NEXT: TEX 0 @22
2531 ; CM-NEXT: ALU 5, @44, KC0[], KC1[]
2532 ; CM-NEXT: TEX 0 @24
2533 ; CM-NEXT: ALU 7, @50, KC0[], KC1[]
2534 ; CM-NEXT: TEX 0 @26
2535 ; CM-NEXT: ALU 7, @58, KC0[], KC1[]
2536 ; CM-NEXT: TEX 0 @28
2537 ; CM-NEXT: ALU 7, @66, KC0[], KC1[]
2538 ; CM-NEXT: TEX 0 @30
2539 ; CM-NEXT: ALU 7, @74, KC0[], KC1[]
2540 ; CM-NEXT: TEX 0 @32
2541 ; CM-NEXT: ALU 5, @82, KC0[], KC1[]
2542 ; CM-NEXT: TEX 0 @34
2543 ; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
2544 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
2547 ; CM-NEXT: Fetch clause starting at 20:
2548 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
2549 ; CM-NEXT: Fetch clause starting at 22:
2550 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
2551 ; CM-NEXT: Fetch clause starting at 24:
2552 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
2553 ; CM-NEXT: Fetch clause starting at 26:
2554 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
2555 ; CM-NEXT: Fetch clause starting at 28:
2556 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
2557 ; CM-NEXT: Fetch clause starting at 30:
2558 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
2559 ; CM-NEXT: Fetch clause starting at 32:
2560 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
2561 ; CM-NEXT: Fetch clause starting at 34:
2562 ; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
2563 ; CM-NEXT: ALU clause starting at 36:
2564 ; CM-NEXT: MOV * T0.Y, T2.X,
2565 ; CM-NEXT: MOV * T5.X, 0.0,
2566 ; CM-NEXT: ALU clause starting at 38:
2567 ; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
2568 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
2569 ; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
2570 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
2571 ; CM-NEXT: MOV T2.X, PV.W,
2572 ; CM-NEXT: MOV * T0.Y, T3.X,
2573 ; CM-NEXT: ALU clause starting at 44:
2574 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2575 ; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
2576 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
2577 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2578 ; CM-NEXT: MOV T3.X, PV.W,
2579 ; CM-NEXT: MOV * T0.Y, T2.X,
2580 ; CM-NEXT: ALU clause starting at 50:
2581 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2582 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2583 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2584 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2585 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
2586 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2587 ; CM-NEXT: MOV T2.X, PV.W,
2588 ; CM-NEXT: MOV * T0.Y, T3.X,
2589 ; CM-NEXT: ALU clause starting at 58:
2590 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2591 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2592 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2593 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2594 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
2595 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2596 ; CM-NEXT: MOV T3.X, PV.W,
2597 ; CM-NEXT: MOV * T0.Y, T2.X,
2598 ; CM-NEXT: ALU clause starting at 66:
2599 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2600 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2601 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2602 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2603 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
2604 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2605 ; CM-NEXT: MOV T2.X, PV.W,
2606 ; CM-NEXT: MOV * T0.Y, T3.X,
2607 ; CM-NEXT: ALU clause starting at 74:
2608 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2609 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2610 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2611 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2612 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
2613 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2614 ; CM-NEXT: MOV T3.X, PV.W,
2615 ; CM-NEXT: MOV * T0.Y, T2.X,
2616 ; CM-NEXT: ALU clause starting at 82:
2617 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2618 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
2619 ; CM-NEXT: -256(nan), 255(3.573311e-43)
2620 ; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W,
2621 ; CM-NEXT: MOV T2.X, PV.Y,
2622 ; CM-NEXT: MOV * T0.Y, T3.X,
2623 ; CM-NEXT: ALU clause starting at 88:
2624 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2625 ; CM-NEXT: AND_INT * T0.W, T5.X, literal.y,
2626 ; CM-NEXT: -256(nan), 255(3.573311e-43)
2627 ; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W,
2628 ; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
2629 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2631 store <8 x i8> %in, ptr addrspace(1) %out
; Test: an <8 x i16> kernel argument is stored unchanged to the global pointer %out.
; Per the expectations below, the amdgcn runs load the 16-byte vector with a
; single s_load_dwordx4 and write it back with one dwordx4 store. The r600 runs
; (the EG and CM prefixes) instead fetch every i16 element with its own
; VTX_READ_16 at byte offsets 52..66 and merge the halves into dwords with
; LSHL / AND_INT / OR_INT sequences before issuing a single store.
2635 define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
2636 ; SI-LABEL: v8i16_arg:
2637 ; SI: ; %bb.0: ; %entry
2638 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
2639 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
2640 ; SI-NEXT: s_mov_b32 s7, 0xf000
2641 ; SI-NEXT: s_mov_b32 s6, -1
2642 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2643 ; SI-NEXT: v_mov_b32_e32 v0, s0
2644 ; SI-NEXT: v_mov_b32_e32 v1, s1
2645 ; SI-NEXT: v_mov_b32_e32 v2, s2
2646 ; SI-NEXT: v_mov_b32_e32 v3, s3
2647 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2650 ; VI-LABEL: v8i16_arg:
2651 ; VI: ; %bb.0: ; %entry
2652 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2653 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
2654 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2655 ; VI-NEXT: v_mov_b32_e32 v4, s6
2656 ; VI-NEXT: v_mov_b32_e32 v0, s0
2657 ; VI-NEXT: v_mov_b32_e32 v5, s7
2658 ; VI-NEXT: v_mov_b32_e32 v1, s1
2659 ; VI-NEXT: v_mov_b32_e32 v2, s2
2660 ; VI-NEXT: v_mov_b32_e32 v3, s3
2661 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2664 ; GFX9-LABEL: v8i16_arg:
2665 ; GFX9: ; %bb.0: ; %entry
2666 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
2667 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2668 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2669 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2670 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2671 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2672 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2673 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2674 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
2675 ; GFX9-NEXT: s_endpgm
2677 ; EG-LABEL: v8i16_arg:
2678 ; EG: ; %bb.0: ; %entry
2679 ; EG-NEXT: ALU 1, @36, KC0[], KC1[]
2680 ; EG-NEXT: TEX 0 @20
2681 ; EG-NEXT: ALU 5, @38, KC0[], KC1[]
2682 ; EG-NEXT: TEX 0 @22
2683 ; EG-NEXT: ALU 5, @44, KC0[], KC1[]
2684 ; EG-NEXT: TEX 0 @24
2685 ; EG-NEXT: ALU 5, @50, KC0[], KC1[]
2686 ; EG-NEXT: TEX 0 @26
2687 ; EG-NEXT: ALU 5, @56, KC0[], KC1[]
2688 ; EG-NEXT: TEX 0 @28
2689 ; EG-NEXT: ALU 5, @62, KC0[], KC1[]
2690 ; EG-NEXT: TEX 0 @30
2691 ; EG-NEXT: ALU 5, @68, KC0[], KC1[]
2692 ; EG-NEXT: TEX 0 @32
2693 ; EG-NEXT: ALU 5, @74, KC0[], KC1[]
2694 ; EG-NEXT: TEX 0 @34
2695 ; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
2696 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2699 ; EG-NEXT: Fetch clause starting at 20:
2700 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2701 ; EG-NEXT: Fetch clause starting at 22:
2702 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
2703 ; EG-NEXT: Fetch clause starting at 24:
2704 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2705 ; EG-NEXT: Fetch clause starting at 26:
2706 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
2707 ; EG-NEXT: Fetch clause starting at 28:
2708 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
2709 ; EG-NEXT: Fetch clause starting at 30:
2710 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
2711 ; EG-NEXT: Fetch clause starting at 32:
2712 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
2713 ; EG-NEXT: Fetch clause starting at 34:
2714 ; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
2715 ; EG-NEXT: ALU clause starting at 36:
2716 ; EG-NEXT: MOV * T0.Y, T3.X,
2717 ; EG-NEXT: MOV * T7.X, 0.0,
2718 ; EG-NEXT: ALU clause starting at 38:
2719 ; EG-NEXT: LSHL T0.W, T8.X, literal.x,
2720 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2721 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
2722 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
2723 ; EG-NEXT: MOV T3.X, PV.W,
2724 ; EG-NEXT: MOV * T0.Y, T5.X,
2725 ; EG-NEXT: ALU clause starting at 44:
2726 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2727 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
2728 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2729 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2730 ; EG-NEXT: MOV T5.X, PV.W,
2731 ; EG-NEXT: MOV * T0.Y, T3.X,
2732 ; EG-NEXT: ALU clause starting at 50:
2733 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2734 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
2735 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
2736 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2737 ; EG-NEXT: MOV T3.X, PV.W,
2738 ; EG-NEXT: MOV * T0.Y, T5.X,
2739 ; EG-NEXT: ALU clause starting at 56:
2740 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2741 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
2742 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
2743 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2744 ; EG-NEXT: MOV T5.X, PV.W,
2745 ; EG-NEXT: MOV * T0.Y, T2.X,
2746 ; EG-NEXT: ALU clause starting at 62:
2747 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2748 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
2749 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2750 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2751 ; EG-NEXT: MOV T2.X, PV.W,
2752 ; EG-NEXT: MOV * T0.Y, T4.X,
2753 ; EG-NEXT: ALU clause starting at 68:
2754 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2755 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
2756 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2757 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2758 ; EG-NEXT: MOV T4.X, PV.W,
2759 ; EG-NEXT: MOV * T0.Y, T2.X,
2760 ; EG-NEXT: ALU clause starting at 74:
2761 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2762 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
2763 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
2764 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
2765 ; EG-NEXT: MOV T2.X, PV.Z,
2766 ; EG-NEXT: MOV * T0.Y, T4.X,
2767 ; EG-NEXT: ALU clause starting at 80:
2768 ; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
2769 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
2770 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.z,
2771 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
2772 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2773 ; EG-NEXT: OR_INT * T7.X, PV.W, PS,
2774 ; EG-NEXT: MOV T4.X, PV.X,
2775 ; EG-NEXT: MOV * T7.W, T3.X,
2776 ; EG-NEXT: MOV * T7.Y, T5.X,
2778 ; CM-LABEL: v8i16_arg:
2779 ; CM: ; %bb.0: ; %entry
2780 ; CM-NEXT: ALU 1, @36, KC0[], KC1[]
2781 ; CM-NEXT: TEX 0 @20
2782 ; CM-NEXT: ALU 5, @38, KC0[], KC1[]
2783 ; CM-NEXT: TEX 0 @22
2784 ; CM-NEXT: ALU 5, @44, KC0[], KC1[]
2785 ; CM-NEXT: TEX 0 @24
2786 ; CM-NEXT: ALU 5, @50, KC0[], KC1[]
2787 ; CM-NEXT: TEX 0 @26
2788 ; CM-NEXT: ALU 5, @56, KC0[], KC1[]
2789 ; CM-NEXT: TEX 0 @28
2790 ; CM-NEXT: ALU 5, @62, KC0[], KC1[]
2791 ; CM-NEXT: TEX 0 @30
2792 ; CM-NEXT: ALU 5, @68, KC0[], KC1[]
2793 ; CM-NEXT: TEX 0 @32
2794 ; CM-NEXT: ALU 5, @74, KC0[], KC1[]
2795 ; CM-NEXT: TEX 0 @34
2796 ; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
2797 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
2800 ; CM-NEXT: Fetch clause starting at 20:
2801 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2802 ; CM-NEXT: Fetch clause starting at 22:
2803 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
2804 ; CM-NEXT: Fetch clause starting at 24:
2805 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2806 ; CM-NEXT: Fetch clause starting at 26:
2807 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
2808 ; CM-NEXT: Fetch clause starting at 28:
2809 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
2810 ; CM-NEXT: Fetch clause starting at 30:
2811 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
2812 ; CM-NEXT: Fetch clause starting at 32:
2813 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
2814 ; CM-NEXT: Fetch clause starting at 34:
2815 ; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
2816 ; CM-NEXT: ALU clause starting at 36:
2817 ; CM-NEXT: MOV * T0.Y, T3.X,
2818 ; CM-NEXT: MOV * T7.X, 0.0,
2819 ; CM-NEXT: ALU clause starting at 38:
2820 ; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
2821 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
2822 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
2823 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
2824 ; CM-NEXT: MOV T3.X, PV.W,
2825 ; CM-NEXT: MOV * T0.Y, T5.X,
2826 ; CM-NEXT: ALU clause starting at 44:
2827 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2828 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
2829 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2830 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2831 ; CM-NEXT: MOV T5.X, PV.W,
2832 ; CM-NEXT: MOV * T0.Y, T3.X,
2833 ; CM-NEXT: ALU clause starting at 50:
2834 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2835 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
2836 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
2837 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2838 ; CM-NEXT: MOV T3.X, PV.W,
2839 ; CM-NEXT: MOV * T0.Y, T5.X,
2840 ; CM-NEXT: ALU clause starting at 56:
2841 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2842 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
2843 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
2844 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2845 ; CM-NEXT: MOV T5.X, PV.W,
2846 ; CM-NEXT: MOV * T0.Y, T2.X,
2847 ; CM-NEXT: ALU clause starting at 62:
2848 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2849 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
2850 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2851 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2852 ; CM-NEXT: MOV T2.X, PV.W,
2853 ; CM-NEXT: MOV * T0.Y, T4.X,
2854 ; CM-NEXT: ALU clause starting at 68:
2855 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2856 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
2857 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2858 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2859 ; CM-NEXT: MOV T4.X, PV.W,
2860 ; CM-NEXT: MOV * T0.Y, T2.X,
2861 ; CM-NEXT: ALU clause starting at 74:
2862 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2863 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
2864 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
2865 ; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
2866 ; CM-NEXT: MOV T2.X, PV.Z,
2867 ; CM-NEXT: MOV * T0.Y, T4.X,
2868 ; CM-NEXT: ALU clause starting at 80:
2869 ; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
2870 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
2871 ; CM-NEXT: AND_INT * T0.W, T7.X, literal.z,
2872 ; CM-NEXT: 2(2.802597e-45), -65536(nan)
2873 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2874 ; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
2875 ; CM-NEXT: MOV T4.X, PV.X,
2876 ; CM-NEXT: MOV * T7.W, T3.X,
2877 ; CM-NEXT: MOV * T7.Y, T5.X,
2879 store <8 x i16> %in, ptr addrspace(1) %out
; Test: an <8 x i32> kernel argument is stored to the global pointer %out
; (the store is only 4-byte aligned, see "align 4" on the store below).
; Per the expectations below, the amdgcn runs load the whole vector with one
; s_load_dwordx8 and write it out as two dwordx4 stores, the upper half at
; byte offset 16 first. The SI and GFX9 runs encode that offset in the store
; itself (offset:16), while the VI run computes the address with
; s_add_u32 / s_addc_u32 before its flat store. The r600 runs (the EG and CM
; prefixes) copy the eight elements out of KC0 constant registers with MOVs,
; form the second address with ADD_INT 16, and issue two stores.
2883 define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
2884 ; SI-LABEL: v8i32_arg:
2885 ; SI: ; %bb.0: ; %entry
2886 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
2887 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2888 ; SI-NEXT: s_mov_b32 s3, 0xf000
2889 ; SI-NEXT: s_mov_b32 s2, -1
2890 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2891 ; SI-NEXT: v_mov_b32_e32 v0, s12
2892 ; SI-NEXT: v_mov_b32_e32 v1, s13
2893 ; SI-NEXT: v_mov_b32_e32 v2, s14
2894 ; SI-NEXT: v_mov_b32_e32 v3, s15
2895 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2896 ; SI-NEXT: s_waitcnt expcnt(0)
2897 ; SI-NEXT: v_mov_b32_e32 v0, s8
2898 ; SI-NEXT: v_mov_b32_e32 v1, s9
2899 ; SI-NEXT: v_mov_b32_e32 v2, s10
2900 ; SI-NEXT: v_mov_b32_e32 v3, s11
2901 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2904 ; VI-LABEL: v8i32_arg:
2905 ; VI: ; %bb.0: ; %entry
2906 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
2907 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2908 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2909 ; VI-NEXT: v_mov_b32_e32 v0, s12
2910 ; VI-NEXT: s_add_u32 s2, s0, 16
2911 ; VI-NEXT: s_addc_u32 s3, s1, 0
2912 ; VI-NEXT: v_mov_b32_e32 v5, s3
2913 ; VI-NEXT: v_mov_b32_e32 v1, s13
2914 ; VI-NEXT: v_mov_b32_e32 v2, s14
2915 ; VI-NEXT: v_mov_b32_e32 v3, s15
2916 ; VI-NEXT: v_mov_b32_e32 v4, s2
2917 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2918 ; VI-NEXT: v_mov_b32_e32 v5, s1
2919 ; VI-NEXT: v_mov_b32_e32 v0, s8
2920 ; VI-NEXT: v_mov_b32_e32 v1, s9
2921 ; VI-NEXT: v_mov_b32_e32 v2, s10
2922 ; VI-NEXT: v_mov_b32_e32 v3, s11
2923 ; VI-NEXT: v_mov_b32_e32 v4, s0
2924 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2927 ; GFX9-LABEL: v8i32_arg:
2928 ; GFX9: ; %bb.0: ; %entry
2929 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
2930 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2931 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
2932 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2933 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2934 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2935 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
2936 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
2937 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
2938 ; GFX9-NEXT: s_nop 0
2939 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2940 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2941 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2942 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2943 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
2944 ; GFX9-NEXT: s_endpgm
2946 ; EG-LABEL: v8i32_arg:
2947 ; EG: ; %bb.0: ; %entry
2948 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
2949 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2950 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2952 ; EG-NEXT: ALU clause starting at 4:
2953 ; EG-NEXT: MOV * T0.W, KC0[5].X,
2954 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
2955 ; EG-NEXT: MOV T0.Y, KC0[4].Z,
2956 ; EG-NEXT: MOV * T1.W, KC0[6].X,
2957 ; EG-NEXT: MOV T0.X, KC0[4].Y,
2958 ; EG-NEXT: MOV * T1.Z, KC0[5].W,
2959 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2960 ; EG-NEXT: MOV * T1.Y, KC0[5].Z,
2961 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2962 ; EG-NEXT: MOV T1.X, KC0[5].Y,
2963 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2964 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2965 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
2966 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2968 ; CM-LABEL: v8i32_arg:
2969 ; CM: ; %bb.0: ; %entry
2970 ; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
2971 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2972 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
2974 ; CM-NEXT: ALU clause starting at 4:
2975 ; CM-NEXT: MOV * T0.W, KC0[6].X,
2976 ; CM-NEXT: MOV * T0.Z, KC0[5].W,
2977 ; CM-NEXT: MOV * T0.Y, KC0[5].Z,
2978 ; CM-NEXT: MOV T0.X, KC0[5].Y,
2979 ; CM-NEXT: MOV * T1.W, KC0[5].X,
2980 ; CM-NEXT: MOV T1.Z, KC0[4].W,
2981 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2982 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2983 ; CM-NEXT: LSHR T2.X, PV.W, literal.x,
2984 ; CM-NEXT: MOV * T1.Y, KC0[4].Z,
2985 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2986 ; CM-NEXT: MOV * T1.X, KC0[4].Y,
2987 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2988 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2990 store <8 x i32> %in, ptr addrspace(1) %out, align 4
; Test: an <8 x float> kernel argument is stored to the global pointer %out
; (the store is only 4-byte aligned, see "align 4" on the store below).
; The expected instruction sequences mirror those of @v8i32_arg: the amdgcn
; runs use one s_load_dwordx8 followed by two dwordx4 stores (upper half at
; byte offset 16 first), and the r600 runs (the EG and CM prefixes) move the
; eight elements out of KC0 constant registers and issue two stores.
2994 define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind {
2995 ; SI-LABEL: v8f32_arg:
2996 ; SI: ; %bb.0: ; %entry
2997 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
2998 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2999 ; SI-NEXT: s_mov_b32 s3, 0xf000
3000 ; SI-NEXT: s_mov_b32 s2, -1
3001 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3002 ; SI-NEXT: v_mov_b32_e32 v0, s12
3003 ; SI-NEXT: v_mov_b32_e32 v1, s13
3004 ; SI-NEXT: v_mov_b32_e32 v2, s14
3005 ; SI-NEXT: v_mov_b32_e32 v3, s15
3006 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3007 ; SI-NEXT: s_waitcnt expcnt(0)
3008 ; SI-NEXT: v_mov_b32_e32 v0, s8
3009 ; SI-NEXT: v_mov_b32_e32 v1, s9
3010 ; SI-NEXT: v_mov_b32_e32 v2, s10
3011 ; SI-NEXT: v_mov_b32_e32 v3, s11
3012 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3015 ; VI-LABEL: v8f32_arg:
3016 ; VI: ; %bb.0: ; %entry
3017 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
3018 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3019 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3020 ; VI-NEXT: v_mov_b32_e32 v0, s12
3021 ; VI-NEXT: s_add_u32 s2, s0, 16
3022 ; VI-NEXT: s_addc_u32 s3, s1, 0
3023 ; VI-NEXT: v_mov_b32_e32 v5, s3
3024 ; VI-NEXT: v_mov_b32_e32 v1, s13
3025 ; VI-NEXT: v_mov_b32_e32 v2, s14
3026 ; VI-NEXT: v_mov_b32_e32 v3, s15
3027 ; VI-NEXT: v_mov_b32_e32 v4, s2
3028 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3029 ; VI-NEXT: v_mov_b32_e32 v5, s1
3030 ; VI-NEXT: v_mov_b32_e32 v0, s8
3031 ; VI-NEXT: v_mov_b32_e32 v1, s9
3032 ; VI-NEXT: v_mov_b32_e32 v2, s10
3033 ; VI-NEXT: v_mov_b32_e32 v3, s11
3034 ; VI-NEXT: v_mov_b32_e32 v4, s0
3035 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3038 ; GFX9-LABEL: v8f32_arg:
3039 ; GFX9: ; %bb.0: ; %entry
3040 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
3041 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3042 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
3043 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3044 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3045 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3046 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
3047 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
3048 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
3049 ; GFX9-NEXT: s_nop 0
3050 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
3051 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
3052 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
3053 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3054 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
3055 ; GFX9-NEXT: s_endpgm
3057 ; EG-LABEL: v8f32_arg:
3058 ; EG: ; %bb.0: ; %entry
3059 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
3060 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3061 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3063 ; EG-NEXT: ALU clause starting at 4:
3064 ; EG-NEXT: MOV * T0.W, KC0[5].X,
3065 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
3066 ; EG-NEXT: MOV T0.Y, KC0[4].Z,
3067 ; EG-NEXT: MOV * T1.W, KC0[6].X,
3068 ; EG-NEXT: MOV T0.X, KC0[4].Y,
3069 ; EG-NEXT: MOV * T1.Z, KC0[5].W,
3070 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
3071 ; EG-NEXT: MOV * T1.Y, KC0[5].Z,
3072 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3073 ; EG-NEXT: MOV T1.X, KC0[5].Y,
3074 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
3075 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3076 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
3077 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3079 ; CM-LABEL: v8f32_arg:
3080 ; CM: ; %bb.0: ; %entry
3081 ; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
3082 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3083 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3085 ; CM-NEXT: ALU clause starting at 4:
3086 ; CM-NEXT: MOV * T0.W, KC0[6].X,
3087 ; CM-NEXT: MOV * T0.Z, KC0[5].W,
3088 ; CM-NEXT: MOV * T0.Y, KC0[5].Z,
3089 ; CM-NEXT: MOV T0.X, KC0[5].Y,
3090 ; CM-NEXT: MOV * T1.W, KC0[5].X,
3091 ; CM-NEXT: MOV T1.Z, KC0[4].W,
3092 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
3093 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3094 ; CM-NEXT: LSHR T2.X, PV.W, literal.x,
3095 ; CM-NEXT: MOV * T1.Y, KC0[4].Z,
3096 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3097 ; CM-NEXT: MOV * T1.X, KC0[4].Y,
3098 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
3099 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3101 store <8 x float> %in, ptr addrspace(1) %out, align 4
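3105 ; FIXME: Pack/repack on VI (NOTE(review): the VI expectations for @v16i8_arg below show only a dwordx4 load and store with no repacking, so this note may be stale; confirm before removing)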
3106 define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
3107 ; SI-LABEL: v16i8_arg:
3108 ; SI: ; %bb.0: ; %entry
3109 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
3110 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
3111 ; SI-NEXT: s_mov_b32 s7, 0xf000
3112 ; SI-NEXT: s_mov_b32 s6, -1
3113 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3114 ; SI-NEXT: v_mov_b32_e32 v0, s0
3115 ; SI-NEXT: v_mov_b32_e32 v1, s1
3116 ; SI-NEXT: v_mov_b32_e32 v2, s2
3117 ; SI-NEXT: v_mov_b32_e32 v3, s3
3118 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
3121 ; VI-LABEL: v16i8_arg:
3122 ; VI: ; %bb.0: ; %entry
3123 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
3124 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
3125 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3126 ; VI-NEXT: v_mov_b32_e32 v4, s6
3127 ; VI-NEXT: v_mov_b32_e32 v0, s0
3128 ; VI-NEXT: v_mov_b32_e32 v5, s7
3129 ; VI-NEXT: v_mov_b32_e32 v1, s1
3130 ; VI-NEXT: v_mov_b32_e32 v2, s2
3131 ; VI-NEXT: v_mov_b32_e32 v3, s3
3132 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3135 ; GFX9-LABEL: v16i8_arg:
3136 ; GFX9: ; %bb.0: ; %entry
3137 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
3138 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3139 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3140 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3141 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
3142 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
3143 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
3144 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3145 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
3146 ; GFX9-NEXT: s_endpgm
3148 ; EG-LABEL: v16i8_arg:
3149 ; EG: ; %bb.0: ; %entry
3150 ; EG-NEXT: ALU 1, @68, KC0[], KC1[]
3151 ; EG-NEXT: TEX 0 @36
3152 ; EG-NEXT: ALU 5, @70, KC0[], KC1[]
3153 ; EG-NEXT: TEX 0 @38
3154 ; EG-NEXT: ALU 5, @76, KC0[], KC1[]
3155 ; EG-NEXT: TEX 0 @40
3156 ; EG-NEXT: ALU 5, @82, KC0[], KC1[]
3157 ; EG-NEXT: TEX 0 @42
3158 ; EG-NEXT: ALU 5, @88, KC0[], KC1[]
3159 ; EG-NEXT: TEX 0 @44
3160 ; EG-NEXT: ALU 7, @94, KC0[], KC1[]
3161 ; EG-NEXT: TEX 0 @46
3162 ; EG-NEXT: ALU 7, @102, KC0[], KC1[]
3163 ; EG-NEXT: TEX 0 @48
3164 ; EG-NEXT: ALU 7, @110, KC0[], KC1[]
3165 ; EG-NEXT: TEX 0 @50
3166 ; EG-NEXT: ALU 7, @118, KC0[], KC1[]
3167 ; EG-NEXT: TEX 0 @52
3168 ; EG-NEXT: ALU 7, @126, KC0[], KC1[]
3169 ; EG-NEXT: TEX 0 @54
3170 ; EG-NEXT: ALU 7, @134, KC0[], KC1[]
3171 ; EG-NEXT: TEX 0 @56
3172 ; EG-NEXT: ALU 7, @142, KC0[], KC1[]
3173 ; EG-NEXT: TEX 0 @58
3174 ; EG-NEXT: ALU 7, @150, KC0[], KC1[]
3175 ; EG-NEXT: TEX 0 @60
3176 ; EG-NEXT: ALU 5, @158, KC0[], KC1[]
3177 ; EG-NEXT: TEX 0 @62
3178 ; EG-NEXT: ALU 5, @164, KC0[], KC1[]
3179 ; EG-NEXT: TEX 0 @64
3180 ; EG-NEXT: ALU 5, @170, KC0[], KC1[]
3181 ; EG-NEXT: TEX 0 @66
3182 ; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
3183 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
3186 ; EG-NEXT: Fetch clause starting at 36:
3187 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
3188 ; EG-NEXT: Fetch clause starting at 38:
3189 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
3190 ; EG-NEXT: Fetch clause starting at 40:
3191 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
3192 ; EG-NEXT: Fetch clause starting at 42:
3193 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
3194 ; EG-NEXT: Fetch clause starting at 44:
3195 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
3196 ; EG-NEXT: Fetch clause starting at 46:
3197 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
3198 ; EG-NEXT: Fetch clause starting at 48:
3199 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
3200 ; EG-NEXT: Fetch clause starting at 50:
3201 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
3202 ; EG-NEXT: Fetch clause starting at 52:
3203 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
3204 ; EG-NEXT: Fetch clause starting at 54:
3205 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
3206 ; EG-NEXT: Fetch clause starting at 56:
3207 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
3208 ; EG-NEXT: Fetch clause starting at 58:
3209 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
3210 ; EG-NEXT: Fetch clause starting at 60:
3211 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
3212 ; EG-NEXT: Fetch clause starting at 62:
3213 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
3214 ; EG-NEXT: Fetch clause starting at 64:
3215 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
3216 ; EG-NEXT: Fetch clause starting at 66:
3217 ; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
3218 ; EG-NEXT: ALU clause starting at 68:
3219 ; EG-NEXT: MOV * T0.Y, T2.X,
3220 ; EG-NEXT: MOV * T7.X, 0.0,
3221 ; EG-NEXT: ALU clause starting at 70:
3222 ; EG-NEXT: LSHL T0.W, T8.X, literal.x,
3223 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3224 ; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
3225 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
3226 ; EG-NEXT: MOV T2.X, PV.W,
3227 ; EG-NEXT: MOV * T0.Y, T3.X,
3228 ; EG-NEXT: ALU clause starting at 76:
3229 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3230 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
3231 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3232 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3233 ; EG-NEXT: MOV T3.X, PV.W,
3234 ; EG-NEXT: MOV * T0.Y, T4.X,
3235 ; EG-NEXT: ALU clause starting at 82:
3236 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3237 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
3238 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3239 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3240 ; EG-NEXT: MOV T4.X, PV.W,
3241 ; EG-NEXT: MOV * T0.Y, T5.X,
3242 ; EG-NEXT: ALU clause starting at 88:
3243 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3244 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
3245 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3246 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3247 ; EG-NEXT: MOV T5.X, PV.W,
3248 ; EG-NEXT: MOV * T0.Y, T2.X,
3249 ; EG-NEXT: ALU clause starting at 94:
3250 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3251 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3252 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3253 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3254 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3255 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3256 ; EG-NEXT: MOV T2.X, PV.W,
3257 ; EG-NEXT: MOV * T0.Y, T3.X,
3258 ; EG-NEXT: ALU clause starting at 102:
3259 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3260 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3261 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3262 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3263 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3264 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3265 ; EG-NEXT: MOV T3.X, PV.W,
3266 ; EG-NEXT: MOV * T0.Y, T4.X,
3267 ; EG-NEXT: ALU clause starting at 110:
3268 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3269 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3270 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3271 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3272 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3273 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3274 ; EG-NEXT: MOV T4.X, PV.W,
3275 ; EG-NEXT: MOV * T0.Y, T5.X,
3276 ; EG-NEXT: ALU clause starting at 118:
3277 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3278 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3279 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3280 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3281 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3282 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3283 ; EG-NEXT: MOV T5.X, PV.W,
3284 ; EG-NEXT: MOV * T0.Y, T2.X,
3285 ; EG-NEXT: ALU clause starting at 126:
3286 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3287 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3288 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3289 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3290 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3291 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3292 ; EG-NEXT: MOV T2.X, PV.W,
3293 ; EG-NEXT: MOV * T0.Y, T3.X,
3294 ; EG-NEXT: ALU clause starting at 134:
3295 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3296 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3297 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3298 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3299 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3300 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3301 ; EG-NEXT: MOV T3.X, PV.W,
3302 ; EG-NEXT: MOV * T0.Y, T4.X,
3303 ; EG-NEXT: ALU clause starting at 142:
3304 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3305 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3306 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3307 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3308 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3309 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3310 ; EG-NEXT: MOV T4.X, PV.W,
3311 ; EG-NEXT: MOV * T0.Y, T5.X,
3312 ; EG-NEXT: ALU clause starting at 150:
3313 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3314 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3315 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3316 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3317 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3318 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3319 ; EG-NEXT: MOV T5.X, PV.W,
3320 ; EG-NEXT: MOV * T0.Y, T2.X,
3321 ; EG-NEXT: ALU clause starting at 158:
3322 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3323 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
3324 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3325 ; EG-NEXT: OR_INT * T7.W, PV.W, PS,
3326 ; EG-NEXT: MOV T2.X, PV.W,
3327 ; EG-NEXT: MOV * T0.Y, T3.X,
3328 ; EG-NEXT: ALU clause starting at 164:
3329 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3330 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
3331 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3332 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
3333 ; EG-NEXT: MOV T3.X, PV.Z,
3334 ; EG-NEXT: MOV * T0.Y, T4.X,
3335 ; EG-NEXT: ALU clause starting at 170:
3336 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3337 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
3338 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3339 ; EG-NEXT: OR_INT * T7.Y, PV.W, PS,
3340 ; EG-NEXT: MOV T4.X, PV.Y,
3341 ; EG-NEXT: MOV * T0.Y, T5.X,
3342 ; EG-NEXT: ALU clause starting at 176:
3343 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3344 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.y,
3345 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3346 ; EG-NEXT: OR_INT T7.X, PV.W, PS,
3347 ; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
3348 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3350 ; CM-LABEL: v16i8_arg:
3351 ; CM: ; %bb.0: ; %entry
3352 ; CM-NEXT: ALU 1, @68, KC0[], KC1[]
3353 ; CM-NEXT: TEX 0 @36
3354 ; CM-NEXT: ALU 5, @70, KC0[], KC1[]
3355 ; CM-NEXT: TEX 0 @38
3356 ; CM-NEXT: ALU 5, @76, KC0[], KC1[]
3357 ; CM-NEXT: TEX 0 @40
3358 ; CM-NEXT: ALU 5, @82, KC0[], KC1[]
3359 ; CM-NEXT: TEX 0 @42
3360 ; CM-NEXT: ALU 5, @88, KC0[], KC1[]
3361 ; CM-NEXT: TEX 0 @44
3362 ; CM-NEXT: ALU 7, @94, KC0[], KC1[]
3363 ; CM-NEXT: TEX 0 @46
3364 ; CM-NEXT: ALU 7, @102, KC0[], KC1[]
3365 ; CM-NEXT: TEX 0 @48
3366 ; CM-NEXT: ALU 7, @110, KC0[], KC1[]
3367 ; CM-NEXT: TEX 0 @50
3368 ; CM-NEXT: ALU 7, @118, KC0[], KC1[]
3369 ; CM-NEXT: TEX 0 @52
3370 ; CM-NEXT: ALU 7, @126, KC0[], KC1[]
3371 ; CM-NEXT: TEX 0 @54
3372 ; CM-NEXT: ALU 7, @134, KC0[], KC1[]
3373 ; CM-NEXT: TEX 0 @56
3374 ; CM-NEXT: ALU 7, @142, KC0[], KC1[]
3375 ; CM-NEXT: TEX 0 @58
3376 ; CM-NEXT: ALU 7, @150, KC0[], KC1[]
3377 ; CM-NEXT: TEX 0 @60
3378 ; CM-NEXT: ALU 5, @158, KC0[], KC1[]
3379 ; CM-NEXT: TEX 0 @62
3380 ; CM-NEXT: ALU 5, @164, KC0[], KC1[]
3381 ; CM-NEXT: TEX 0 @64
3382 ; CM-NEXT: ALU 5, @170, KC0[], KC1[]
3383 ; CM-NEXT: TEX 0 @66
3384 ; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
3385 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
3388 ; CM-NEXT: Fetch clause starting at 36:
3389 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
3390 ; CM-NEXT: Fetch clause starting at 38:
3391 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
3392 ; CM-NEXT: Fetch clause starting at 40:
3393 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
3394 ; CM-NEXT: Fetch clause starting at 42:
3395 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
3396 ; CM-NEXT: Fetch clause starting at 44:
3397 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
3398 ; CM-NEXT: Fetch clause starting at 46:
3399 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
3400 ; CM-NEXT: Fetch clause starting at 48:
3401 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
3402 ; CM-NEXT: Fetch clause starting at 50:
3403 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
3404 ; CM-NEXT: Fetch clause starting at 52:
3405 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
3406 ; CM-NEXT: Fetch clause starting at 54:
3407 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
3408 ; CM-NEXT: Fetch clause starting at 56:
3409 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
3410 ; CM-NEXT: Fetch clause starting at 58:
3411 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
3412 ; CM-NEXT: Fetch clause starting at 60:
3413 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
3414 ; CM-NEXT: Fetch clause starting at 62:
3415 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
3416 ; CM-NEXT: Fetch clause starting at 64:
3417 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
3418 ; CM-NEXT: Fetch clause starting at 66:
3419 ; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
3420 ; CM-NEXT: ALU clause starting at 68:
3421 ; CM-NEXT: MOV * T0.Y, T2.X,
3422 ; CM-NEXT: MOV * T7.X, 0.0,
3423 ; CM-NEXT: ALU clause starting at 70:
3424 ; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
3425 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
3426 ; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
3427 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
3428 ; CM-NEXT: MOV T2.X, PV.W,
3429 ; CM-NEXT: MOV * T0.Y, T3.X,
3430 ; CM-NEXT: ALU clause starting at 76:
3431 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3432 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3433 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3434 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3435 ; CM-NEXT: MOV T3.X, PV.W,
3436 ; CM-NEXT: MOV * T0.Y, T4.X,
3437 ; CM-NEXT: ALU clause starting at 82:
3438 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3439 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3440 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3441 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3442 ; CM-NEXT: MOV T4.X, PV.W,
3443 ; CM-NEXT: MOV * T0.Y, T5.X,
3444 ; CM-NEXT: ALU clause starting at 88:
3445 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3446 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3447 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3448 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3449 ; CM-NEXT: MOV T5.X, PV.W,
3450 ; CM-NEXT: MOV * T0.Y, T2.X,
3451 ; CM-NEXT: ALU clause starting at 94:
3452 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3453 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3454 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3455 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3456 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3457 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3458 ; CM-NEXT: MOV T2.X, PV.W,
3459 ; CM-NEXT: MOV * T0.Y, T3.X,
3460 ; CM-NEXT: ALU clause starting at 102:
3461 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3462 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3463 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3464 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3465 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3466 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3467 ; CM-NEXT: MOV T3.X, PV.W,
3468 ; CM-NEXT: MOV * T0.Y, T4.X,
3469 ; CM-NEXT: ALU clause starting at 110:
3470 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3471 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3472 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3473 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3474 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3475 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3476 ; CM-NEXT: MOV T4.X, PV.W,
3477 ; CM-NEXT: MOV * T0.Y, T5.X,
3478 ; CM-NEXT: ALU clause starting at 118:
3479 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3480 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3481 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3482 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3483 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3484 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3485 ; CM-NEXT: MOV T5.X, PV.W,
3486 ; CM-NEXT: MOV * T0.Y, T2.X,
3487 ; CM-NEXT: ALU clause starting at 126:
3488 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3489 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3490 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3491 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3492 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3493 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3494 ; CM-NEXT: MOV T2.X, PV.W,
3495 ; CM-NEXT: MOV * T0.Y, T3.X,
3496 ; CM-NEXT: ALU clause starting at 134:
3497 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3498 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3499 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3500 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3501 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3502 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3503 ; CM-NEXT: MOV T3.X, PV.W,
3504 ; CM-NEXT: MOV * T0.Y, T4.X,
3505 ; CM-NEXT: ALU clause starting at 142:
3506 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3507 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3508 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3509 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3510 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3511 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3512 ; CM-NEXT: MOV T4.X, PV.W,
3513 ; CM-NEXT: MOV * T0.Y, T5.X,
3514 ; CM-NEXT: ALU clause starting at 150:
3515 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3516 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3517 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3518 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3519 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3520 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3521 ; CM-NEXT: MOV T5.X, PV.W,
3522 ; CM-NEXT: MOV * T0.Y, T2.X,
3523 ; CM-NEXT: ALU clause starting at 158:
3524 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3525 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3526 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3527 ; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W,
3528 ; CM-NEXT: MOV T2.X, PV.W,
3529 ; CM-NEXT: MOV * T0.Y, T3.X,
3530 ; CM-NEXT: ALU clause starting at 164:
3531 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3532 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3533 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3534 ; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
3535 ; CM-NEXT: MOV T3.X, PV.Z,
3536 ; CM-NEXT: MOV * T0.Y, T4.X,
3537 ; CM-NEXT: ALU clause starting at 170:
3538 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3539 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3540 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3541 ; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W,
3542 ; CM-NEXT: MOV T4.X, PV.Y,
3543 ; CM-NEXT: MOV * T0.Y, T5.X,
3544 ; CM-NEXT: ALU clause starting at 176:
3545 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3546 ; CM-NEXT: AND_INT * T0.W, T7.X, literal.y,
3547 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3548 ; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
3549 ; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
3550 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3552 store <16 x i8> %in, ptr addrspace(1) %out
; v16i16_arg stores the <16 x i16> kernel argument %in to %out.
; The GCN check lines expect one s_load_dwordx8 of the argument followed by two
; dwordx4 stores. The r600 check lines expect sixteen VTX_READ_16 fetches whose
; results are combined with AND_INT/LSHL/OR_INT sequences before two stores.
; Every check line below is autogenerated (see the note on the first line of
; this file); regenerate with utils/update_llc_test_checks.py, do not hand-edit.
3556 define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
3557 ; SI-LABEL: v16i16_arg:
3558 ; SI: ; %bb.0: ; %entry
3559 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
3560 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3561 ; SI-NEXT: s_mov_b32 s3, 0xf000
3562 ; SI-NEXT: s_mov_b32 s2, -1
3563 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3564 ; SI-NEXT: v_mov_b32_e32 v0, s12
3565 ; SI-NEXT: v_mov_b32_e32 v1, s13
3566 ; SI-NEXT: v_mov_b32_e32 v2, s14
3567 ; SI-NEXT: v_mov_b32_e32 v3, s15
3568 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3569 ; SI-NEXT: s_waitcnt expcnt(0)
3570 ; SI-NEXT: v_mov_b32_e32 v0, s8
3571 ; SI-NEXT: v_mov_b32_e32 v1, s9
3572 ; SI-NEXT: v_mov_b32_e32 v2, s10
3573 ; SI-NEXT: v_mov_b32_e32 v3, s11
3574 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3577 ; VI-LABEL: v16i16_arg:
3578 ; VI: ; %bb.0: ; %entry
3579 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
3580 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3581 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3582 ; VI-NEXT: v_mov_b32_e32 v0, s12
3583 ; VI-NEXT: s_add_u32 s2, s0, 16
3584 ; VI-NEXT: s_addc_u32 s3, s1, 0
3585 ; VI-NEXT: v_mov_b32_e32 v5, s3
3586 ; VI-NEXT: v_mov_b32_e32 v1, s13
3587 ; VI-NEXT: v_mov_b32_e32 v2, s14
3588 ; VI-NEXT: v_mov_b32_e32 v3, s15
3589 ; VI-NEXT: v_mov_b32_e32 v4, s2
3590 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3591 ; VI-NEXT: v_mov_b32_e32 v5, s1
3592 ; VI-NEXT: v_mov_b32_e32 v0, s8
3593 ; VI-NEXT: v_mov_b32_e32 v1, s9
3594 ; VI-NEXT: v_mov_b32_e32 v2, s10
3595 ; VI-NEXT: v_mov_b32_e32 v3, s11
3596 ; VI-NEXT: v_mov_b32_e32 v4, s0
3597 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3600 ; GFX9-LABEL: v16i16_arg:
3601 ; GFX9: ; %bb.0: ; %entry
3602 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
3603 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3604 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
3605 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3606 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3607 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3608 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
3609 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
3610 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
3611 ; GFX9-NEXT: s_nop 0
3612 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
3613 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
3614 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
3615 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3616 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
3617 ; GFX9-NEXT: s_endpgm
3619 ; EG-LABEL: v16i16_arg:
3620 ; EG: ; %bb.0: ; %entry
3621 ; EG-NEXT: ALU 1, @68, KC0[], KC1[]
3622 ; EG-NEXT: TEX 0 @36
3623 ; EG-NEXT: ALU 5, @70, KC0[], KC1[]
3624 ; EG-NEXT: TEX 0 @38
3625 ; EG-NEXT: ALU 5, @76, KC0[], KC1[]
3626 ; EG-NEXT: TEX 0 @40
3627 ; EG-NEXT: ALU 5, @82, KC0[], KC1[]
3628 ; EG-NEXT: TEX 0 @42
3629 ; EG-NEXT: ALU 5, @88, KC0[], KC1[]
3630 ; EG-NEXT: TEX 0 @44
3631 ; EG-NEXT: ALU 5, @94, KC0[], KC1[]
3632 ; EG-NEXT: TEX 0 @46
3633 ; EG-NEXT: ALU 5, @100, KC0[], KC1[]
3634 ; EG-NEXT: TEX 0 @48
3635 ; EG-NEXT: ALU 5, @106, KC0[], KC1[]
3636 ; EG-NEXT: TEX 0 @50
3637 ; EG-NEXT: ALU 5, @112, KC0[], KC1[]
3638 ; EG-NEXT: TEX 0 @52
3639 ; EG-NEXT: ALU 5, @118, KC0[], KC1[]
3640 ; EG-NEXT: TEX 0 @54
3641 ; EG-NEXT: ALU 5, @124, KC0[], KC1[]
3642 ; EG-NEXT: TEX 0 @56
3643 ; EG-NEXT: ALU 5, @130, KC0[], KC1[]
3644 ; EG-NEXT: TEX 0 @58
3645 ; EG-NEXT: ALU 5, @136, KC0[], KC1[]
3646 ; EG-NEXT: TEX 0 @60
3647 ; EG-NEXT: ALU 5, @142, KC0[], KC1[]
3648 ; EG-NEXT: TEX 0 @62
3649 ; EG-NEXT: ALU 5, @148, KC0[], KC1[]
3650 ; EG-NEXT: TEX 0 @64
3651 ; EG-NEXT: ALU 5, @154, KC0[], KC1[]
3652 ; EG-NEXT: TEX 0 @66
3653 ; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[]
3654 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
3655 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
3657 ; EG-NEXT: Fetch clause starting at 36:
3658 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
3659 ; EG-NEXT: Fetch clause starting at 38:
3660 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
3661 ; EG-NEXT: Fetch clause starting at 40:
3662 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
3663 ; EG-NEXT: Fetch clause starting at 42:
3664 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
3665 ; EG-NEXT: Fetch clause starting at 44:
3666 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
3667 ; EG-NEXT: Fetch clause starting at 46:
3668 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
3669 ; EG-NEXT: Fetch clause starting at 48:
3670 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
3671 ; EG-NEXT: Fetch clause starting at 50:
3672 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
3673 ; EG-NEXT: Fetch clause starting at 52:
3674 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
3675 ; EG-NEXT: Fetch clause starting at 54:
3676 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
3677 ; EG-NEXT: Fetch clause starting at 56:
3678 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
3679 ; EG-NEXT: Fetch clause starting at 58:
3680 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
3681 ; EG-NEXT: Fetch clause starting at 60:
3682 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
3683 ; EG-NEXT: Fetch clause starting at 62:
3684 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
3685 ; EG-NEXT: Fetch clause starting at 64:
3686 ; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
3687 ; EG-NEXT: Fetch clause starting at 66:
3688 ; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
3689 ; EG-NEXT: ALU clause starting at 68:
3690 ; EG-NEXT: MOV * T0.Y, T3.X,
3691 ; EG-NEXT: MOV * T11.X, 0.0,
3692 ; EG-NEXT: ALU clause starting at 70:
3693 ; EG-NEXT: LSHL T0.W, T12.X, literal.x,
3694 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3695 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
3696 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
3697 ; EG-NEXT: MOV T3.X, PV.W,
3698 ; EG-NEXT: MOV * T0.Y, T5.X,
3699 ; EG-NEXT: ALU clause starting at 76:
3700 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3701 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3702 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3703 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3704 ; EG-NEXT: MOV T5.X, PV.W,
3705 ; EG-NEXT: MOV * T0.Y, T7.X,
3706 ; EG-NEXT: ALU clause starting at 82:
3707 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3708 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3709 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3710 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3711 ; EG-NEXT: MOV T7.X, PV.W,
3712 ; EG-NEXT: MOV * T0.Y, T9.X,
3713 ; EG-NEXT: ALU clause starting at 88:
3714 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3715 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3716 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3717 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3718 ; EG-NEXT: MOV T9.X, PV.W,
3719 ; EG-NEXT: MOV * T0.Y, T3.X,
3720 ; EG-NEXT: ALU clause starting at 94:
3721 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3722 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3723 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3724 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3725 ; EG-NEXT: MOV T3.X, PV.W,
3726 ; EG-NEXT: MOV * T0.Y, T5.X,
3727 ; EG-NEXT: ALU clause starting at 100:
3728 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3729 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3730 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3731 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3732 ; EG-NEXT: MOV T5.X, PV.W,
3733 ; EG-NEXT: MOV * T0.Y, T7.X,
3734 ; EG-NEXT: ALU clause starting at 106:
3735 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3736 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3737 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3738 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3739 ; EG-NEXT: MOV T7.X, PV.W,
3740 ; EG-NEXT: MOV * T0.Y, T9.X,
3741 ; EG-NEXT: ALU clause starting at 112:
3742 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3743 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3744 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3745 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3746 ; EG-NEXT: MOV T9.X, PV.W,
3747 ; EG-NEXT: MOV * T0.Y, T2.X,
3748 ; EG-NEXT: ALU clause starting at 118:
3749 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3750 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3751 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3752 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3753 ; EG-NEXT: MOV T2.X, PV.W,
3754 ; EG-NEXT: MOV * T0.Y, T4.X,
3755 ; EG-NEXT: ALU clause starting at 124:
3756 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3757 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3758 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3759 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3760 ; EG-NEXT: MOV T4.X, PV.W,
3761 ; EG-NEXT: MOV * T0.Y, T6.X,
3762 ; EG-NEXT: ALU clause starting at 130:
3763 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3764 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3765 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3766 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3767 ; EG-NEXT: MOV T6.X, PV.W,
3768 ; EG-NEXT: MOV * T0.Y, T8.X,
3769 ; EG-NEXT: ALU clause starting at 136:
3770 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3771 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3772 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3773 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3774 ; EG-NEXT: MOV T8.X, PV.W,
3775 ; EG-NEXT: MOV * T0.Y, T2.X,
3776 ; EG-NEXT: ALU clause starting at 142:
3777 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3778 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3779 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3780 ; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
3781 ; EG-NEXT: MOV T2.X, PV.Z,
3782 ; EG-NEXT: MOV * T0.Y, T4.X,
3783 ; EG-NEXT: ALU clause starting at 148:
3784 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3785 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3786 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3787 ; EG-NEXT: OR_INT * T12.X, PV.W, PS,
3788 ; EG-NEXT: MOV T4.X, PV.X,
3789 ; EG-NEXT: MOV * T0.Y, T6.X,
3790 ; EG-NEXT: ALU clause starting at 154:
3791 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3792 ; EG-NEXT: AND_INT * T1.W, T13.X, literal.y,
3793 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3794 ; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
3795 ; EG-NEXT: MOV T6.X, PV.Z,
3796 ; EG-NEXT: MOV * T0.Y, T8.X,
3797 ; EG-NEXT: ALU clause starting at 160:
3798 ; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
3799 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3800 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
3801 ; EG-NEXT: LSHR T14.X, PV.W, literal.x,
3802 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
3803 ; EG-NEXT: AND_INT * T1.W, T11.X, literal.z,
3804 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
3805 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3806 ; EG-NEXT: OR_INT * T11.X, PV.W, PS,
3807 ; EG-NEXT: MOV T8.X, PV.X,
3808 ; EG-NEXT: MOV * T12.W, T3.X,
3809 ; EG-NEXT: MOV T12.Y, T5.X,
3810 ; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212
3811 ; EG-NEXT: MOV * T11.Y, T9.X,
3813 ; CM-LABEL: v16i16_arg:
3814 ; CM: ; %bb.0: ; %entry
3815 ; CM-NEXT: ALU 1, @68, KC0[], KC1[]
3816 ; CM-NEXT: TEX 0 @36
3817 ; CM-NEXT: ALU 5, @70, KC0[], KC1[]
3818 ; CM-NEXT: TEX 0 @38
3819 ; CM-NEXT: ALU 5, @76, KC0[], KC1[]
3820 ; CM-NEXT: TEX 0 @40
3821 ; CM-NEXT: ALU 5, @82, KC0[], KC1[]
3822 ; CM-NEXT: TEX 0 @42
3823 ; CM-NEXT: ALU 5, @88, KC0[], KC1[]
3824 ; CM-NEXT: TEX 0 @44
3825 ; CM-NEXT: ALU 5, @94, KC0[], KC1[]
3826 ; CM-NEXT: TEX 0 @46
3827 ; CM-NEXT: ALU 5, @100, KC0[], KC1[]
3828 ; CM-NEXT: TEX 0 @48
3829 ; CM-NEXT: ALU 5, @106, KC0[], KC1[]
3830 ; CM-NEXT: TEX 0 @50
3831 ; CM-NEXT: ALU 5, @112, KC0[], KC1[]
3832 ; CM-NEXT: TEX 0 @52
3833 ; CM-NEXT: ALU 5, @118, KC0[], KC1[]
3834 ; CM-NEXT: TEX 0 @54
3835 ; CM-NEXT: ALU 5, @124, KC0[], KC1[]
3836 ; CM-NEXT: TEX 0 @56
3837 ; CM-NEXT: ALU 5, @130, KC0[], KC1[]
3838 ; CM-NEXT: TEX 0 @58
3839 ; CM-NEXT: ALU 5, @136, KC0[], KC1[]
3840 ; CM-NEXT: TEX 0 @60
3841 ; CM-NEXT: ALU 5, @142, KC0[], KC1[]
3842 ; CM-NEXT: TEX 0 @62
3843 ; CM-NEXT: ALU 5, @148, KC0[], KC1[]
3844 ; CM-NEXT: TEX 0 @64
3845 ; CM-NEXT: ALU 5, @154, KC0[], KC1[]
3846 ; CM-NEXT: TEX 0 @66
3847 ; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[]
3848 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
3849 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
3851 ; CM-NEXT: Fetch clause starting at 36:
3852 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
3853 ; CM-NEXT: Fetch clause starting at 38:
3854 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
3855 ; CM-NEXT: Fetch clause starting at 40:
3856 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
3857 ; CM-NEXT: Fetch clause starting at 42:
3858 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
3859 ; CM-NEXT: Fetch clause starting at 44:
3860 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
3861 ; CM-NEXT: Fetch clause starting at 46:
3862 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
3863 ; CM-NEXT: Fetch clause starting at 48:
3864 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
3865 ; CM-NEXT: Fetch clause starting at 50:
3866 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
3867 ; CM-NEXT: Fetch clause starting at 52:
3868 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
3869 ; CM-NEXT: Fetch clause starting at 54:
3870 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
3871 ; CM-NEXT: Fetch clause starting at 56:
3872 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
3873 ; CM-NEXT: Fetch clause starting at 58:
3874 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
3875 ; CM-NEXT: Fetch clause starting at 60:
3876 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
3877 ; CM-NEXT: Fetch clause starting at 62:
3878 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
3879 ; CM-NEXT: Fetch clause starting at 64:
3880 ; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
3881 ; CM-NEXT: Fetch clause starting at 66:
3882 ; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
3883 ; CM-NEXT: ALU clause starting at 68:
3884 ; CM-NEXT: MOV * T0.Y, T3.X,
3885 ; CM-NEXT: MOV * T11.X, 0.0,
3886 ; CM-NEXT: ALU clause starting at 70:
3887 ; CM-NEXT: LSHL T0.Z, T12.X, literal.x,
3888 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
3889 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
3890 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
3891 ; CM-NEXT: MOV T3.X, PV.W,
3892 ; CM-NEXT: MOV * T0.Y, T5.X,
3893 ; CM-NEXT: ALU clause starting at 76:
3894 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3895 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3896 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3897 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3898 ; CM-NEXT: MOV T5.X, PV.W,
3899 ; CM-NEXT: MOV * T0.Y, T7.X,
3900 ; CM-NEXT: ALU clause starting at 82:
3901 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3902 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3903 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3904 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3905 ; CM-NEXT: MOV T7.X, PV.W,
3906 ; CM-NEXT: MOV * T0.Y, T9.X,
3907 ; CM-NEXT: ALU clause starting at 88:
3908 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3909 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3910 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3911 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3912 ; CM-NEXT: MOV T9.X, PV.W,
3913 ; CM-NEXT: MOV * T0.Y, T3.X,
3914 ; CM-NEXT: ALU clause starting at 94:
3915 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3916 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
3917 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3918 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3919 ; CM-NEXT: MOV T3.X, PV.W,
3920 ; CM-NEXT: MOV * T0.Y, T5.X,
3921 ; CM-NEXT: ALU clause starting at 100:
3922 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3923 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
3924 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3925 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3926 ; CM-NEXT: MOV T5.X, PV.W,
3927 ; CM-NEXT: MOV * T0.Y, T7.X,
3928 ; CM-NEXT: ALU clause starting at 106:
3929 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3930 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
3931 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3932 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3933 ; CM-NEXT: MOV T7.X, PV.W,
3934 ; CM-NEXT: MOV * T0.Y, T9.X,
3935 ; CM-NEXT: ALU clause starting at 112:
3936 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3937 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
3938 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3939 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3940 ; CM-NEXT: MOV T9.X, PV.W,
3941 ; CM-NEXT: MOV * T0.Y, T2.X,
3942 ; CM-NEXT: ALU clause starting at 118:
3943 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3944 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3945 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3946 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3947 ; CM-NEXT: MOV T2.X, PV.W,
3948 ; CM-NEXT: MOV * T0.Y, T4.X,
3949 ; CM-NEXT: ALU clause starting at 124:
3950 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3951 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3952 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3953 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3954 ; CM-NEXT: MOV T4.X, PV.W,
3955 ; CM-NEXT: MOV * T0.Y, T6.X,
3956 ; CM-NEXT: ALU clause starting at 130:
3957 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3958 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3959 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3960 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3961 ; CM-NEXT: MOV T6.X, PV.W,
3962 ; CM-NEXT: MOV * T0.Y, T8.X,
3963 ; CM-NEXT: ALU clause starting at 136:
3964 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3965 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
3966 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3967 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3968 ; CM-NEXT: MOV T8.X, PV.W,
3969 ; CM-NEXT: MOV * T0.Y, T2.X,
3970 ; CM-NEXT: ALU clause starting at 142:
3971 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3972 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
3973 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3974 ; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W,
3975 ; CM-NEXT: MOV T2.X, PV.Z,
3976 ; CM-NEXT: MOV * T0.Y, T4.X,
3977 ; CM-NEXT: ALU clause starting at 148:
3978 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3979 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
3980 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3981 ; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W,
3982 ; CM-NEXT: MOV T4.X, PV.X,
3983 ; CM-NEXT: MOV * T0.Y, T6.X,
3984 ; CM-NEXT: ALU clause starting at 154:
3985 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3986 ; CM-NEXT: AND_INT * T0.W, T13.X, literal.y,
3987 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3988 ; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W,
3989 ; CM-NEXT: MOV T6.X, PV.Z,
3990 ; CM-NEXT: MOV * T0.Y, T8.X,
3991 ; CM-NEXT: ALU clause starting at 160:
3992 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
3993 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3994 ; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
3995 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3996 ; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
3997 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
3998 ; CM-NEXT: AND_INT * T0.W, T11.X, literal.z,
3999 ; CM-NEXT: 2(2.802597e-45), -65536(nan)
4000 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
4001 ; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W,
4002 ; CM-NEXT: MOV T8.X, PV.X,
4003 ; CM-NEXT: MOV * T12.W, T3.X,
4004 ; CM-NEXT: MOV T12.Y, T5.X,
4005 ; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212
4006 ; CM-NEXT: MOV * T11.Y, T9.X,
4008 store <16 x i16> %in, ptr addrspace(1) %out
; v16i32_arg stores the <16 x i32> kernel argument %in to %out with align 4.
; The GCN check lines expect one s_load_dwordx16 of the argument followed by
; four dwordx4 stores. The r600 check lines expect a single ALU clause that
; moves the elements out of KC0 operands, then four MEM_RAT_CACHELESS stores,
; with no fetch clauses.
; Every check line below is autogenerated (see the note on the first line of
; this file); regenerate with utils/update_llc_test_checks.py, do not hand-edit.
4012 define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind {
4013 ; SI-LABEL: v16i32_arg:
4014 ; SI: ; %bb.0: ; %entry
4015 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
4016 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4017 ; SI-NEXT: s_mov_b32 s3, 0xf000
4018 ; SI-NEXT: s_mov_b32 s2, -1
4019 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4020 ; SI-NEXT: v_mov_b32_e32 v0, s20
4021 ; SI-NEXT: v_mov_b32_e32 v1, s21
4022 ; SI-NEXT: v_mov_b32_e32 v2, s22
4023 ; SI-NEXT: v_mov_b32_e32 v3, s23
4024 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4025 ; SI-NEXT: s_waitcnt expcnt(0)
4026 ; SI-NEXT: v_mov_b32_e32 v0, s16
4027 ; SI-NEXT: v_mov_b32_e32 v1, s17
4028 ; SI-NEXT: v_mov_b32_e32 v2, s18
4029 ; SI-NEXT: v_mov_b32_e32 v3, s19
4030 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4031 ; SI-NEXT: s_waitcnt expcnt(0)
4032 ; SI-NEXT: v_mov_b32_e32 v0, s12
4033 ; SI-NEXT: v_mov_b32_e32 v1, s13
4034 ; SI-NEXT: v_mov_b32_e32 v2, s14
4035 ; SI-NEXT: v_mov_b32_e32 v3, s15
4036 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4037 ; SI-NEXT: s_waitcnt expcnt(0)
4038 ; SI-NEXT: v_mov_b32_e32 v0, s8
4039 ; SI-NEXT: v_mov_b32_e32 v1, s9
4040 ; SI-NEXT: v_mov_b32_e32 v2, s10
4041 ; SI-NEXT: v_mov_b32_e32 v3, s11
4042 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4045 ; VI-LABEL: v16i32_arg:
4046 ; VI: ; %bb.0: ; %entry
4047 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
4048 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
4049 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4050 ; VI-NEXT: v_mov_b32_e32 v0, s20
4051 ; VI-NEXT: s_add_u32 s2, s0, 48
4052 ; VI-NEXT: s_addc_u32 s3, s1, 0
4053 ; VI-NEXT: v_mov_b32_e32 v5, s3
4054 ; VI-NEXT: v_mov_b32_e32 v4, s2
4055 ; VI-NEXT: s_add_u32 s2, s0, 32
4056 ; VI-NEXT: v_mov_b32_e32 v1, s21
4057 ; VI-NEXT: v_mov_b32_e32 v2, s22
4058 ; VI-NEXT: v_mov_b32_e32 v3, s23
4059 ; VI-NEXT: s_addc_u32 s3, s1, 0
4060 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4061 ; VI-NEXT: v_mov_b32_e32 v5, s3
4062 ; VI-NEXT: v_mov_b32_e32 v4, s2
4063 ; VI-NEXT: s_add_u32 s2, s0, 16
4064 ; VI-NEXT: v_mov_b32_e32 v0, s16
4065 ; VI-NEXT: v_mov_b32_e32 v1, s17
4066 ; VI-NEXT: v_mov_b32_e32 v2, s18
4067 ; VI-NEXT: v_mov_b32_e32 v3, s19
4068 ; VI-NEXT: s_addc_u32 s3, s1, 0
4069 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4070 ; VI-NEXT: v_mov_b32_e32 v5, s3
4071 ; VI-NEXT: v_mov_b32_e32 v0, s12
4072 ; VI-NEXT: v_mov_b32_e32 v1, s13
4073 ; VI-NEXT: v_mov_b32_e32 v2, s14
4074 ; VI-NEXT: v_mov_b32_e32 v3, s15
4075 ; VI-NEXT: v_mov_b32_e32 v4, s2
4076 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4077 ; VI-NEXT: v_mov_b32_e32 v5, s1
4078 ; VI-NEXT: v_mov_b32_e32 v0, s8
4079 ; VI-NEXT: v_mov_b32_e32 v1, s9
4080 ; VI-NEXT: v_mov_b32_e32 v2, s10
4081 ; VI-NEXT: v_mov_b32_e32 v3, s11
4082 ; VI-NEXT: v_mov_b32_e32 v4, s0
4083 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4086 ; GFX9-LABEL: v16i32_arg:
4087 ; GFX9: ; %bb.0: ; %entry
4088 ; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
4089 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
4090 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4091 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4092 ; GFX9-NEXT: v_mov_b32_e32 v0, s24
4093 ; GFX9-NEXT: v_mov_b32_e32 v1, s25
4094 ; GFX9-NEXT: v_mov_b32_e32 v2, s26
4095 ; GFX9-NEXT: v_mov_b32_e32 v3, s27
4096 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4097 ; GFX9-NEXT: s_nop 0
4098 ; GFX9-NEXT: v_mov_b32_e32 v0, s20
4099 ; GFX9-NEXT: v_mov_b32_e32 v1, s21
4100 ; GFX9-NEXT: v_mov_b32_e32 v2, s22
4101 ; GFX9-NEXT: v_mov_b32_e32 v3, s23
4102 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4103 ; GFX9-NEXT: s_nop 0
4104 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
4105 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
4106 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
4107 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
4108 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4109 ; GFX9-NEXT: s_nop 0
4110 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
4111 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
4112 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
4113 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
4114 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
4115 ; GFX9-NEXT: s_endpgm
4117 ; EG-LABEL: v16i32_arg:
4118 ; EG: ; %bb.0: ; %entry
4119 ; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[]
4120 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4121 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4122 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4123 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4125 ; EG-NEXT: ALU clause starting at 6:
4126 ; EG-NEXT: MOV * T0.W, KC0[7].X,
4127 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
4128 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
4129 ; EG-NEXT: MOV * T1.W, KC0[8].X,
4130 ; EG-NEXT: MOV T0.X, KC0[6].Y,
4131 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
4132 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
4133 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
4134 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4135 ; EG-NEXT: MOV * T3.W, KC0[9].X,
4136 ; EG-NEXT: MOV T1.X, KC0[7].Y,
4137 ; EG-NEXT: MOV * T3.Z, KC0[8].W,
4138 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4139 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4140 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
4141 ; EG-NEXT: MOV T3.Y, KC0[8].Z,
4142 ; EG-NEXT: MOV * T5.W, KC0[10].X,
4143 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4144 ; EG-NEXT: MOV T3.X, KC0[8].Y,
4145 ; EG-NEXT: MOV * T5.Z, KC0[9].W,
4146 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4147 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4148 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
4149 ; EG-NEXT: MOV T5.Y, KC0[9].Z,
4150 ; EG-NEXT: MOV * T5.X, KC0[9].Y,
4151 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4152 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4153 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4154 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4155 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4157 ; CM-LABEL: v16i32_arg:
4158 ; CM: ; %bb.0: ; %entry
4159 ; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[]
4160 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4161 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4162 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4163 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4165 ; CM-NEXT: ALU clause starting at 6:
4166 ; CM-NEXT: MOV * T0.W, KC0[10].X,
4167 ; CM-NEXT: MOV * T0.Z, KC0[9].W,
4168 ; CM-NEXT: MOV * T0.Y, KC0[9].Z,
4169 ; CM-NEXT: MOV T0.X, KC0[9].Y,
4170 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
4171 ; CM-NEXT: MOV * T2.W, KC0[9].X,
4172 ; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4173 ; CM-NEXT: MOV T2.Z, KC0[8].W,
4174 ; CM-NEXT: MOV * T1.W, KC0[8].X,
4175 ; CM-NEXT: LSHR T3.X, T1.Z, literal.x,
4176 ; CM-NEXT: MOV T2.Y, KC0[8].Z,
4177 ; CM-NEXT: MOV * T1.Z, KC0[7].W,
4178 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4179 ; CM-NEXT: MOV T2.X, KC0[8].Y,
4180 ; CM-NEXT: MOV * T1.Y, KC0[7].Z,
4181 ; CM-NEXT: MOV T1.X, KC0[7].Y,
4182 ; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x,
4183 ; CM-NEXT: MOV * T4.W, KC0[7].X,
4184 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4185 ; CM-NEXT: LSHR T5.X, PV.Z, literal.x,
4186 ; CM-NEXT: MOV T4.Z, KC0[6].W,
4187 ; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
4188 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
4189 ; CM-NEXT: LSHR T6.X, PV.W, literal.x,
4190 ; CM-NEXT: MOV * T4.Y, KC0[6].Z,
4191 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4192 ; CM-NEXT: MOV * T4.X, KC0[6].Y,
4193 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
4194 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4196 store <16 x i32> %in, ptr addrspace(1) %out, align 4
4200 define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind {
4201 ; SI-LABEL: v16f32_arg:
4202 ; SI: ; %bb.0: ; %entry
4203 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
4204 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4205 ; SI-NEXT: s_mov_b32 s3, 0xf000
4206 ; SI-NEXT: s_mov_b32 s2, -1
4207 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4208 ; SI-NEXT: v_mov_b32_e32 v0, s20
4209 ; SI-NEXT: v_mov_b32_e32 v1, s21
4210 ; SI-NEXT: v_mov_b32_e32 v2, s22
4211 ; SI-NEXT: v_mov_b32_e32 v3, s23
4212 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4213 ; SI-NEXT: s_waitcnt expcnt(0)
4214 ; SI-NEXT: v_mov_b32_e32 v0, s16
4215 ; SI-NEXT: v_mov_b32_e32 v1, s17
4216 ; SI-NEXT: v_mov_b32_e32 v2, s18
4217 ; SI-NEXT: v_mov_b32_e32 v3, s19
4218 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4219 ; SI-NEXT: s_waitcnt expcnt(0)
4220 ; SI-NEXT: v_mov_b32_e32 v0, s12
4221 ; SI-NEXT: v_mov_b32_e32 v1, s13
4222 ; SI-NEXT: v_mov_b32_e32 v2, s14
4223 ; SI-NEXT: v_mov_b32_e32 v3, s15
4224 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4225 ; SI-NEXT: s_waitcnt expcnt(0)
4226 ; SI-NEXT: v_mov_b32_e32 v0, s8
4227 ; SI-NEXT: v_mov_b32_e32 v1, s9
4228 ; SI-NEXT: v_mov_b32_e32 v2, s10
4229 ; SI-NEXT: v_mov_b32_e32 v3, s11
4230 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4233 ; VI-LABEL: v16f32_arg:
4234 ; VI: ; %bb.0: ; %entry
4235 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
4236 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
4237 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4238 ; VI-NEXT: v_mov_b32_e32 v0, s20
4239 ; VI-NEXT: s_add_u32 s2, s0, 48
4240 ; VI-NEXT: s_addc_u32 s3, s1, 0
4241 ; VI-NEXT: v_mov_b32_e32 v5, s3
4242 ; VI-NEXT: v_mov_b32_e32 v4, s2
4243 ; VI-NEXT: s_add_u32 s2, s0, 32
4244 ; VI-NEXT: v_mov_b32_e32 v1, s21
4245 ; VI-NEXT: v_mov_b32_e32 v2, s22
4246 ; VI-NEXT: v_mov_b32_e32 v3, s23
4247 ; VI-NEXT: s_addc_u32 s3, s1, 0
4248 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4249 ; VI-NEXT: v_mov_b32_e32 v5, s3
4250 ; VI-NEXT: v_mov_b32_e32 v4, s2
4251 ; VI-NEXT: s_add_u32 s2, s0, 16
4252 ; VI-NEXT: v_mov_b32_e32 v0, s16
4253 ; VI-NEXT: v_mov_b32_e32 v1, s17
4254 ; VI-NEXT: v_mov_b32_e32 v2, s18
4255 ; VI-NEXT: v_mov_b32_e32 v3, s19
4256 ; VI-NEXT: s_addc_u32 s3, s1, 0
4257 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4258 ; VI-NEXT: v_mov_b32_e32 v5, s3
4259 ; VI-NEXT: v_mov_b32_e32 v0, s12
4260 ; VI-NEXT: v_mov_b32_e32 v1, s13
4261 ; VI-NEXT: v_mov_b32_e32 v2, s14
4262 ; VI-NEXT: v_mov_b32_e32 v3, s15
4263 ; VI-NEXT: v_mov_b32_e32 v4, s2
4264 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4265 ; VI-NEXT: v_mov_b32_e32 v5, s1
4266 ; VI-NEXT: v_mov_b32_e32 v0, s8
4267 ; VI-NEXT: v_mov_b32_e32 v1, s9
4268 ; VI-NEXT: v_mov_b32_e32 v2, s10
4269 ; VI-NEXT: v_mov_b32_e32 v3, s11
4270 ; VI-NEXT: v_mov_b32_e32 v4, s0
4271 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4274 ; GFX9-LABEL: v16f32_arg:
4275 ; GFX9: ; %bb.0: ; %entry
4276 ; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
4277 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
4278 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4279 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4280 ; GFX9-NEXT: v_mov_b32_e32 v0, s24
4281 ; GFX9-NEXT: v_mov_b32_e32 v1, s25
4282 ; GFX9-NEXT: v_mov_b32_e32 v2, s26
4283 ; GFX9-NEXT: v_mov_b32_e32 v3, s27
4284 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4285 ; GFX9-NEXT: s_nop 0
4286 ; GFX9-NEXT: v_mov_b32_e32 v0, s20
4287 ; GFX9-NEXT: v_mov_b32_e32 v1, s21
4288 ; GFX9-NEXT: v_mov_b32_e32 v2, s22
4289 ; GFX9-NEXT: v_mov_b32_e32 v3, s23
4290 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4291 ; GFX9-NEXT: s_nop 0
4292 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
4293 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
4294 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
4295 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
4296 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4297 ; GFX9-NEXT: s_nop 0
4298 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
4299 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
4300 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
4301 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
4302 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
4303 ; GFX9-NEXT: s_endpgm
4305 ; EG-LABEL: v16f32_arg:
4306 ; EG: ; %bb.0: ; %entry
4307 ; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[]
4308 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4309 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4310 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4311 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4313 ; EG-NEXT: ALU clause starting at 6:
4314 ; EG-NEXT: MOV * T0.W, KC0[7].X,
4315 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
4316 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
4317 ; EG-NEXT: MOV * T1.W, KC0[8].X,
4318 ; EG-NEXT: MOV T0.X, KC0[6].Y,
4319 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
4320 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
4321 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
4322 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4323 ; EG-NEXT: MOV * T3.W, KC0[9].X,
4324 ; EG-NEXT: MOV T1.X, KC0[7].Y,
4325 ; EG-NEXT: MOV * T3.Z, KC0[8].W,
4326 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4327 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4328 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
4329 ; EG-NEXT: MOV T3.Y, KC0[8].Z,
4330 ; EG-NEXT: MOV * T5.W, KC0[10].X,
4331 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4332 ; EG-NEXT: MOV T3.X, KC0[8].Y,
4333 ; EG-NEXT: MOV * T5.Z, KC0[9].W,
4334 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4335 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4336 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
4337 ; EG-NEXT: MOV T5.Y, KC0[9].Z,
4338 ; EG-NEXT: MOV * T5.X, KC0[9].Y,
4339 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4340 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4341 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4342 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4343 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4345 ; CM-LABEL: v16f32_arg:
4346 ; CM: ; %bb.0: ; %entry
4347 ; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[]
4348 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4349 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4350 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4351 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4353 ; CM-NEXT: ALU clause starting at 6:
4354 ; CM-NEXT: MOV * T0.W, KC0[10].X,
4355 ; CM-NEXT: MOV * T0.Z, KC0[9].W,
4356 ; CM-NEXT: MOV * T0.Y, KC0[9].Z,
4357 ; CM-NEXT: MOV T0.X, KC0[9].Y,
4358 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
4359 ; CM-NEXT: MOV * T2.W, KC0[9].X,
4360 ; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4361 ; CM-NEXT: MOV T2.Z, KC0[8].W,
4362 ; CM-NEXT: MOV * T1.W, KC0[8].X,
4363 ; CM-NEXT: LSHR T3.X, T1.Z, literal.x,
4364 ; CM-NEXT: MOV T2.Y, KC0[8].Z,
4365 ; CM-NEXT: MOV * T1.Z, KC0[7].W,
4366 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4367 ; CM-NEXT: MOV T2.X, KC0[8].Y,
4368 ; CM-NEXT: MOV * T1.Y, KC0[7].Z,
4369 ; CM-NEXT: MOV T1.X, KC0[7].Y,
4370 ; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x,
4371 ; CM-NEXT: MOV * T4.W, KC0[7].X,
4372 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4373 ; CM-NEXT: LSHR T5.X, PV.Z, literal.x,
4374 ; CM-NEXT: MOV T4.Z, KC0[6].W,
4375 ; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
4376 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
4377 ; CM-NEXT: LSHR T6.X, PV.W, literal.x,
4378 ; CM-NEXT: MOV * T4.Y, KC0[6].Z,
4379 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4380 ; CM-NEXT: MOV * T4.X, KC0[6].Y,
4381 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
4382 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4384 store <16 x float> %in, ptr addrspace(1) %out, align 4
; Stores the i64 kernel argument %a to %out with 8-byte alignment.
; The amdgcn checks expect the pointer and the value to be fetched together by
; a single s_load_dwordx4. The r600 checks expect the value to be copied from
; KC0[2].W and KC0[3].X into T0.X/T0.Y and written with one store.
4388 define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind {
4389 ; SI-LABEL: kernel_arg_i64:
4391 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4392 ; SI-NEXT: s_mov_b32 s7, 0xf000
4393 ; SI-NEXT: s_mov_b32 s6, -1
4394 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4395 ; SI-NEXT: s_mov_b32 s4, s0
4396 ; SI-NEXT: s_mov_b32 s5, s1
4397 ; SI-NEXT: v_mov_b32_e32 v0, s2
4398 ; SI-NEXT: v_mov_b32_e32 v1, s3
4399 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4402 ; VI-LABEL: kernel_arg_i64:
4404 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4405 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4406 ; VI-NEXT: v_mov_b32_e32 v0, s0
4407 ; VI-NEXT: v_mov_b32_e32 v1, s1
4408 ; VI-NEXT: v_mov_b32_e32 v2, s2
4409 ; VI-NEXT: v_mov_b32_e32 v3, s3
4410 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4413 ; GFX9-LABEL: kernel_arg_i64:
4415 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4416 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4417 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4418 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4419 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4420 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
4421 ; GFX9-NEXT: s_endpgm
4423 ; EG-LABEL: kernel_arg_i64:
4425 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4426 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4429 ; EG-NEXT: ALU clause starting at 4:
4430 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
4431 ; EG-NEXT: MOV T0.X, KC0[2].W,
4432 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4433 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4435 ; CM-LABEL: kernel_arg_i64:
4437 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4438 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4441 ; CM-NEXT: ALU clause starting at 4:
4442 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
4443 ; CM-NEXT: MOV * T0.X, KC0[2].W,
4444 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4445 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4446 store i64 %a, ptr addrspace(1) %out, align 8
; Stores the double kernel argument %in to %out (the store carries no explicit
; alignment). The amdgcn checks expect one s_load_dwordx4 covering both the
; pointer and the value, followed by a single 64-bit store. The r600 checks
; expect the value to be copied from KC0[2].W and KC0[3].X into T0.X/T0.Y.
4450 define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
4451 ; SI-LABEL: f64_kernel_arg:
4452 ; SI: ; %bb.0: ; %entry
4453 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4454 ; SI-NEXT: s_mov_b32 s7, 0xf000
4455 ; SI-NEXT: s_mov_b32 s6, -1
4456 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4457 ; SI-NEXT: s_mov_b32 s4, s0
4458 ; SI-NEXT: s_mov_b32 s5, s1
4459 ; SI-NEXT: v_mov_b32_e32 v0, s2
4460 ; SI-NEXT: v_mov_b32_e32 v1, s3
4461 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4464 ; VI-LABEL: f64_kernel_arg:
4465 ; VI: ; %bb.0: ; %entry
4466 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4468 ; VI-NEXT: v_mov_b32_e32 v0, s0
4469 ; VI-NEXT: v_mov_b32_e32 v1, s1
4470 ; VI-NEXT: v_mov_b32_e32 v2, s2
4471 ; VI-NEXT: v_mov_b32_e32 v3, s3
4472 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4475 ; GFX9-LABEL: f64_kernel_arg:
4476 ; GFX9: ; %bb.0: ; %entry
4477 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4478 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4479 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4480 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4481 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4482 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
4483 ; GFX9-NEXT: s_endpgm
4485 ; EG-LABEL: f64_kernel_arg:
4486 ; EG: ; %bb.0: ; %entry
4487 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4488 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4491 ; EG-NEXT: ALU clause starting at 4:
4492 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
4493 ; EG-NEXT: MOV T0.X, KC0[2].W,
4494 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4495 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4497 ; CM-LABEL: f64_kernel_arg:
4498 ; CM: ; %bb.0: ; %entry
4499 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4500 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4503 ; CM-NEXT: ALU clause starting at 4:
4504 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
4505 ; CM-NEXT: MOV * T0.X, KC0[2].W,
4506 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4507 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4509 store double %in, ptr addrspace(1) %out
4513 ; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
4514 ; XGCN: s_load_dwordx2
4515 ; XGCN: s_load_dwordx2
4516 ; XGCN: buffer_store_dwordx2
4517 ; define amdgpu_kernel void @kernel_arg_v1i64(ptr addrspace(1) %out, <1 x i64> %a) nounwind {
4518 ; store <1 x i64> %a, ptr addrspace(1) %out, align 8
; Stores an i65 kernel argument to %out with 4-byte alignment.
; The amdgcn checks expect the low 64 bits to be written with one dwordx2
; store, and the remaining bit to be masked with 1 and written as a byte at
; offset 8. The r600 checks expect two dword stores (at %out and %out + 4)
; plus a MEM_RAT MSKOR write for the masked bit at %out + 8.
4522 define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
4523 ; SI-LABEL: i65_arg:
4524 ; SI: ; %bb.0: ; %entry
4525 ; SI-NEXT: s_load_dword s6, s[4:5], 0xd
4526 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4527 ; SI-NEXT: s_mov_b32 s7, 0xf000
4528 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4529 ; SI-NEXT: s_and_b32 s8, s6, 1
4530 ; SI-NEXT: s_mov_b32 s6, -1
4531 ; SI-NEXT: s_mov_b32 s4, s0
4532 ; SI-NEXT: s_mov_b32 s5, s1
4533 ; SI-NEXT: v_mov_b32_e32 v0, s2
4534 ; SI-NEXT: v_mov_b32_e32 v1, s3
4535 ; SI-NEXT: v_mov_b32_e32 v2, s8
4536 ; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:8
4537 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4540 ; VI-LABEL: i65_arg:
4541 ; VI: ; %bb.0: ; %entry
4542 ; VI-NEXT: s_load_dword s6, s[4:5], 0x34
4543 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4544 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4545 ; VI-NEXT: s_and_b32 s4, s6, 1
4546 ; VI-NEXT: v_mov_b32_e32 v0, s0
4547 ; VI-NEXT: v_mov_b32_e32 v1, s1
4548 ; VI-NEXT: s_add_u32 s0, s0, 8
4549 ; VI-NEXT: s_addc_u32 s1, s1, 0
4550 ; VI-NEXT: v_mov_b32_e32 v5, s1
4551 ; VI-NEXT: v_mov_b32_e32 v2, s2
4552 ; VI-NEXT: v_mov_b32_e32 v6, s4
4553 ; VI-NEXT: v_mov_b32_e32 v4, s0
4554 ; VI-NEXT: v_mov_b32_e32 v3, s3
4555 ; VI-NEXT: flat_store_byte v[4:5], v6
4556 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4559 ; GFX9-LABEL: i65_arg:
4560 ; GFX9: ; %bb.0: ; %entry
4561 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
4562 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
4563 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4564 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4565 ; GFX9-NEXT: s_and_b32 s4, s4, 1
4566 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4567 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
4568 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4569 ; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8
4570 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
4571 ; GFX9-NEXT: s_endpgm
4573 ; EG-LABEL: i65_arg:
4574 ; EG: ; %bb.0: ; %entry
4575 ; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[]
4576 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
4577 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
4578 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
4581 ; EG-NEXT: ALU clause starting at 6:
4582 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4583 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
4584 ; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
4585 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4586 ; EG-NEXT: LSHL T1.W, PV.W, literal.x,
4587 ; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1,
4588 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4589 ; EG-NEXT: LSHL T1.X, PS, PV.W,
4590 ; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
4591 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4592 ; EG-NEXT: MOV T1.Y, 0.0,
4593 ; EG-NEXT: MOV * T1.Z, 0.0,
4594 ; EG-NEXT: LSHR T0.X, T0.W, literal.x,
4595 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4596 ; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
4597 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
4598 ; EG-NEXT: MOV * T3.X, KC0[3].X,
4599 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4600 ; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
4601 ; EG-NEXT: MOV * T5.X, KC0[2].W,
4602 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4604 ; CM-LABEL: i65_arg:
4605 ; CM: ; %bb.0: ; %entry
4606 ; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[]
4607 ; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X
4608 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
4609 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
4612 ; CM-NEXT: ALU clause starting at 6:
4613 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4614 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
4615 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
4616 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4617 ; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
4618 ; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1,
4619 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4620 ; CM-NEXT: LSHL T1.X, PV.W, PV.Z,
4621 ; CM-NEXT: LSHL * T1.W, literal.x, PV.Z,
4622 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4623 ; CM-NEXT: MOV T1.Y, 0.0,
4624 ; CM-NEXT: MOV * T1.Z, 0.0,
4625 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
4626 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4627 ; CM-NEXT: MOV T2.X, KC0[2].W,
4628 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4629 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
4630 ; CM-NEXT: LSHR * T3.X, PV.W, literal.x,
4631 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4632 ; CM-NEXT: MOV * T4.X, KC0[3].X,
4633 ; CM-NEXT: LSHR * T5.X, T0.W, literal.x,
4634 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4636 store i65 %in, ptr addrspace(1) %out, align 4
; Stores the i1 kernel argument %x to %out with 1-byte alignment.
; The amdgcn checks expect the loaded dword to be masked with 1 and written
; with a byte store. The r600 checks expect a VTX_READ_8 of the argument,
; an AND with 1, and a MEM_RAT MSKOR write whose data and 255 mask are both
; shifted by (address & 3) * 8.
4640 define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
4643 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
4644 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4645 ; SI-NEXT: s_mov_b32 s3, 0xf000
4646 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4647 ; SI-NEXT: s_and_b32 s4, s2, 1
4648 ; SI-NEXT: s_mov_b32 s2, -1
4649 ; SI-NEXT: v_mov_b32_e32 v0, s4
4650 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
4655 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
4656 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
4657 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4658 ; VI-NEXT: s_and_b32 s2, s2, 1
4659 ; VI-NEXT: v_mov_b32_e32 v0, s0
4660 ; VI-NEXT: v_mov_b32_e32 v1, s1
4661 ; VI-NEXT: v_mov_b32_e32 v2, s2
4662 ; VI-NEXT: flat_store_byte v[0:1], v2
4665 ; GFX9-LABEL: i1_arg:
4667 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
4668 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
4669 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4670 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4671 ; GFX9-NEXT: s_and_b32 s2, s2, 1
4672 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4673 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
4674 ; GFX9-NEXT: s_endpgm
4678 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4680 ; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
4681 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
4684 ; EG-NEXT: Fetch clause starting at 6:
4685 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4686 ; EG-NEXT: ALU clause starting at 8:
4687 ; EG-NEXT: MOV * T0.X, 0.0,
4688 ; EG-NEXT: ALU clause starting at 9:
4689 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
4690 ; EG-NEXT: AND_INT * T1.W, T0.X, 1,
4691 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4692 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
4693 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4694 ; EG-NEXT: LSHL T0.X, T1.W, PV.W,
4695 ; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
4696 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4697 ; EG-NEXT: MOV T0.Y, 0.0,
4698 ; EG-NEXT: MOV * T0.Z, 0.0,
4699 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4700 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4704 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
4706 ; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
4707 ; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
4710 ; CM-NEXT: Fetch clause starting at 6:
4711 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4712 ; CM-NEXT: ALU clause starting at 8:
4713 ; CM-NEXT: MOV * T0.X, 0.0,
4714 ; CM-NEXT: ALU clause starting at 9:
4715 ; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
4716 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4717 ; CM-NEXT: AND_INT T0.Z, T0.X, 1,
4718 ; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
4719 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4720 ; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
4721 ; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
4722 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4723 ; CM-NEXT: MOV T0.Y, 0.0,
4724 ; CM-NEXT: MOV * T0.Z, 0.0,
4725 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4726 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4727 store i1 %x, ptr addrspace(1) %out, align 1
; Zero-extends the i1 kernel argument %x to i32 and stores it to %out.
; The amdgcn checks expect the loaded dword to be masked with 1 before a
; dword store. The r600 checks expect the VTX_READ_8 result to be stored
; as-is; the checked ALU clause contains no masking instruction.
4731 define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
4732 ; SI-LABEL: i1_arg_zext_i32:
4734 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
4735 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4736 ; SI-NEXT: s_mov_b32 s3, 0xf000
4737 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4738 ; SI-NEXT: s_and_b32 s4, s2, 1
4739 ; SI-NEXT: s_mov_b32 s2, -1
4740 ; SI-NEXT: v_mov_b32_e32 v0, s4
4741 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4744 ; VI-LABEL: i1_arg_zext_i32:
4746 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
4747 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
4748 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4749 ; VI-NEXT: s_and_b32 s2, s2, 1
4750 ; VI-NEXT: v_mov_b32_e32 v0, s0
4751 ; VI-NEXT: v_mov_b32_e32 v1, s1
4752 ; VI-NEXT: v_mov_b32_e32 v2, s2
4753 ; VI-NEXT: flat_store_dword v[0:1], v2
4756 ; GFX9-LABEL: i1_arg_zext_i32:
4758 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
4759 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
4760 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4761 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4762 ; GFX9-NEXT: s_and_b32 s2, s2, 1
4763 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4764 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4765 ; GFX9-NEXT: s_endpgm
4767 ; EG-LABEL: i1_arg_zext_i32:
4769 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4771 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
4772 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4775 ; EG-NEXT: Fetch clause starting at 6:
4776 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4777 ; EG-NEXT: ALU clause starting at 8:
4778 ; EG-NEXT: MOV * T0.X, 0.0,
4779 ; EG-NEXT: ALU clause starting at 9:
4780 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4781 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4783 ; CM-LABEL: i1_arg_zext_i32:
4785 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
4787 ; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
4788 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4791 ; CM-NEXT: Fetch clause starting at 6:
4792 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4793 ; CM-NEXT: ALU clause starting at 8:
4794 ; CM-NEXT: MOV * T0.X, 0.0,
4795 ; CM-NEXT: ALU clause starting at 9:
4796 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4797 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4798 %ext = zext i1 %x to i32
4799 store i32 %ext, ptr addrspace(1) %out, align 4
; Zero-extends the i1 kernel argument %x to i64 and stores it to %out.
; The amdgcn checks expect the low dword to be the loaded value masked with 1
; and the high dword to be a constant 0, written with one dwordx2 store.
; The r600 checks expect the VTX_READ_8 result in T0.X and 0.0 moved into T0.Y.
4803 define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
4804 ; SI-LABEL: i1_arg_zext_i64:
4806 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
4807 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4808 ; SI-NEXT: s_mov_b32 s3, 0xf000
4809 ; SI-NEXT: s_mov_b32 s2, -1
4810 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4811 ; SI-NEXT: s_and_b32 s4, s6, 1
4812 ; SI-NEXT: v_mov_b32_e32 v1, 0
4813 ; SI-NEXT: v_mov_b32_e32 v0, s4
4814 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4817 ; VI-LABEL: i1_arg_zext_i64:
4819 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
4820 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
4821 ; VI-NEXT: v_mov_b32_e32 v1, 0
4822 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4823 ; VI-NEXT: s_and_b32 s2, s2, 1
4824 ; VI-NEXT: v_mov_b32_e32 v3, s1
4825 ; VI-NEXT: v_mov_b32_e32 v0, s2
4826 ; VI-NEXT: v_mov_b32_e32 v2, s0
4827 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
4830 ; GFX9-LABEL: i1_arg_zext_i64:
4832 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
4833 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
4834 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4835 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4836 ; GFX9-NEXT: s_and_b32 s2, s2, 1
4837 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4838 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
4839 ; GFX9-NEXT: s_endpgm
4841 ; EG-LABEL: i1_arg_zext_i64:
4843 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4845 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
4846 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4849 ; EG-NEXT: Fetch clause starting at 6:
4850 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4851 ; EG-NEXT: ALU clause starting at 8:
4852 ; EG-NEXT: MOV * T0.X, 0.0,
4853 ; EG-NEXT: ALU clause starting at 9:
4854 ; EG-NEXT: MOV * T0.Y, 0.0,
4855 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4856 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4858 ; CM-LABEL: i1_arg_zext_i64:
4860 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
4862 ; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
4863 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4866 ; CM-NEXT: Fetch clause starting at 6:
4867 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4868 ; CM-NEXT: ALU clause starting at 8:
4869 ; CM-NEXT: MOV * T0.X, 0.0,
4870 ; CM-NEXT: ALU clause starting at 9:
4871 ; CM-NEXT: MOV * T0.Y, 0.0,
4872 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4873 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4874 %ext = zext i1 %x to i64
4875 store i64 %ext, ptr addrspace(1) %out, align 8
; Sign-extends the i1 kernel argument %x to i32 and stores it to %out.
; The amdgcn checks expect the extension to be a single s_bfe_i32 on the
; loaded dword. The r600 checks expect a BFE_INT on the VTX_READ_8 result.
4879 define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
4880 ; SI-LABEL: i1_arg_sext_i32:
4882 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
4883 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4884 ; SI-NEXT: s_mov_b32 s3, 0xf000
4885 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4886 ; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
4887 ; SI-NEXT: s_mov_b32 s2, -1
4888 ; SI-NEXT: v_mov_b32_e32 v0, s4
4889 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4892 ; VI-LABEL: i1_arg_sext_i32:
4894 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
4895 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
4896 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4897 ; VI-NEXT: s_bfe_i32 s2, s2, 0x10000
4898 ; VI-NEXT: v_mov_b32_e32 v0, s0
4899 ; VI-NEXT: v_mov_b32_e32 v1, s1
4900 ; VI-NEXT: v_mov_b32_e32 v2, s2
4901 ; VI-NEXT: flat_store_dword v[0:1], v2
4904 ; GFX9-LABEL: i1_arg_sext_i32:
4906 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
4907 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
4908 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4909 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4910 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000
4911 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4912 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4913 ; GFX9-NEXT: s_endpgm
4915 ; EG-LABEL: i1_arg_sext_i32:
4917 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4919 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
4920 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4923 ; EG-NEXT: Fetch clause starting at 6:
4924 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4925 ; EG-NEXT: ALU clause starting at 8:
4926 ; EG-NEXT: MOV * T0.X, 0.0,
4927 ; EG-NEXT: ALU clause starting at 9:
4928 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
4929 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4930 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4932 ; CM-LABEL: i1_arg_sext_i32:
4934 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
4936 ; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
4937 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4940 ; CM-NEXT: Fetch clause starting at 6:
4941 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4942 ; CM-NEXT: ALU clause starting at 8:
4943 ; CM-NEXT: MOV * T0.X, 0.0,
4944 ; CM-NEXT: ALU clause starting at 9:
4945 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1,
4946 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4947 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4948 %ext = sext i1 %x to i32
4949 store i32 %ext, ptr addrspace(1) %out, align 4
; Sign-extends the i1 kernel argument %x to i64 and stores it to %out.
; The amdgcn checks expect a single s_bfe_i64 producing the 64-bit register
; pair that is stored. The r600 checks expect a BFE_INT for the low dword
; whose result is then copied into T0.Y (MOV T0.Y, PV.X) for the high dword.
4953 define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
4954 ; SI-LABEL: i1_arg_sext_i64:
4956 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
4957 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4958 ; SI-NEXT: s_mov_b32 s3, 0xf000
4959 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4960 ; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
4961 ; SI-NEXT: s_mov_b32 s2, -1
4962 ; SI-NEXT: v_mov_b32_e32 v0, s4
4963 ; SI-NEXT: v_mov_b32_e32 v1, s5
4964 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4967 ; VI-LABEL: i1_arg_sext_i64:
4969 ; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
4970 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
4971 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4972 ; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
4973 ; VI-NEXT: v_mov_b32_e32 v0, s2
4974 ; VI-NEXT: v_mov_b32_e32 v3, s1
4975 ; VI-NEXT: v_mov_b32_e32 v1, s3
4976 ; VI-NEXT: v_mov_b32_e32 v2, s0
4977 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4980 ; GFX9-LABEL: i1_arg_sext_i64:
4982 ; GFX9-NEXT: s_load_dword s0, s[8:9], 0x8
4983 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
4984 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4985 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4986 ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
4987 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
4988 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
4989 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
4990 ; GFX9-NEXT: s_endpgm
4992 ; EG-LABEL: i1_arg_sext_i64:
4994 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4996 ; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
4997 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5000 ; EG-NEXT: Fetch clause starting at 6:
5001 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5002 ; EG-NEXT: ALU clause starting at 8:
5003 ; EG-NEXT: MOV * T0.X, 0.0,
5004 ; EG-NEXT: ALU clause starting at 9:
5005 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
5006 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
5007 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5008 ; EG-NEXT: MOV * T0.Y, PV.X,
5010 ; CM-LABEL: i1_arg_sext_i64:
5012 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
5014 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
5015 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5018 ; CM-NEXT: Fetch clause starting at 6:
5019 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5020 ; CM-NEXT: ALU clause starting at 8:
5021 ; CM-NEXT: MOV * T0.X, 0.0,
5022 ; CM-NEXT: ALU clause starting at 9:
5023 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1,
5024 ; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
5025 ; CM-NEXT: MOV * T0.Y, PV.X,
5026 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5027 %ext = sext i1 %x to i64
5028 store i64 %ext, ptr addrspace(1) %out, align 8
; Kernel whose only parameter is an empty struct ({}).
; The checks shown here pin the function label for each target, plus the
; s_endpgm on GFX9; no argument loads are expected in them.
5032 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
5033 ; SI-LABEL: empty_struct_arg:
5037 ; VI-LABEL: empty_struct_arg:
5041 ; GFX9-LABEL: empty_struct_arg:
5043 ; GFX9-NEXT: s_endpgm
5045 ; EGCM-LABEL: empty_struct_arg:
5052 ; The correct load offsets for these:
5058 ; With the SelectionDAG argument lowering, the alignment of the
5059 ; struct members is not properly considered, making these wrong.
5061 ; FIXME: Total argument size is computed wrong
; Takes two {i32, i64} struct arguments separated by an unnamed i8, extracts
; both members of each struct, and writes each one with a volatile store to a
; null addrspace(1) pointer (i32, i64, i32, i64 in that order).
; The amdgcn checks pin the four argument loads at offsets 0x9/0xb/0xf/0x11
; (SI), 0x24/0x2c/0x3c/0x44 (VI) and 0x0/0x8/0x18/0x20 (GFX9), each followed
; by its own store and wait.
5062 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
5063 ; SI-LABEL: struct_argument_alignment:
5065 ; SI-NEXT: s_load_dword s8, s[4:5], 0x9
5066 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xb
5067 ; SI-NEXT: s_load_dword s9, s[4:5], 0xf
5068 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11
5069 ; SI-NEXT: s_mov_b32 s0, 0
5070 ; SI-NEXT: s_mov_b32 s3, 0xf000
5071 ; SI-NEXT: s_mov_b32 s2, -1
5072 ; SI-NEXT: s_mov_b32 s1, s0
5073 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5074 ; SI-NEXT: v_mov_b32_e32 v0, s8
5075 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5076 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5077 ; SI-NEXT: v_mov_b32_e32 v0, s6
5078 ; SI-NEXT: v_mov_b32_e32 v1, s7
5079 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5080 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5081 ; SI-NEXT: v_mov_b32_e32 v0, s9
5082 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5083 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5084 ; SI-NEXT: v_mov_b32_e32 v0, s4
5085 ; SI-NEXT: v_mov_b32_e32 v1, s5
5086 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5087 ; SI-NEXT: s_waitcnt vmcnt(0)
5090 ; VI-LABEL: struct_argument_alignment:
5092 ; VI-NEXT: s_load_dword s6, s[4:5], 0x24
5093 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
5094 ; VI-NEXT: s_load_dword s7, s[4:5], 0x3c
5095 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x44
5096 ; VI-NEXT: v_mov_b32_e32 v0, 0
5097 ; VI-NEXT: v_mov_b32_e32 v1, 0
5098 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5099 ; VI-NEXT: v_mov_b32_e32 v2, s6
5100 ; VI-NEXT: flat_store_dword v[0:1], v2
5101 ; VI-NEXT: s_waitcnt vmcnt(0)
5102 ; VI-NEXT: v_mov_b32_e32 v3, s1
5103 ; VI-NEXT: v_mov_b32_e32 v2, s0
5104 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
5105 ; VI-NEXT: s_waitcnt vmcnt(0)
5106 ; VI-NEXT: v_mov_b32_e32 v2, s7
5107 ; VI-NEXT: flat_store_dword v[0:1], v2
5108 ; VI-NEXT: s_waitcnt vmcnt(0)
5109 ; VI-NEXT: v_mov_b32_e32 v2, s2
5110 ; VI-NEXT: v_mov_b32_e32 v3, s3
5111 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
5112 ; VI-NEXT: s_waitcnt vmcnt(0)
5115 ; GFX9-LABEL: struct_argument_alignment:
5117 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
5118 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
5119 ; GFX9-NEXT: s_load_dword s5, s[8:9], 0x18
5120 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x20
5121 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5122 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5124 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
5125 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
5126 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5127 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5128 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
5129 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
5130 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5131 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
5132 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
5133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5134 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5135 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
5136 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
5137 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5138 ; GFX9-NEXT: s_endpgm
5140 ; EG-LABEL: struct_argument_alignment:
5142 ; EG-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
5143 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
5144 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
5145 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
5146 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
5147 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
5148 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
5150 ; EG-NEXT: ALU clause starting at 8:
5151 ; EG-NEXT: MOV T0.X, KC0[4].Y,
5152 ; EG-NEXT: MOV * T1.X, KC0[4].Z,
5153 ; EG-NEXT: MOV T2.X, KC0[3].W,
5154 ; EG-NEXT: MOV * T3.X, KC0[2].W,
5155 ; EG-NEXT: MOV T4.X, literal.x,
5156 ; EG-NEXT: MOV * T5.X, KC0[3].X,
5157 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5158 ; EG-NEXT: MOV T6.X, literal.x,
5159 ; EG-NEXT: MOV * T7.X, KC0[2].Y,
5160 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5162 ; CM-LABEL: struct_argument_alignment:
5164 ; CM-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
5165 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
5166 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
5167 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
5168 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
5169 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5170 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
5172 ; CM-NEXT: ALU clause starting at 8:
5173 ; CM-NEXT: MOV * T0.X, KC0[4].Y,
5174 ; CM-NEXT: MOV * T1.X, KC0[4].Z,
5175 ; CM-NEXT: MOV * T2.X, KC0[3].W,
5176 ; CM-NEXT: MOV * T3.X, KC0[2].W,
5177 ; CM-NEXT: MOV * T4.X, literal.x,
5178 ; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5179 ; CM-NEXT: MOV * T5.X, KC0[3].X,
5180 ; CM-NEXT: MOV * T6.X, literal.x,
5181 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5182 ; CM-NEXT: MOV * T7.X, KC0[2].Y,
5183 %val0 = extractvalue {i32, i64} %arg0, 0
5184 %val1 = extractvalue {i32, i64} %arg0, 1
5185 %val2 = extractvalue {i32, i64} %arg1, 0
5186 %val3 = extractvalue {i32, i64} %arg1, 1
5187 store volatile i32 %val0, ptr addrspace(1) null
5188 store volatile i64 %val1, ptr addrspace(1) null
5189 store volatile i32 %val2, ptr addrspace(1) null
5190 store volatile i64 %val3, ptr addrspace(1) null
5194 ; No padding between i8 and next struct, but round up at end to 4 byte
5196 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; Kernel taking two packed <{i32, i64}> struct arguments separated by an
; unnamed i8. Every field of both structs is extracted and written with a
; volatile store to the null addrspace(1) pointer. The assertions that follow
; are autogenerated, one group per RUN line of this file.
5197 ; SI-LABEL: packed_struct_argument_alignment:
5199 ; SI-NEXT: s_mov_b32 s7, 0xf000
5200 ; SI-NEXT: s_mov_b32 s6, -1
5201 ; SI-NEXT: s_load_dword s2, s[4:5], 0x9
5202 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
5203 ; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
5204 ; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
5205 ; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
5206 ; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
5207 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
5208 ; SI-NEXT: s_mov_b32 s4, 0
5209 ; SI-NEXT: s_mov_b32 s5, s4
5210 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5211 ; SI-NEXT: v_mov_b32_e32 v2, s2
5212 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
5213 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5214 ; SI-NEXT: v_mov_b32_e32 v3, s1
5215 ; SI-NEXT: v_mov_b32_e32 v2, s0
5216 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
5217 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5218 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
5219 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
5220 ; SI-NEXT: v_or_b32_e32 v2, v2, v4
5221 ; SI-NEXT: v_or_b32_e32 v3, v3, v6
5222 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
5223 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
5224 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
5225 ; SI-NEXT: s_waitcnt vmcnt(0)
5226 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5227 ; SI-NEXT: s_waitcnt vmcnt(0)
5230 ; VI-LABEL: packed_struct_argument_alignment:
5232 ; VI-NEXT: s_add_u32 s0, s4, 49
5233 ; VI-NEXT: s_addc_u32 s1, s5, 0
5234 ; VI-NEXT: s_add_u32 s2, s4, 50
5235 ; VI-NEXT: s_addc_u32 s3, s5, 0
5236 ; VI-NEXT: v_mov_b32_e32 v3, s1
5237 ; VI-NEXT: v_mov_b32_e32 v2, s0
5238 ; VI-NEXT: s_add_u32 s0, s0, 3
5239 ; VI-NEXT: s_addc_u32 s1, s1, 0
5240 ; VI-NEXT: v_mov_b32_e32 v5, s1
5241 ; VI-NEXT: v_mov_b32_e32 v4, s0
5242 ; VI-NEXT: s_add_u32 s0, s4, 51
5243 ; VI-NEXT: s_addc_u32 s1, s5, 0
5244 ; VI-NEXT: v_mov_b32_e32 v0, s2
5245 ; VI-NEXT: v_mov_b32_e32 v7, s1
5246 ; VI-NEXT: v_mov_b32_e32 v1, s3
5247 ; VI-NEXT: v_mov_b32_e32 v6, s0
5248 ; VI-NEXT: flat_load_ubyte v8, v[0:1]
5249 ; VI-NEXT: flat_load_ubyte v9, v[2:3]
5250 ; VI-NEXT: flat_load_ubyte v10, v[4:5]
5251 ; VI-NEXT: flat_load_ubyte v6, v[6:7]
5252 ; VI-NEXT: s_add_u32 s0, s4, 53
5253 ; VI-NEXT: s_addc_u32 s1, s5, 0
5254 ; VI-NEXT: v_mov_b32_e32 v0, s0
5255 ; VI-NEXT: v_mov_b32_e32 v1, s1
5256 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5257 ; VI-NEXT: s_load_dword s2, s[4:5], 0x24
5258 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
5259 ; VI-NEXT: v_mov_b32_e32 v2, 0
5260 ; VI-NEXT: v_mov_b32_e32 v3, 0
5261 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5262 ; VI-NEXT: v_mov_b32_e32 v7, s2
5263 ; VI-NEXT: v_mov_b32_e32 v5, s1
5264 ; VI-NEXT: v_mov_b32_e32 v4, s0
5265 ; VI-NEXT: flat_store_dword v[2:3], v7
5266 ; VI-NEXT: s_waitcnt vmcnt(0)
5267 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
5268 ; VI-NEXT: s_waitcnt vmcnt(0)
5269 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
5270 ; VI-NEXT: v_or_b32_e32 v4, v4, v9
5271 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
5272 ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5273 ; VI-NEXT: v_or_b32_e32 v4, v5, v4
5274 ; VI-NEXT: flat_store_dword v[2:3], v4
5275 ; VI-NEXT: s_waitcnt vmcnt(0)
5276 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5277 ; VI-NEXT: s_waitcnt vmcnt(0)
; In the gfx900 output the first struct is read with scalar loads at 0x0 and
; 0x4, while the second struct's i32 and i64 are read with global loads at
; byte offsets 13 and 17 from the same base register pair (s[8:9]).
5280 ; GFX9-LABEL: packed_struct_argument_alignment:
5282 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5283 ; GFX9-NEXT: global_load_dword v6, v2, s[8:9] offset:13
5284 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[8:9] offset:17
5285 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
5286 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x4
5287 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5288 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
5289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5290 ; GFX9-NEXT: v_mov_b32_e32 v7, s2
5291 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5292 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
5293 ; GFX9-NEXT: global_store_dword v[2:3], v7, off
5294 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5295 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
5296 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5297 ; GFX9-NEXT: global_store_dword v[2:3], v6, off
5298 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5299 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
5300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5301 ; GFX9-NEXT: s_endpgm
5303 ; EG-LABEL: packed_struct_argument_alignment:
5305 ; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
5306 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
5307 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5308 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5309 ; EG-NEXT: ALU 2, @25, KC0[], KC1[]
5310 ; EG-NEXT: TEX 0 @12
5311 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5312 ; EG-NEXT: TEX 0 @14
5313 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5314 ; EG-NEXT: TEX 0 @16
5315 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
5317 ; EG-NEXT: Fetch clause starting at 12:
5318 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
5319 ; EG-NEXT: Fetch clause starting at 14:
5320 ; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
5321 ; EG-NEXT: Fetch clause starting at 16:
5322 ; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
5323 ; EG-NEXT: ALU clause starting at 18:
5324 ; EG-NEXT: MOV T0.X, KC0[2].Z,
5325 ; EG-NEXT: MOV * T1.X, literal.x,
5326 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5327 ; EG-NEXT: MOV T2.X, KC0[2].W,
5328 ; EG-NEXT: MOV * T3.X, literal.x,
5329 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5330 ; EG-NEXT: MOV * T4.X, KC0[2].Y,
5331 ; EG-NEXT: ALU clause starting at 25:
5332 ; EG-NEXT: MOV T0.X, 0.0,
5333 ; EG-NEXT: MOV * T2.X, 0.0,
5334 ; EG-NEXT: MOV * T4.X, 0.0,
5336 ; CM-LABEL: packed_struct_argument_alignment:
5338 ; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
5339 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5340 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5341 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5342 ; CM-NEXT: ALU 2, @25, KC0[], KC1[]
5343 ; CM-NEXT: TEX 0 @12
5344 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5345 ; CM-NEXT: TEX 0 @14
5346 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5347 ; CM-NEXT: TEX 0 @16
5348 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5350 ; CM-NEXT: Fetch clause starting at 12:
5351 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
5352 ; CM-NEXT: Fetch clause starting at 14:
5353 ; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
5354 ; CM-NEXT: Fetch clause starting at 16:
5355 ; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
5356 ; CM-NEXT: ALU clause starting at 18:
5357 ; CM-NEXT: MOV * T0.X, KC0[2].Z,
5358 ; CM-NEXT: MOV * T1.X, literal.x,
5359 ; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5360 ; CM-NEXT: MOV * T2.X, KC0[2].W,
5361 ; CM-NEXT: MOV * T3.X, literal.x,
5362 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5363 ; CM-NEXT: MOV * T4.X, KC0[2].Y,
5364 ; CM-NEXT: ALU clause starting at 25:
5365 ; CM-NEXT: MOV * T0.X, 0.0,
5366 ; CM-NEXT: MOV * T2.X, 0.0,
5367 ; CM-NEXT: MOV * T4.X, 0.0,
; IR under test: split each packed struct into its i32 and i64 fields, then
; store the four values in order with volatile stores to the null pointer.
5368 %val0 = extractvalue <{i32, i64}> %arg0, 0
5369 %val1 = extractvalue <{i32, i64}> %arg0, 1
5370 %val2 = extractvalue <{i32, i64}> %arg1, 0
5371 %val3 = extractvalue <{i32, i64}> %arg1, 1
5372 store volatile i32 %val0, ptr addrspace(1) null
5373 store volatile i64 %val1, ptr addrspace(1) null
5374 store volatile i32 %val2, ptr addrspace(1) null
5375 store volatile i64 %val3, ptr addrspace(1) null
; Kernel with two non-packed {i32, i64} struct arguments, each followed by an
; unnamed i8, and a trailing <4 x i32> argument. All struct fields and the
; vector are written out with volatile stores to the null addrspace(1) pointer.
; The assertions below are autogenerated, one group per RUN line of this file.
5379 define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
5380 ; SI-LABEL: struct_argument_alignment_after:
5382 ; SI-NEXT: s_load_dword s12, s[4:5], 0x9
5383 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb
5384 ; SI-NEXT: s_load_dword s13, s[4:5], 0xf
5385 ; SI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x11
5386 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15
5387 ; SI-NEXT: s_mov_b32 s4, 0
5388 ; SI-NEXT: s_mov_b32 s7, 0xf000
5389 ; SI-NEXT: s_mov_b32 s6, -1
5390 ; SI-NEXT: s_mov_b32 s5, s4
5391 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5392 ; SI-NEXT: v_mov_b32_e32 v0, s12
5393 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5394 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5395 ; SI-NEXT: v_mov_b32_e32 v0, s8
5396 ; SI-NEXT: v_mov_b32_e32 v1, s9
5397 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5398 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5399 ; SI-NEXT: v_mov_b32_e32 v0, s13
5400 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5401 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5402 ; SI-NEXT: v_mov_b32_e32 v0, s10
5403 ; SI-NEXT: v_mov_b32_e32 v1, s11
5404 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5405 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5406 ; SI-NEXT: v_mov_b32_e32 v0, s0
5407 ; SI-NEXT: v_mov_b32_e32 v1, s1
5408 ; SI-NEXT: v_mov_b32_e32 v2, s2
5409 ; SI-NEXT: v_mov_b32_e32 v3, s3
5410 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5411 ; SI-NEXT: s_waitcnt vmcnt(0)
5414 ; VI-LABEL: struct_argument_alignment_after:
5416 ; VI-NEXT: s_load_dword s10, s[4:5], 0x24
5417 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c
5418 ; VI-NEXT: s_load_dword s11, s[4:5], 0x3c
5419 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
5420 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
5421 ; VI-NEXT: v_mov_b32_e32 v4, 0
5422 ; VI-NEXT: v_mov_b32_e32 v5, 0
5423 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5424 ; VI-NEXT: v_mov_b32_e32 v0, s10
5425 ; VI-NEXT: flat_store_dword v[4:5], v0
5426 ; VI-NEXT: s_waitcnt vmcnt(0)
5427 ; VI-NEXT: v_mov_b32_e32 v0, s6
5428 ; VI-NEXT: v_mov_b32_e32 v1, s7
5429 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
5430 ; VI-NEXT: s_waitcnt vmcnt(0)
5431 ; VI-NEXT: v_mov_b32_e32 v0, s11
5432 ; VI-NEXT: flat_store_dword v[4:5], v0
5433 ; VI-NEXT: s_waitcnt vmcnt(0)
5434 ; VI-NEXT: v_mov_b32_e32 v0, s8
5435 ; VI-NEXT: v_mov_b32_e32 v1, s9
5436 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
5437 ; VI-NEXT: s_waitcnt vmcnt(0)
5438 ; VI-NEXT: v_mov_b32_e32 v0, s0
5439 ; VI-NEXT: v_mov_b32_e32 v1, s1
5440 ; VI-NEXT: v_mov_b32_e32 v2, s2
5441 ; VI-NEXT: v_mov_b32_e32 v3, s3
5442 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
5443 ; VI-NEXT: s_waitcnt vmcnt(0)
; In the gfx900 output every argument is fetched with scalar loads from
; s[8:9]: 0x0 and 0x8 feed the first two stores, 0x18 and 0x20 the next two,
; and the dwordx4 at 0x30 feeds the final vector store.
5446 ; GFX9-LABEL: struct_argument_alignment_after:
5448 ; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
5449 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
5450 ; GFX9-NEXT: s_load_dword s11, s[8:9], 0x18
5451 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x20
5452 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30
5453 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5454 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
5455 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5456 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
5457 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
5458 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5459 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5460 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5461 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
5462 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5463 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
5464 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
5465 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5466 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
5467 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
5468 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
5469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5470 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5471 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5472 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5473 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
5474 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
5475 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5476 ; GFX9-NEXT: s_endpgm
5478 ; EG-LABEL: struct_argument_alignment_after:
5480 ; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
5481 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
5482 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
5483 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
5484 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
5485 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
5486 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
5487 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
5490 ; EG-NEXT: ALU clause starting at 10:
5491 ; EG-NEXT: MOV * T0.W, KC0[6].X,
5492 ; EG-NEXT: MOV * T0.Z, KC0[5].W,
5493 ; EG-NEXT: MOV * T0.Y, KC0[5].Z,
5494 ; EG-NEXT: MOV T0.X, KC0[5].Y,
5495 ; EG-NEXT: MOV * T1.X, KC0[4].Y,
5496 ; EG-NEXT: MOV T2.X, KC0[4].Z,
5497 ; EG-NEXT: MOV * T3.X, KC0[3].W,
5498 ; EG-NEXT: MOV T4.X, KC0[2].W,
5499 ; EG-NEXT: MOV * T5.X, literal.x,
5500 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5501 ; EG-NEXT: MOV T6.X, KC0[3].X,
5502 ; EG-NEXT: MOV * T7.X, literal.x,
5503 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5504 ; EG-NEXT: MOV * T8.X, KC0[2].Y,
5506 ; CM-LABEL: struct_argument_alignment_after:
5508 ; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
5509 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
5510 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
5511 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
5512 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
5513 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
5514 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
5515 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
5518 ; CM-NEXT: ALU clause starting at 10:
5519 ; CM-NEXT: MOV * T0.W, KC0[6].X,
5520 ; CM-NEXT: MOV * T0.Z, KC0[5].W,
5521 ; CM-NEXT: MOV * T0.Y, KC0[5].Z,
5522 ; CM-NEXT: MOV * T0.X, KC0[5].Y,
5523 ; CM-NEXT: MOV * T1.X, KC0[4].Y,
5524 ; CM-NEXT: MOV * T2.X, KC0[4].Z,
5525 ; CM-NEXT: MOV * T3.X, KC0[3].W,
5526 ; CM-NEXT: MOV * T4.X, KC0[2].W,
5527 ; CM-NEXT: MOV * T5.X, literal.x,
5528 ; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5529 ; CM-NEXT: MOV * T6.X, KC0[3].X,
5530 ; CM-NEXT: MOV * T7.X, literal.x,
5531 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5532 ; CM-NEXT: MOV * T8.X, KC0[2].Y,
; IR under test: extract the i32 and i64 field of %arg0 and %arg2, then store
; those four values followed by %arg4, all as volatile stores to null.
5533 %val0 = extractvalue {i32, i64} %arg0, 0
5534 %val1 = extractvalue {i32, i64} %arg0, 1
5535 %val2 = extractvalue {i32, i64} %arg2, 0
5536 %val3 = extractvalue {i32, i64} %arg2, 1
5537 store volatile i32 %val0, ptr addrspace(1) null
5538 store volatile i64 %val1, ptr addrspace(1) null
5539 store volatile i32 %val2, ptr addrspace(1) null
5540 store volatile i64 %val3, ptr addrspace(1) null
5541 store volatile <4 x i32> %arg4, ptr addrspace(1) null
; Kernel taking an i16 followed by a [3 x i32] array argument. Both arguments
; are written with volatile stores to an undef addrspace(1) pointer.
; The assertions below are autogenerated, one group per RUN line of this file.
5545 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
5546 ; SI-LABEL: array_3xi32:
5548 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5549 ; SI-NEXT: s_mov_b32 s7, 0xf000
5550 ; SI-NEXT: s_mov_b32 s6, -1
5551 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5552 ; SI-NEXT: v_mov_b32_e32 v0, s0
5553 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
5554 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5555 ; SI-NEXT: v_mov_b32_e32 v0, s3
5556 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5557 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5558 ; SI-NEXT: v_mov_b32_e32 v0, s2
5559 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5560 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5561 ; SI-NEXT: v_mov_b32_e32 v0, s1
5562 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5563 ; SI-NEXT: s_waitcnt vmcnt(0)
5566 ; VI-LABEL: array_3xi32:
5568 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5569 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5570 ; VI-NEXT: v_mov_b32_e32 v0, s0
5571 ; VI-NEXT: v_mov_b32_e32 v1, s3
5572 ; VI-NEXT: v_mov_b32_e32 v2, s2
5573 ; VI-NEXT: flat_store_short v[0:1], v0
5574 ; VI-NEXT: s_waitcnt vmcnt(0)
5575 ; VI-NEXT: flat_store_dword v[0:1], v1
5576 ; VI-NEXT: s_waitcnt vmcnt(0)
5577 ; VI-NEXT: flat_store_dword v[0:1], v2
5578 ; VI-NEXT: s_waitcnt vmcnt(0)
5579 ; VI-NEXT: v_mov_b32_e32 v0, s1
5580 ; VI-NEXT: flat_store_dword v[0:1], v0
5581 ; VI-NEXT: s_waitcnt vmcnt(0)
; In the gfx900 output a single s_load_dwordx4 at 0x0 fetches everything:
; s0 feeds the 16-bit store and s3, s2, s1 feed the three dword stores.
5584 ; GFX9-LABEL: array_3xi32:
5586 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
5587 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5588 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5589 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5590 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5591 ; GFX9-NEXT: global_store_short v[0:1], v0, off
5592 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5593 ; GFX9-NEXT: global_store_dword v[0:1], v1, off
5594 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5595 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
5596 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5597 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
5598 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
5599 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5600 ; GFX9-NEXT: s_endpgm
5602 ; EG-LABEL: array_3xi32:
5604 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
5606 ; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
5607 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X
5608 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
5609 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
5610 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
5612 ; EG-NEXT: Fetch clause starting at 8:
5613 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
5614 ; EG-NEXT: ALU clause starting at 10:
5615 ; EG-NEXT: MOV * T0.X, 0.0,
5616 ; EG-NEXT: ALU clause starting at 11:
5617 ; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
5618 ; EG-NEXT: MOV * T0.W, literal.x,
5619 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5620 ; EG-NEXT: MOV T0.Y, 0.0,
5621 ; EG-NEXT: MOV * T0.Z, 0.0,
5622 ; EG-NEXT: MOV T1.X, KC0[2].Z,
5623 ; EG-NEXT: MOV * T2.X, KC0[2].W,
5624 ; EG-NEXT: MOV T3.X, KC0[3].X,
5625 ; EG-NEXT: MOV * T4.X, literal.x,
5626 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5628 ; CM-LABEL: array_3xi32:
5630 ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
5632 ; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
5633 ; CM-NEXT: MEM_RAT MSKOR T0.XW, T4.X
5634 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
5635 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
5636 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5638 ; CM-NEXT: Fetch clause starting at 8:
5639 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
5640 ; CM-NEXT: ALU clause starting at 10:
5641 ; CM-NEXT: MOV * T0.X, 0.0,
5642 ; CM-NEXT: ALU clause starting at 11:
5643 ; CM-NEXT: AND_INT T0.X, T0.X, literal.x,
5644 ; CM-NEXT: MOV * T0.W, literal.x,
5645 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5646 ; CM-NEXT: MOV T0.Y, 0.0,
5647 ; CM-NEXT: MOV * T0.Z, 0.0,
5648 ; CM-NEXT: MOV * T1.X, KC0[2].Z,
5649 ; CM-NEXT: MOV * T2.X, KC0[2].W,
5650 ; CM-NEXT: MOV * T3.X, KC0[3].X,
5651 ; CM-NEXT: MOV * T4.X, literal.x,
5652 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; IR under test: volatile-store the i16, then the whole [3 x i32] aggregate,
; to an undef global pointer (the store address is not what is being tested).
5653 store volatile i16 %arg0, ptr addrspace(1) undef
5654 store volatile [3 x i32] %arg1, ptr addrspace(1) undef
5658 ; FIXME: Why not all scalar loads?
; Kernel taking an i8 followed by a [3 x i16] array argument. Both arguments
; are written with volatile stores to an undef addrspace(1) pointer.
; The assertions below are autogenerated, one group per RUN line of this file.
5659 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
5660 ; SI-LABEL: array_3xi16:
5662 ; SI-NEXT: s_load_dword s0, s[4:5], 0x9
5663 ; SI-NEXT: s_mov_b32 s7, 0xf000
5664 ; SI-NEXT: s_mov_b32 s6, -1
5665 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:42
5666 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:40
5667 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:38
5668 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5669 ; SI-NEXT: v_mov_b32_e32 v3, s0
5670 ; SI-NEXT: buffer_store_byte v3, off, s[4:7], 0
5671 ; SI-NEXT: s_waitcnt vmcnt(0)
5672 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
5673 ; SI-NEXT: s_waitcnt vmcnt(0)
5674 ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
5675 ; SI-NEXT: s_waitcnt vmcnt(0)
5676 ; SI-NEXT: buffer_store_short v2, off, s[4:7], 0
5677 ; SI-NEXT: s_waitcnt vmcnt(0)
5680 ; VI-LABEL: array_3xi16:
5682 ; VI-NEXT: s_add_u32 s0, s4, 38
5683 ; VI-NEXT: s_addc_u32 s1, s5, 0
5684 ; VI-NEXT: s_add_u32 s2, s0, 2
5685 ; VI-NEXT: s_addc_u32 s3, s1, 0
5686 ; VI-NEXT: v_mov_b32_e32 v0, s0
5687 ; VI-NEXT: v_mov_b32_e32 v1, s1
5688 ; VI-NEXT: s_add_u32 s0, s4, 42
5689 ; VI-NEXT: s_addc_u32 s1, s5, 0
5690 ; VI-NEXT: v_mov_b32_e32 v3, s1
5691 ; VI-NEXT: v_mov_b32_e32 v2, s0
5692 ; VI-NEXT: flat_load_ushort v4, v[0:1]
5693 ; VI-NEXT: flat_load_ushort v2, v[2:3]
5694 ; VI-NEXT: v_mov_b32_e32 v0, s2
5695 ; VI-NEXT: v_mov_b32_e32 v1, s3
5696 ; VI-NEXT: flat_load_ushort v0, v[0:1]
5697 ; VI-NEXT: s_load_dword s0, s[4:5], 0x24
5698 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5699 ; VI-NEXT: v_mov_b32_e32 v1, s0
5700 ; VI-NEXT: s_waitcnt vmcnt(0)
5701 ; VI-NEXT: flat_store_byte v[0:1], v1
5702 ; VI-NEXT: s_waitcnt vmcnt(0)
5703 ; VI-NEXT: flat_store_short v[0:1], v2
5704 ; VI-NEXT: s_waitcnt vmcnt(0)
5705 ; VI-NEXT: flat_store_short v[0:1], v4
5706 ; VI-NEXT: s_waitcnt vmcnt(0)
5707 ; VI-NEXT: flat_store_short v[0:1], v0
5708 ; VI-NEXT: s_waitcnt vmcnt(0)
; In the gfx900 output the byte that is stored comes from a scalar dword load
; at 0x0, while the three 16-bit values come from global_load_ushort at
; offsets 6, 4 and 2 from s[8:9] (vector loads rather than scalar loads).
5711 ; GFX9-LABEL: array_3xi16:
5713 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5714 ; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] offset:6
5715 ; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] offset:4
5716 ; GFX9-NEXT: global_load_ushort v3, v0, s[8:9] offset:2
5717 ; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
5718 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5719 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5720 ; GFX9-NEXT: s_waitcnt vmcnt(2)
5721 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
5722 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5723 ; GFX9-NEXT: global_store_short v[0:1], v1, off
5724 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5725 ; GFX9-NEXT: global_store_short v[0:1], v2, off
5726 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5727 ; GFX9-NEXT: global_store_short v[0:1], v3, off
5728 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5729 ; GFX9-NEXT: s_endpgm
5731 ; EG-LABEL: array_3xi16:
5733 ; EG-NEXT: ALU 0, @20, KC0[], KC1[]
5734 ; EG-NEXT: TEX 1 @12
5735 ; EG-NEXT: ALU 11, @21, KC0[], KC1[]
5736 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
5737 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5738 ; EG-NEXT: TEX 0 @16
5739 ; EG-NEXT: ALU 3, @33, KC0[], KC1[]
5740 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5741 ; EG-NEXT: TEX 0 @18
5742 ; EG-NEXT: ALU 3, @37, KC0[], KC1[]
5743 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5745 ; EG-NEXT: Fetch clause starting at 12:
5746 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
5747 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
5748 ; EG-NEXT: Fetch clause starting at 16:
5749 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
5750 ; EG-NEXT: Fetch clause starting at 18:
5751 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
5752 ; EG-NEXT: ALU clause starting at 20:
5753 ; EG-NEXT: MOV * T0.X, 0.0,
5754 ; EG-NEXT: ALU clause starting at 21:
5755 ; EG-NEXT: AND_INT T1.X, T1.X, literal.x,
5756 ; EG-NEXT: MOV * T1.W, literal.x,
5757 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
5758 ; EG-NEXT: MOV * T1.Y, 0.0,
5759 ; EG-NEXT: AND_INT T2.X, T2.X, literal.x,
5760 ; EG-NEXT: MOV * T2.W, literal.x,
5761 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5762 ; EG-NEXT: MOV T2.Y, 0.0,
5763 ; EG-NEXT: MOV T1.Z, 0.0,
5764 ; EG-NEXT: MOV * T2.Z, 0.0,
5765 ; EG-NEXT: MOV * T3.X, literal.x,
5766 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5767 ; EG-NEXT: ALU clause starting at 33:
5768 ; EG-NEXT: AND_INT T2.X, T1.X, literal.x,
5769 ; EG-NEXT: MOV T2.Y, 0.0,
5770 ; EG-NEXT: MOV * T2.Z, 0.0,
5771 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5772 ; EG-NEXT: ALU clause starting at 37:
5773 ; EG-NEXT: AND_INT T2.X, T0.X, literal.x,
5774 ; EG-NEXT: MOV T2.Y, 0.0,
5775 ; EG-NEXT: MOV * T2.Z, 0.0,
5776 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5778 ; CM-LABEL: array_3xi16:
5780 ; CM-NEXT: ALU 0, @20, KC0[], KC1[]
5781 ; CM-NEXT: TEX 1 @12
5782 ; CM-NEXT: ALU 11, @21, KC0[], KC1[]
5783 ; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X
5784 ; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5785 ; CM-NEXT: TEX 0 @16
5786 ; CM-NEXT: ALU 3, @33, KC0[], KC1[]
5787 ; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5788 ; CM-NEXT: TEX 0 @18
5789 ; CM-NEXT: ALU 3, @37, KC0[], KC1[]
5790 ; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5792 ; CM-NEXT: Fetch clause starting at 12:
5793 ; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
5794 ; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
5795 ; CM-NEXT: Fetch clause starting at 16:
5796 ; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
5797 ; CM-NEXT: Fetch clause starting at 18:
5798 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
5799 ; CM-NEXT: ALU clause starting at 20:
5800 ; CM-NEXT: MOV * T0.X, 0.0,
5801 ; CM-NEXT: ALU clause starting at 21:
5802 ; CM-NEXT: AND_INT T1.X, T1.X, literal.x,
5803 ; CM-NEXT: MOV * T1.W, literal.x,
5804 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
5805 ; CM-NEXT: MOV * T1.Y, 0.0,
5806 ; CM-NEXT: AND_INT T2.X, T2.X, literal.x,
5807 ; CM-NEXT: MOV * T2.W, literal.x,
5808 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5809 ; CM-NEXT: MOV T2.Y, 0.0,
5810 ; CM-NEXT: MOV * T1.Z, 0.0,
5811 ; CM-NEXT: MOV * T2.Z, 0.0,
5812 ; CM-NEXT: MOV * T3.X, literal.x,
5813 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5814 ; CM-NEXT: ALU clause starting at 33:
5815 ; CM-NEXT: AND_INT T2.X, T1.X, literal.x,
5816 ; CM-NEXT: MOV T2.Y, 0.0,
5817 ; CM-NEXT: MOV * T2.Z, 0.0,
5818 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5819 ; CM-NEXT: ALU clause starting at 37:
5820 ; CM-NEXT: AND_INT T2.X, T0.X, literal.x,
5821 ; CM-NEXT: MOV T2.Y, 0.0,
5822 ; CM-NEXT: MOV * T2.Z, 0.0,
5823 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; IR under test: volatile-store the i8, then the whole [3 x i16] aggregate,
; to an undef global pointer (the store address is not what is being tested).
5824 store volatile i8 %arg0, ptr addrspace(1) undef
5825 store volatile [3 x i16] %arg1, ptr addrspace(1) undef
; Kernel taking an unnamed i8 followed by a one-element [1 x i8] array. Only
; the array element is used: it is extracted and written with a volatile store
; to an undef addrspace(1) pointer. The assertions below are autogenerated; the
; two r600 RUN lines share a single group of checks for this function.
5829 define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
5830 ; SI-LABEL: small_array_round_down_offset:
5832 ; SI-NEXT: s_mov_b32 s7, 0xf000
5833 ; SI-NEXT: s_mov_b32 s6, -1
5834 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:37
5835 ; SI-NEXT: s_waitcnt vmcnt(0)
5836 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
5837 ; SI-NEXT: s_waitcnt vmcnt(0)
5840 ; VI-LABEL: small_array_round_down_offset:
5842 ; VI-NEXT: s_add_u32 s0, s4, 37
5843 ; VI-NEXT: s_addc_u32 s1, s5, 0
5844 ; VI-NEXT: v_mov_b32_e32 v0, s0
5845 ; VI-NEXT: v_mov_b32_e32 v1, s1
5846 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
5847 ; VI-NEXT: s_waitcnt vmcnt(0)
5848 ; VI-NEXT: flat_store_byte v[0:1], v0
5849 ; VI-NEXT: s_waitcnt vmcnt(0)
; In the gfx900 output the array element is read with a single byte load at
; offset 1 from s[8:9], i.e. directly after the leading i8.
5852 ; GFX9-LABEL: small_array_round_down_offset:
5854 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5855 ; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] offset:1
5856 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5857 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
5858 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5859 ; GFX9-NEXT: s_endpgm
5861 ; EGCM-LABEL: small_array_round_down_offset:
5863 ; EGCM-NEXT: ALU 0, @8, KC0[], KC1[]
5864 ; EGCM-NEXT: TEX 0 @6
5865 ; EGCM-NEXT: ALU 6, @9, KC0[], KC1[]
5866 ; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
5869 ; EGCM-NEXT: Fetch clause starting at 6:
5870 ; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3
5871 ; EGCM-NEXT: ALU clause starting at 8:
5872 ; EGCM-NEXT: MOV * T0.X, 0.0,
5873 ; EGCM-NEXT: ALU clause starting at 9:
5874 ; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x,
5875 ; EGCM-NEXT: MOV * T0.W, literal.x,
5876 ; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
5877 ; EGCM-NEXT: MOV T0.Y, 0.0,
5878 ; EGCM-NEXT: MOV * T0.Z, 0.0,
5879 ; EGCM-NEXT: MOV * T1.X, literal.x,
5880 ; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; IR under test: pull the single i8 out of the array and volatile-store it to
; an undef global pointer.
5881 %val = extractvalue [1 x i8] %arg, 0
5882 store volatile i8 %val, ptr addrspace(1) undef
; Kernel with an output pointer, a byref(i32) argument in addrspace(4) that
; requests align(256), and a trailing i32. The i32 behind the byref pointer is
; loaded and stored to %out, then %after.offset is stored to %out, both with
; volatile stores. The assertions below are autogenerated, one group per RUN
; line of this file.
5886 define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
5887 ; SI-LABEL: byref_align_constant_i32_arg:
5889 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x49
5890 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
5891 ; SI-NEXT: s_mov_b32 s3, 0xf000
5892 ; SI-NEXT: s_mov_b32 s2, -1
5893 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5894 ; SI-NEXT: v_mov_b32_e32 v0, s6
5895 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5896 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5897 ; SI-NEXT: v_mov_b32_e32 v0, s7
5898 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5899 ; SI-NEXT: s_waitcnt vmcnt(0)
5902 ; VI-LABEL: byref_align_constant_i32_arg:
5904 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
5905 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x124
5906 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5907 ; VI-NEXT: v_mov_b32_e32 v0, s0
5908 ; VI-NEXT: v_mov_b32_e32 v1, s1
5909 ; VI-NEXT: v_mov_b32_e32 v2, s2
5910 ; VI-NEXT: v_mov_b32_e32 v3, s3
5911 ; VI-NEXT: flat_store_dword v[0:1], v2
5912 ; VI-NEXT: s_waitcnt vmcnt(0)
5913 ; VI-NEXT: flat_store_dword v[0:1], v3
5914 ; VI-NEXT: s_waitcnt vmcnt(0)
; In the gfx900 output one s_load_dwordx2 at 0x100 (256) yields both stored
; values: s0 is stored first and s1 second. The store address comes from the
; dwordx2 loaded at 0x0.
5917 ; GFX9-LABEL: byref_align_constant_i32_arg:
5919 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x100
5920 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
5921 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5922 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5923 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
5924 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
5925 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5926 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5927 ; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
5928 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5929 ; GFX9-NEXT: s_endpgm
5931 ; EG-LABEL: byref_align_constant_i32_arg:
5933 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
5935 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
5936 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
5937 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
5939 ; EG-NEXT: Fetch clause starting at 6:
5940 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
5941 ; EG-NEXT: ALU clause starting at 8:
5942 ; EG-NEXT: MOV * T0.X, KC0[18].Y,
5943 ; EG-NEXT: ALU clause starting at 9:
5944 ; EG-NEXT: MOV T1.X, KC0[18].Z,
5945 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
5946 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5948 ; CM-LABEL: byref_align_constant_i32_arg:
5950 ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
5952 ; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
5953 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
5954 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
5956 ; CM-NEXT: Fetch clause starting at 6:
5957 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
5958 ; CM-NEXT: ALU clause starting at 8:
5959 ; CM-NEXT: MOV * T0.X, KC0[18].Y,
5960 ; CM-NEXT: ALU clause starting at 9:
5961 ; CM-NEXT: MOV * T1.X, KC0[18].Z,
5962 ; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
5963 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: read the i32 through the byref pointer, store it to %out,
; then store the argument that follows the byref slot to the same address.
5964 %in = load i32, ptr addrspace(4) %in.byref
5965 store volatile i32 %in, ptr addrspace(1) %out, align 4
5966 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
5970 define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
5971 ; SI-LABEL: byref_natural_align_constant_v16i32_arg:
5973 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
5974 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
5975 ; SI-NEXT: s_load_dword s4, s[4:5], 0x29
5976 ; SI-NEXT: s_mov_b32 s3, 0xf000
5977 ; SI-NEXT: s_mov_b32 s2, -1
5978 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5979 ; SI-NEXT: v_mov_b32_e32 v0, s20
5980 ; SI-NEXT: v_mov_b32_e32 v1, s21
5981 ; SI-NEXT: v_mov_b32_e32 v2, s22
5982 ; SI-NEXT: v_mov_b32_e32 v3, s23
5983 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
5984 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5985 ; SI-NEXT: v_mov_b32_e32 v0, s16
5986 ; SI-NEXT: v_mov_b32_e32 v1, s17
5987 ; SI-NEXT: v_mov_b32_e32 v2, s18
5988 ; SI-NEXT: v_mov_b32_e32 v3, s19
5989 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
5990 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5991 ; SI-NEXT: v_mov_b32_e32 v0, s12
5992 ; SI-NEXT: v_mov_b32_e32 v1, s13
5993 ; SI-NEXT: v_mov_b32_e32 v2, s14
5994 ; SI-NEXT: v_mov_b32_e32 v3, s15
5995 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
5996 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5997 ; SI-NEXT: v_mov_b32_e32 v0, s8
5998 ; SI-NEXT: v_mov_b32_e32 v1, s9
5999 ; SI-NEXT: v_mov_b32_e32 v2, s10
6000 ; SI-NEXT: v_mov_b32_e32 v3, s11
6001 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
6002 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6003 ; SI-NEXT: v_mov_b32_e32 v0, s4
6004 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6005 ; SI-NEXT: s_waitcnt vmcnt(0)
6008 ; VI-LABEL: byref_natural_align_constant_v16i32_arg:
6010 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
6011 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
6012 ; VI-NEXT: s_load_dword s4, s[4:5], 0xa4
6013 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6014 ; VI-NEXT: v_mov_b32_e32 v0, s20
6015 ; VI-NEXT: s_add_u32 s2, s0, 48
6016 ; VI-NEXT: s_addc_u32 s3, s1, 0
6017 ; VI-NEXT: v_mov_b32_e32 v5, s3
6018 ; VI-NEXT: v_mov_b32_e32 v4, s2
6019 ; VI-NEXT: s_add_u32 s2, s0, 32
6020 ; VI-NEXT: v_mov_b32_e32 v1, s21
6021 ; VI-NEXT: v_mov_b32_e32 v2, s22
6022 ; VI-NEXT: v_mov_b32_e32 v3, s23
6023 ; VI-NEXT: s_addc_u32 s3, s1, 0
6024 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6025 ; VI-NEXT: s_waitcnt vmcnt(0)
6026 ; VI-NEXT: v_mov_b32_e32 v5, s3
6027 ; VI-NEXT: v_mov_b32_e32 v4, s2
6028 ; VI-NEXT: s_add_u32 s2, s0, 16
6029 ; VI-NEXT: v_mov_b32_e32 v0, s16
6030 ; VI-NEXT: v_mov_b32_e32 v1, s17
6031 ; VI-NEXT: v_mov_b32_e32 v2, s18
6032 ; VI-NEXT: v_mov_b32_e32 v3, s19
6033 ; VI-NEXT: s_addc_u32 s3, s1, 0
6034 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6035 ; VI-NEXT: s_waitcnt vmcnt(0)
6036 ; VI-NEXT: v_mov_b32_e32 v5, s3
6037 ; VI-NEXT: v_mov_b32_e32 v0, s12
6038 ; VI-NEXT: v_mov_b32_e32 v1, s13
6039 ; VI-NEXT: v_mov_b32_e32 v2, s14
6040 ; VI-NEXT: v_mov_b32_e32 v3, s15
6041 ; VI-NEXT: v_mov_b32_e32 v4, s2
6042 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6043 ; VI-NEXT: s_waitcnt vmcnt(0)
6044 ; VI-NEXT: v_mov_b32_e32 v5, s1
6045 ; VI-NEXT: v_mov_b32_e32 v0, s8
6046 ; VI-NEXT: v_mov_b32_e32 v1, s9
6047 ; VI-NEXT: v_mov_b32_e32 v2, s10
6048 ; VI-NEXT: v_mov_b32_e32 v3, s11
6049 ; VI-NEXT: v_mov_b32_e32 v4, s0
6050 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6051 ; VI-NEXT: s_waitcnt vmcnt(0)
6052 ; VI-NEXT: v_mov_b32_e32 v0, s4
6053 ; VI-NEXT: flat_store_dword v[4:5], v0
6054 ; VI-NEXT: s_waitcnt vmcnt(0)
6057 ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
6059 ; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
6060 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
6061 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x80
6062 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
6063 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6064 ; GFX9-NEXT: v_mov_b32_e32 v0, s24
6065 ; GFX9-NEXT: v_mov_b32_e32 v1, s25
6066 ; GFX9-NEXT: v_mov_b32_e32 v2, s26
6067 ; GFX9-NEXT: v_mov_b32_e32 v3, s27
6068 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
6069 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6070 ; GFX9-NEXT: v_mov_b32_e32 v0, s20
6071 ; GFX9-NEXT: v_mov_b32_e32 v1, s21
6072 ; GFX9-NEXT: v_mov_b32_e32 v2, s22
6073 ; GFX9-NEXT: v_mov_b32_e32 v3, s23
6074 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
6075 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6076 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
6077 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
6078 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
6079 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
6080 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
6081 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6082 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
6083 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
6084 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
6085 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
6086 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
6087 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6088 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6089 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
6090 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6091 ; GFX9-NEXT: s_endpgm
6093 ; EG-LABEL: byref_natural_align_constant_v16i32_arg:
6095 ; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
6096 ; EG-NEXT: TEX 0 @16
6097 ; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
6098 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
6099 ; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
6100 ; EG-NEXT: TEX 0 @18
6101 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6102 ; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
6103 ; EG-NEXT: TEX 0 @20
6104 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6105 ; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
6106 ; EG-NEXT: TEX 0 @22
6107 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
6108 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6111 ; EG-NEXT: Fetch clause starting at 16:
6112 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
6113 ; EG-NEXT: Fetch clause starting at 18:
6114 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
6115 ; EG-NEXT: Fetch clause starting at 20:
6116 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
6117 ; EG-NEXT: Fetch clause starting at 22:
6118 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
6119 ; EG-NEXT: ALU clause starting at 24:
6120 ; EG-NEXT: MOV * T0.X, KC0[6].Y,
6121 ; EG-NEXT: ALU clause starting at 25:
6122 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6123 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
6124 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
6125 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6126 ; EG-NEXT: ALU clause starting at 29:
6127 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6128 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
6129 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
6130 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6131 ; EG-NEXT: ALU clause starting at 33:
6132 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6133 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
6134 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
6135 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6136 ; EG-NEXT: ALU clause starting at 37:
6137 ; EG-NEXT: MOV T1.X, KC0[10].Y,
6138 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
6139 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6141 ; CM-LABEL: byref_natural_align_constant_v16i32_arg:
6143 ; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
6144 ; CM-NEXT: TEX 0 @16
6145 ; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
6146 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
6147 ; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
6148 ; CM-NEXT: TEX 0 @18
6149 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6150 ; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
6151 ; CM-NEXT: TEX 0 @20
6152 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6153 ; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
6154 ; CM-NEXT: TEX 0 @22
6155 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
6156 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6159 ; CM-NEXT: Fetch clause starting at 16:
6160 ; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
6161 ; CM-NEXT: Fetch clause starting at 18:
6162 ; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
6163 ; CM-NEXT: Fetch clause starting at 20:
6164 ; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
6165 ; CM-NEXT: Fetch clause starting at 22:
6166 ; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
6167 ; CM-NEXT: ALU clause starting at 24:
6168 ; CM-NEXT: MOV * T0.X, KC0[6].Y,
6169 ; CM-NEXT: ALU clause starting at 25:
6170 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6171 ; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
6172 ; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
6173 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6174 ; CM-NEXT: ALU clause starting at 29:
6175 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6176 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
6177 ; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
6178 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6179 ; CM-NEXT: ALU clause starting at 33:
6180 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6181 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
6182 ; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
6183 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6184 ; CM-NEXT: ALU clause starting at 37:
6185 ; CM-NEXT: MOV * T1.X, KC0[10].Y,
6186 ; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
6187 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6188 %in = load <16 x i32>, ptr addrspace(4) %in.byref
6189 store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
6190 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4