1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s
4 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
6 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
; Kernel taking a plain i8 argument (no zeroext/signext attribute). The body
; zero-extends %in to i32 and stores the result through %out.
8 define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
11 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
12 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
13 ; SI-NEXT: s_mov_b32 s3, 0xf000
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_and_b32 s4, s2, 0xff
16 ; SI-NEXT: s_mov_b32 s2, -1
17 ; SI-NEXT: v_mov_b32_e32 v0, s4
18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
23 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
25 ; VI-NEXT: s_waitcnt lgkmcnt(0)
26 ; VI-NEXT: s_and_b32 s2, s2, 0xff
27 ; VI-NEXT: v_mov_b32_e32 v0, s0
28 ; VI-NEXT: v_mov_b32_e32 v1, s1
29 ; VI-NEXT: v_mov_b32_e32 v2, s2
30 ; VI-NEXT: flat_store_dword v[0:1], v2
35 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
36 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
37 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
38 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
40 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
41 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
46 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
48 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
49 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
52 ; EG-NEXT: Fetch clause starting at 6:
53 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
54 ; EG-NEXT: ALU clause starting at 8:
55 ; EG-NEXT: MOV * T0.X, 0.0,
56 ; EG-NEXT: ALU clause starting at 9:
57 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
58 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
62 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
64 ; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
65 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
68 ; CM-NEXT: Fetch clause starting at 6:
69 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
70 ; CM-NEXT: ALU clause starting at 8:
71 ; CM-NEXT: MOV * T0.X, 0.0,
72 ; CM-NEXT: ALU clause starting at 9:
73 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
74 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
75 %ext = zext i8 %in to i32
76 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking an i8 argument marked zeroext. The body zero-extends %in to
; i32 and stores the result through %out.
; NOTE(review): the r600 (redwood/cayman) expectations below use BFE_INT with
; width 8, the same sequence as in @i8_sext_arg, while the amdgcn expectations
; mask with 0xff - confirm this is the intended lowering for a zeroext argument.
80 define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind {
81 ; SI-LABEL: i8_zext_arg:
83 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
84 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
85 ; SI-NEXT: s_mov_b32 s3, 0xf000
86 ; SI-NEXT: s_waitcnt lgkmcnt(0)
87 ; SI-NEXT: s_and_b32 s4, s2, 0xff
88 ; SI-NEXT: s_mov_b32 s2, -1
89 ; SI-NEXT: v_mov_b32_e32 v0, s4
90 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
93 ; VI-LABEL: i8_zext_arg:
95 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
96 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
97 ; VI-NEXT: s_waitcnt lgkmcnt(0)
98 ; VI-NEXT: s_and_b32 s2, s2, 0xff
99 ; VI-NEXT: v_mov_b32_e32 v0, s0
100 ; VI-NEXT: v_mov_b32_e32 v1, s1
101 ; VI-NEXT: v_mov_b32_e32 v2, s2
102 ; VI-NEXT: flat_store_dword v[0:1], v2
105 ; GFX9-LABEL: i8_zext_arg:
107 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
108 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
109 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
112 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
113 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
114 ; GFX9-NEXT: s_endpgm
116 ; EG-LABEL: i8_zext_arg:
118 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
120 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
121 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
124 ; EG-NEXT: Fetch clause starting at 6:
125 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
126 ; EG-NEXT: ALU clause starting at 8:
127 ; EG-NEXT: MOV * T0.X, 0.0,
128 ; EG-NEXT: ALU clause starting at 9:
129 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
130 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
131 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
133 ; CM-LABEL: i8_zext_arg:
135 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
137 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
138 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
141 ; CM-NEXT: Fetch clause starting at 6:
142 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
143 ; CM-NEXT: ALU clause starting at 8:
144 ; CM-NEXT: MOV * T0.X, 0.0,
145 ; CM-NEXT: ALU clause starting at 9:
146 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
147 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
148 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
149 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
150 %ext = zext i8 %in to i32
151 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking an i8 argument marked signext. The body sign-extends %in to
; i32 and stores the result through %out.
155 define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind {
156 ; SI-LABEL: i8_sext_arg:
158 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
159 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
160 ; SI-NEXT: s_mov_b32 s3, 0xf000
161 ; SI-NEXT: s_waitcnt lgkmcnt(0)
162 ; SI-NEXT: s_sext_i32_i8 s4, s2
163 ; SI-NEXT: s_mov_b32 s2, -1
164 ; SI-NEXT: v_mov_b32_e32 v0, s4
165 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
168 ; VI-LABEL: i8_sext_arg:
170 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
171 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
173 ; VI-NEXT: s_sext_i32_i8 s2, s2
174 ; VI-NEXT: v_mov_b32_e32 v0, s0
175 ; VI-NEXT: v_mov_b32_e32 v1, s1
176 ; VI-NEXT: v_mov_b32_e32 v2, s2
177 ; VI-NEXT: flat_store_dword v[0:1], v2
180 ; GFX9-LABEL: i8_sext_arg:
182 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
183 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
184 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
185 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
186 ; GFX9-NEXT: s_sext_i32_i8 s2, s2
187 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
188 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
189 ; GFX9-NEXT: s_endpgm
191 ; EG-LABEL: i8_sext_arg:
193 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
195 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
196 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
199 ; EG-NEXT: Fetch clause starting at 6:
200 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
201 ; EG-NEXT: ALU clause starting at 8:
202 ; EG-NEXT: MOV * T0.X, 0.0,
203 ; EG-NEXT: ALU clause starting at 9:
204 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
205 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
206 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
208 ; CM-LABEL: i8_sext_arg:
210 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
212 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
213 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
216 ; CM-NEXT: Fetch clause starting at 6:
217 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
218 ; CM-NEXT: ALU clause starting at 8:
219 ; CM-NEXT: MOV * T0.X, 0.0,
220 ; CM-NEXT: ALU clause starting at 9:
221 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
222 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
223 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
224 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
225 %ext = sext i8 %in to i32
226 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking a plain i16 argument (no zeroext/signext attribute). The body
; zero-extends %in to i32 and stores the result through %out.
230 define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind {
233 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
234 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
235 ; SI-NEXT: s_mov_b32 s3, 0xf000
236 ; SI-NEXT: s_waitcnt lgkmcnt(0)
237 ; SI-NEXT: s_and_b32 s4, s2, 0xffff
238 ; SI-NEXT: s_mov_b32 s2, -1
239 ; SI-NEXT: v_mov_b32_e32 v0, s4
240 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
245 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
246 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
247 ; VI-NEXT: s_waitcnt lgkmcnt(0)
248 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
249 ; VI-NEXT: v_mov_b32_e32 v0, s0
250 ; VI-NEXT: v_mov_b32_e32 v1, s1
251 ; VI-NEXT: v_mov_b32_e32 v2, s2
252 ; VI-NEXT: flat_store_dword v[0:1], v2
255 ; GFX9-LABEL: i16_arg:
257 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
258 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
259 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
262 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
263 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
264 ; GFX9-NEXT: s_endpgm
268 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
270 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
271 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
274 ; EG-NEXT: Fetch clause starting at 6:
275 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
276 ; EG-NEXT: ALU clause starting at 8:
277 ; EG-NEXT: MOV * T0.X, 0.0,
278 ; EG-NEXT: ALU clause starting at 9:
279 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
280 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
284 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
286 ; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
287 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
290 ; CM-NEXT: Fetch clause starting at 6:
291 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
292 ; CM-NEXT: ALU clause starting at 8:
293 ; CM-NEXT: MOV * T0.X, 0.0,
294 ; CM-NEXT: ALU clause starting at 9:
295 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
296 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
297 %ext = zext i16 %in to i32
298 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking an i16 argument marked zeroext. The body zero-extends %in to
; i32 and stores the result through %out.
; NOTE(review): the r600 (redwood/cayman) expectations below use BFE_INT with
; width 16, the same sequence as in @i16_sext_arg, while the amdgcn
; expectations mask with 0xffff - confirm this is the intended lowering for a
; zeroext argument.
302 define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind {
303 ; SI-LABEL: i16_zext_arg:
305 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
306 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
307 ; SI-NEXT: s_mov_b32 s3, 0xf000
308 ; SI-NEXT: s_waitcnt lgkmcnt(0)
309 ; SI-NEXT: s_and_b32 s4, s2, 0xffff
310 ; SI-NEXT: s_mov_b32 s2, -1
311 ; SI-NEXT: v_mov_b32_e32 v0, s4
312 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
315 ; VI-LABEL: i16_zext_arg:
317 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
318 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
319 ; VI-NEXT: s_waitcnt lgkmcnt(0)
320 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
321 ; VI-NEXT: v_mov_b32_e32 v0, s0
322 ; VI-NEXT: v_mov_b32_e32 v1, s1
323 ; VI-NEXT: v_mov_b32_e32 v2, s2
324 ; VI-NEXT: flat_store_dword v[0:1], v2
327 ; GFX9-LABEL: i16_zext_arg:
329 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
330 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
331 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
332 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
334 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
335 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
336 ; GFX9-NEXT: s_endpgm
338 ; EG-LABEL: i16_zext_arg:
340 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
342 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
343 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
346 ; EG-NEXT: Fetch clause starting at 6:
347 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
348 ; EG-NEXT: ALU clause starting at 8:
349 ; EG-NEXT: MOV * T0.X, 0.0,
350 ; EG-NEXT: ALU clause starting at 9:
351 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
352 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
353 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
355 ; CM-LABEL: i16_zext_arg:
357 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
359 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
360 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
363 ; CM-NEXT: Fetch clause starting at 6:
364 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
365 ; CM-NEXT: ALU clause starting at 8:
366 ; CM-NEXT: MOV * T0.X, 0.0,
367 ; CM-NEXT: ALU clause starting at 9:
368 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
369 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
370 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
371 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
372 %ext = zext i16 %in to i32
373 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking an i16 argument marked signext. The body sign-extends %in to
; i32 and stores the result through %out.
377 define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind {
378 ; SI-LABEL: i16_sext_arg:
380 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
381 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
382 ; SI-NEXT: s_mov_b32 s3, 0xf000
383 ; SI-NEXT: s_waitcnt lgkmcnt(0)
384 ; SI-NEXT: s_sext_i32_i16 s4, s2
385 ; SI-NEXT: s_mov_b32 s2, -1
386 ; SI-NEXT: v_mov_b32_e32 v0, s4
387 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
390 ; VI-LABEL: i16_sext_arg:
392 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
393 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
394 ; VI-NEXT: s_waitcnt lgkmcnt(0)
395 ; VI-NEXT: s_sext_i32_i16 s2, s2
396 ; VI-NEXT: v_mov_b32_e32 v0, s0
397 ; VI-NEXT: v_mov_b32_e32 v1, s1
398 ; VI-NEXT: v_mov_b32_e32 v2, s2
399 ; VI-NEXT: flat_store_dword v[0:1], v2
402 ; GFX9-LABEL: i16_sext_arg:
404 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
405 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
406 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
407 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
408 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
409 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
410 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
411 ; GFX9-NEXT: s_endpgm
413 ; EG-LABEL: i16_sext_arg:
415 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
417 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
418 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
421 ; EG-NEXT: Fetch clause starting at 6:
422 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
423 ; EG-NEXT: ALU clause starting at 8:
424 ; EG-NEXT: MOV * T0.X, 0.0,
425 ; EG-NEXT: ALU clause starting at 9:
426 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
427 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
428 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
430 ; CM-LABEL: i16_sext_arg:
432 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
434 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
435 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
438 ; CM-NEXT: Fetch clause starting at 6:
439 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
440 ; CM-NEXT: ALU clause starting at 8:
441 ; CM-NEXT: MOV * T0.X, 0.0,
442 ; CM-NEXT: ALU clause starting at 9:
443 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
444 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
445 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
446 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
447 %ext = sext i16 %in to i32
448 store i32 %ext, ptr addrspace(1) %out, align 4
; Kernel taking an i32 argument and storing it unchanged through %out
; (no extension or conversion in the body).
452 define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind {
454 ; SI: ; %bb.0: ; %entry
455 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
456 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
457 ; SI-NEXT: s_mov_b32 s3, 0xf000
458 ; SI-NEXT: s_mov_b32 s2, -1
459 ; SI-NEXT: s_waitcnt lgkmcnt(0)
460 ; SI-NEXT: v_mov_b32_e32 v0, s4
461 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
465 ; VI: ; %bb.0: ; %entry
466 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
467 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
468 ; VI-NEXT: s_waitcnt lgkmcnt(0)
469 ; VI-NEXT: v_mov_b32_e32 v0, s2
470 ; VI-NEXT: v_mov_b32_e32 v1, s3
471 ; VI-NEXT: v_mov_b32_e32 v2, s0
472 ; VI-NEXT: flat_store_dword v[0:1], v2
475 ; GFX9-LABEL: i32_arg:
476 ; GFX9: ; %bb.0: ; %entry
477 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
478 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
479 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
480 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
482 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
483 ; GFX9-NEXT: s_endpgm
486 ; EG: ; %bb.0: ; %entry
487 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
488 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
491 ; EG-NEXT: ALU clause starting at 4:
492 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
493 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
494 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
497 ; CM: ; %bb.0: ; %entry
498 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
499 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
502 ; CM-NEXT: ALU clause starting at 4:
503 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
504 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
505 ; CM-NEXT: MOV * T1.X, KC0[2].Z,
507 store i32 %in, ptr addrspace(1) %out, align 4
; Kernel taking a float argument and storing it unchanged through %out
; (no conversion in the body).
511 define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind {
513 ; SI: ; %bb.0: ; %entry
514 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
515 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
516 ; SI-NEXT: s_mov_b32 s3, 0xf000
517 ; SI-NEXT: s_mov_b32 s2, -1
518 ; SI-NEXT: s_waitcnt lgkmcnt(0)
519 ; SI-NEXT: v_mov_b32_e32 v0, s4
520 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
524 ; VI: ; %bb.0: ; %entry
525 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
526 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
527 ; VI-NEXT: s_waitcnt lgkmcnt(0)
528 ; VI-NEXT: v_mov_b32_e32 v0, s2
529 ; VI-NEXT: v_mov_b32_e32 v1, s3
530 ; VI-NEXT: v_mov_b32_e32 v2, s0
531 ; VI-NEXT: flat_store_dword v[0:1], v2
534 ; GFX9-LABEL: f32_arg:
535 ; GFX9: ; %bb.0: ; %entry
536 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
537 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
538 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
539 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
540 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
541 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
542 ; GFX9-NEXT: s_endpgm
545 ; EG: ; %bb.0: ; %entry
546 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
547 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
550 ; EG-NEXT: ALU clause starting at 4:
551 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
552 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
553 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
556 ; CM: ; %bb.0: ; %entry
557 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
558 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
561 ; CM-NEXT: ALU clause starting at 4:
562 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
563 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
564 ; CM-NEXT: MOV * T1.X, KC0[2].Z,
566 store float %in, ptr addrspace(1) %out, align 4
; Kernel taking a <2 x i8> vector argument and storing it through %out.
; The store carries no explicit alignment and the function has no
; nocapture/nounwind attributes, unlike the scalar kernels in this file.
570 define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
571 ; SI-LABEL: v2i8_arg:
572 ; SI: ; %bb.0: ; %entry
573 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
574 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
575 ; SI-NEXT: s_mov_b32 s3, 0xf000
576 ; SI-NEXT: s_mov_b32 s2, -1
577 ; SI-NEXT: s_waitcnt lgkmcnt(0)
578 ; SI-NEXT: v_mov_b32_e32 v0, s4
579 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
582 ; VI-LABEL: v2i8_arg:
583 ; VI: ; %bb.0: ; %entry
584 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
585 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
586 ; VI-NEXT: s_waitcnt lgkmcnt(0)
587 ; VI-NEXT: v_mov_b32_e32 v0, s2
588 ; VI-NEXT: v_mov_b32_e32 v1, s3
589 ; VI-NEXT: v_mov_b32_e32 v2, s0
590 ; VI-NEXT: flat_store_short v[0:1], v2
593 ; GFX9-LABEL: v2i8_arg:
594 ; GFX9: ; %bb.0: ; %entry
595 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
596 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
597 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
598 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
600 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
601 ; GFX9-NEXT: s_endpgm
603 ; EG-LABEL: v2i8_arg:
604 ; EG: ; %bb.0: ; %entry
605 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
607 ; EG-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[]
608 ; EG-NEXT: MEM_RAT MSKOR T4.XW, T5.X
611 ; EG-NEXT: Fetch clause starting at 6:
612 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
613 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
614 ; EG-NEXT: ALU clause starting at 10:
615 ; EG-NEXT: MOV * T4.X, 0.0,
616 ; EG-NEXT: ALU clause starting at 11:
617 ; EG-NEXT: LSHL T0.W, T5.X, literal.x,
618 ; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
619 ; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
620 ; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
621 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
622 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
623 ; EG-NEXT: AND_INT T0.W, PS, literal.x,
624 ; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
625 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
626 ; EG-NEXT: LSHL T4.X, PV.W, PS,
627 ; EG-NEXT: LSHL * T4.W, literal.x, PS,
628 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
629 ; EG-NEXT: MOV T4.Y, 0.0,
630 ; EG-NEXT: MOV * T4.Z, 0.0,
631 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
632 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
634 ; CM-LABEL: v2i8_arg:
635 ; CM: ; %bb.0: ; %entry
636 ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
638 ; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[]
639 ; CM-NEXT: MEM_RAT MSKOR T4.XW, T5.X
642 ; CM-NEXT: Fetch clause starting at 6:
643 ; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
644 ; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
645 ; CM-NEXT: ALU clause starting at 10:
646 ; CM-NEXT: MOV * T4.X, 0.0,
647 ; CM-NEXT: ALU clause starting at 11:
648 ; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
649 ; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
650 ; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
651 ; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x,
652 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
653 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
654 ; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
655 ; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
656 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
657 ; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
658 ; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
659 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
660 ; CM-NEXT: MOV T4.Y, 0.0,
661 ; CM-NEXT: MOV * T4.Z, 0.0,
662 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
663 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
665 store <2 x i8> %in, ptr addrspace(1) %out
; Kernel taking a <2 x i16> vector argument and storing it through %out.
; The store carries no explicit alignment and the function has no
; nocapture/nounwind attributes.
669 define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
670 ; SI-LABEL: v2i16_arg:
671 ; SI: ; %bb.0: ; %entry
672 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
673 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
674 ; SI-NEXT: s_mov_b32 s3, 0xf000
675 ; SI-NEXT: s_mov_b32 s2, -1
676 ; SI-NEXT: s_waitcnt lgkmcnt(0)
677 ; SI-NEXT: v_mov_b32_e32 v0, s4
678 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
681 ; VI-LABEL: v2i16_arg:
682 ; VI: ; %bb.0: ; %entry
683 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
684 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
685 ; VI-NEXT: s_waitcnt lgkmcnt(0)
686 ; VI-NEXT: v_mov_b32_e32 v0, s2
687 ; VI-NEXT: v_mov_b32_e32 v1, s3
688 ; VI-NEXT: v_mov_b32_e32 v2, s0
689 ; VI-NEXT: flat_store_dword v[0:1], v2
692 ; GFX9-LABEL: v2i16_arg:
693 ; GFX9: ; %bb.0: ; %entry
694 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
695 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
696 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
697 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
698 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
699 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
700 ; GFX9-NEXT: s_endpgm
702 ; EG-LABEL: v2i16_arg:
703 ; EG: ; %bb.0: ; %entry
704 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
706 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
707 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
710 ; EG-NEXT: Fetch clause starting at 6:
711 ; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
712 ; EG-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3
713 ; EG-NEXT: ALU clause starting at 10:
714 ; EG-NEXT: MOV * T4.X, 0.0,
715 ; EG-NEXT: ALU clause starting at 11:
716 ; EG-NEXT: LSHL T0.W, T5.X, literal.x,
717 ; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
718 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
719 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
720 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
721 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
723 ; CM-LABEL: v2i16_arg:
724 ; CM: ; %bb.0: ; %entry
725 ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
727 ; CM-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
728 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
731 ; CM-NEXT: Fetch clause starting at 6:
732 ; CM-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
733 ; CM-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3
734 ; CM-NEXT: ALU clause starting at 10:
735 ; CM-NEXT: MOV * T4.X, 0.0,
736 ; CM-NEXT: ALU clause starting at 11:
737 ; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
738 ; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
739 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
740 ; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W,
741 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
742 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
744 store <2 x i16> %in, ptr addrspace(1) %out
; Kernel taking a <2 x i32> vector argument and storing it through %out with
; 4-byte alignment.
748 define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind {
749 ; SI-LABEL: v2i32_arg:
750 ; SI: ; %bb.0: ; %entry
751 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
752 ; SI-NEXT: s_mov_b32 s7, 0xf000
753 ; SI-NEXT: s_mov_b32 s6, -1
754 ; SI-NEXT: s_waitcnt lgkmcnt(0)
755 ; SI-NEXT: s_mov_b32 s4, s0
756 ; SI-NEXT: s_mov_b32 s5, s1
757 ; SI-NEXT: v_mov_b32_e32 v0, s2
758 ; SI-NEXT: v_mov_b32_e32 v1, s3
759 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
762 ; VI-LABEL: v2i32_arg:
763 ; VI: ; %bb.0: ; %entry
764 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
765 ; VI-NEXT: s_waitcnt lgkmcnt(0)
766 ; VI-NEXT: v_mov_b32_e32 v0, s0
767 ; VI-NEXT: v_mov_b32_e32 v2, s2
768 ; VI-NEXT: v_mov_b32_e32 v1, s1
769 ; VI-NEXT: v_mov_b32_e32 v3, s3
770 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
773 ; GFX9-LABEL: v2i32_arg:
774 ; GFX9: ; %bb.0: ; %entry
775 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
776 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
777 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
778 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
779 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
780 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
781 ; GFX9-NEXT: s_endpgm
783 ; EG-LABEL: v2i32_arg:
784 ; EG: ; %bb.0: ; %entry
785 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
786 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
789 ; EG-NEXT: ALU clause starting at 4:
790 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
791 ; EG-NEXT: MOV T0.X, KC0[2].W,
792 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
793 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
795 ; CM-LABEL: v2i32_arg:
796 ; CM: ; %bb.0: ; %entry
797 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
798 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
801 ; CM-NEXT: ALU clause starting at 4:
802 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
803 ; CM-NEXT: MOV * T0.X, KC0[2].W,
804 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
805 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
807 store <2 x i32> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <2 x float> vector argument and storing it through %out
; with 4-byte alignment.
811 define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind {
812 ; SI-LABEL: v2f32_arg:
813 ; SI: ; %bb.0: ; %entry
814 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
815 ; SI-NEXT: s_mov_b32 s7, 0xf000
816 ; SI-NEXT: s_mov_b32 s6, -1
817 ; SI-NEXT: s_waitcnt lgkmcnt(0)
818 ; SI-NEXT: s_mov_b32 s4, s0
819 ; SI-NEXT: s_mov_b32 s5, s1
820 ; SI-NEXT: v_mov_b32_e32 v0, s2
821 ; SI-NEXT: v_mov_b32_e32 v1, s3
822 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
825 ; VI-LABEL: v2f32_arg:
826 ; VI: ; %bb.0: ; %entry
827 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
828 ; VI-NEXT: s_waitcnt lgkmcnt(0)
829 ; VI-NEXT: v_mov_b32_e32 v0, s0
830 ; VI-NEXT: v_mov_b32_e32 v2, s2
831 ; VI-NEXT: v_mov_b32_e32 v1, s1
832 ; VI-NEXT: v_mov_b32_e32 v3, s3
833 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
836 ; GFX9-LABEL: v2f32_arg:
837 ; GFX9: ; %bb.0: ; %entry
838 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
839 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
840 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
841 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
842 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
843 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
844 ; GFX9-NEXT: s_endpgm
846 ; EG-LABEL: v2f32_arg:
847 ; EG: ; %bb.0: ; %entry
848 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
849 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
852 ; EG-NEXT: ALU clause starting at 4:
853 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
854 ; EG-NEXT: MOV T0.X, KC0[2].W,
855 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
856 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
858 ; CM-LABEL: v2f32_arg:
859 ; CM: ; %bb.0: ; %entry
860 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
861 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
864 ; CM-NEXT: ALU clause starting at 4:
865 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
866 ; CM-NEXT: MOV * T0.X, KC0[2].W,
867 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
868 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
870 store <2 x float> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <3 x i8> vector argument and storing it through %out with
; 4-byte alignment. The expectations below show the store split in two: the
; amdgcn targets emit a byte store at offset 2 plus a 16-bit store at offset 0,
; and the r600 targets emit two MEM_RAT MSKOR writes.
874 define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
875 ; SI-LABEL: v3i8_arg:
876 ; SI: ; %bb.0: ; %entry
877 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
878 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
879 ; SI-NEXT: s_mov_b32 s3, 0xf000
880 ; SI-NEXT: s_waitcnt lgkmcnt(0)
881 ; SI-NEXT: s_lshr_b32 s5, s4, 16
882 ; SI-NEXT: s_mov_b32 s2, -1
883 ; SI-NEXT: v_mov_b32_e32 v0, s4
884 ; SI-NEXT: v_mov_b32_e32 v1, s5
885 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
886 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
889 ; VI-LABEL: v3i8_arg:
890 ; VI: ; %bb.0: ; %entry
891 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
892 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
893 ; VI-NEXT: s_waitcnt lgkmcnt(0)
894 ; VI-NEXT: s_lshr_b32 s3, s2, 16
895 ; VI-NEXT: v_mov_b32_e32 v0, s0
896 ; VI-NEXT: v_mov_b32_e32 v1, s1
897 ; VI-NEXT: s_add_u32 s0, s0, 2
898 ; VI-NEXT: s_addc_u32 s1, s1, 0
899 ; VI-NEXT: v_mov_b32_e32 v3, s1
900 ; VI-NEXT: v_mov_b32_e32 v5, s3
901 ; VI-NEXT: v_mov_b32_e32 v2, s0
902 ; VI-NEXT: v_mov_b32_e32 v4, s2
903 ; VI-NEXT: flat_store_byte v[2:3], v5
904 ; VI-NEXT: flat_store_short v[0:1], v4
907 ; GFX9-LABEL: v3i8_arg:
908 ; GFX9: ; %bb.0: ; %entry
909 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
910 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
911 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
912 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
913 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
914 ; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2
915 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
916 ; GFX9-NEXT: s_endpgm
918 ; EG-LABEL: v3i8_arg:
919 ; EG: ; %bb.0: ; %entry
920 ; EG-NEXT: ALU 0, @12, KC0[], KC1[]
922 ; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[]
923 ; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X
924 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
926 ; EG-NEXT: Fetch clause starting at 6:
927 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
928 ; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3
929 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
930 ; EG-NEXT: ALU clause starting at 12:
931 ; EG-NEXT: MOV * T4.X, 0.0,
932 ; EG-NEXT: ALU clause starting at 13:
933 ; EG-NEXT: LSHL T0.W, T5.X, literal.x,
934 ; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
935 ; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
936 ; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
937 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
938 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
939 ; EG-NEXT: AND_INT T0.W, PS, literal.x,
940 ; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
941 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
942 ; EG-NEXT: LSHL T4.X, PV.W, PS,
943 ; EG-NEXT: LSHL * T4.W, literal.x, PS,
944 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
945 ; EG-NEXT: MOV T4.Y, 0.0,
946 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
947 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
948 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
949 ; EG-NEXT: AND_INT * T2.W, T6.X, literal.y,
950 ; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
951 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
952 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
953 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
954 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
955 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
956 ; EG-NEXT: MOV T5.Y, 0.0,
957 ; EG-NEXT: MOV T4.Z, 0.0,
958 ; EG-NEXT: MOV * T5.Z, 0.0,
959 ; EG-NEXT: LSHR T6.X, T0.W, literal.x,
960 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
961 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
963 ; CM-LABEL: v3i8_arg:
964 ; CM: ; %bb.0: ; %entry
965 ; CM-NEXT: ALU 0, @12, KC0[], KC1[]
967 ; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[]
968 ; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X
969 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X
971 ; CM-NEXT: Fetch clause starting at 6:
972 ; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3
973 ; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3
974 ; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3
975 ; CM-NEXT: ALU clause starting at 12:
976 ; CM-NEXT: MOV * T4.X, 0.0,
977 ; CM-NEXT: ALU clause starting at 13:
978 ; CM-NEXT: LSHL T0.Z, T5.X, literal.x,
979 ; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
980 ; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
981 ; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x,
982 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
983 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
984 ; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
985 ; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
986 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
987 ; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
988 ; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
989 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
990 ; CM-NEXT: MOV T4.Y, 0.0,
991 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
992 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
993 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
994 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
995 ; CM-NEXT: AND_INT T0.Z, T6.X, literal.x,
996 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
997 ; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
998 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
999 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
1000 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1001 ; CM-NEXT: MOV T5.Y, 0.0,
1002 ; CM-NEXT: MOV * T4.Z, 0.0,
1003 ; CM-NEXT: MOV * T5.Z, 0.0,
1004 ; CM-NEXT: LSHR * T6.X, T0.W, literal.x,
1005 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1006 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1007 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1009 store <3 x i8> %in, ptr addrspace(1) %out, align 4
; v3i16_arg: kernel that stores its <3 x i16> argument %in to the
; addrspace(1) pointer %out with 4-byte alignment.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI, VI and GFX9 expect a short store 4 bytes past %out plus a dword
;    store at %out. VI forms the +4 address with s_add_u32/s_addc_u32, the
;    other two use an "offset:4" immediate on the store.
;  - EG and CM expect three VTX_READ_16 fetches, one MEM_RAT MSKOR store
;    and one MEM_RAT_CACHELESS dword store.
1013 define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
1014 ; SI-LABEL: v3i16_arg:
1015 ; SI: ; %bb.0: ; %entry
1016 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1017 ; SI-NEXT: s_mov_b32 s7, 0xf000
1018 ; SI-NEXT: s_mov_b32 s6, -1
1019 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1020 ; SI-NEXT: s_mov_b32 s4, s0
1021 ; SI-NEXT: s_mov_b32 s5, s1
1022 ; SI-NEXT: v_mov_b32_e32 v0, s3
1023 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
1024 ; SI-NEXT: s_waitcnt expcnt(0)
1025 ; SI-NEXT: v_mov_b32_e32 v0, s2
1026 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1029 ; VI-LABEL: v3i16_arg:
1030 ; VI: ; %bb.0: ; %entry
1031 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1032 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1033 ; VI-NEXT: s_add_u32 s4, s0, 4
1034 ; VI-NEXT: s_addc_u32 s5, s1, 0
1035 ; VI-NEXT: v_mov_b32_e32 v2, s4
1036 ; VI-NEXT: v_mov_b32_e32 v4, s3
1037 ; VI-NEXT: v_mov_b32_e32 v0, s0
1038 ; VI-NEXT: v_mov_b32_e32 v3, s5
1039 ; VI-NEXT: v_mov_b32_e32 v1, s1
1040 ; VI-NEXT: v_mov_b32_e32 v5, s2
1041 ; VI-NEXT: flat_store_short v[2:3], v4
1042 ; VI-NEXT: flat_store_dword v[0:1], v5
1045 ; GFX9-LABEL: v3i16_arg:
1046 ; GFX9: ; %bb.0: ; %entry
1047 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1048 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1049 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1050 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1051 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1052 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4
1053 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
1054 ; GFX9-NEXT: s_endpgm
1056 ; EG-LABEL: v3i16_arg:
1057 ; EG: ; %bb.0: ; %entry
1058 ; EG-NEXT: ALU 0, @12, KC0[], KC1[]
1060 ; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
1061 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
1062 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
1064 ; EG-NEXT: Fetch clause starting at 6:
1065 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
1066 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
1067 ; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
1068 ; EG-NEXT: ALU clause starting at 12:
1069 ; EG-NEXT: MOV * T5.X, 0.0,
1070 ; EG-NEXT: ALU clause starting at 13:
1071 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1072 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1073 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
1074 ; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
1075 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1076 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1077 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1078 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
1079 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
1080 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1081 ; EG-NEXT: MOV T5.Y, 0.0,
1082 ; EG-NEXT: MOV * T5.Z, 0.0,
1083 ; EG-NEXT: LSHR T8.X, T0.W, literal.x,
1084 ; EG-NEXT: LSHL T0.W, T7.X, literal.y,
1085 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
1086 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1087 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1088 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1089 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1090 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1092 ; CM-LABEL: v3i16_arg:
1093 ; CM: ; %bb.0: ; %entry
1094 ; CM-NEXT: ALU 0, @12, KC0[], KC1[]
1096 ; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
1097 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
1098 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
1100 ; CM-NEXT: Fetch clause starting at 6:
1101 ; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
1102 ; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
1103 ; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
1104 ; CM-NEXT: ALU clause starting at 12:
1105 ; CM-NEXT: MOV * T5.X, 0.0,
1106 ; CM-NEXT: ALU clause starting at 13:
1107 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1108 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1109 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
1110 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1111 ; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
1112 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1113 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1114 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
1115 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
1116 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1117 ; CM-NEXT: MOV T5.Y, 0.0,
1118 ; CM-NEXT: MOV * T5.Z, 0.0,
1119 ; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
1120 ; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
1121 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
1122 ; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
1123 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1124 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1125 ; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
1126 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1128 store <3 x i16> %in, ptr addrspace(1) %out, align 4
; v3i32_arg: kernel that stores its <3 x i32> argument %in to the
; addrspace(1) pointer %out with 4-byte alignment.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI expects the store to be split into a dword store at offset:8 and a
;    dwordx2 store at the base.
;  - VI and GFX9 expect a single dwordx3 store.
;  - EG and CM expect two MEM_RAT_CACHELESS stores whose data is copied
;    from KC0[3].Y, KC0[3].Z and KC0[3].W.
1132 define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
1133 ; SI-LABEL: v3i32_arg:
1134 ; SI: ; %bb.0: ; %entry
1135 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1136 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1137 ; SI-NEXT: s_mov_b32 s3, 0xf000
1138 ; SI-NEXT: s_mov_b32 s2, -1
1139 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1140 ; SI-NEXT: v_mov_b32_e32 v0, s6
1141 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
1142 ; SI-NEXT: s_waitcnt expcnt(0)
1143 ; SI-NEXT: v_mov_b32_e32 v0, s4
1144 ; SI-NEXT: v_mov_b32_e32 v1, s5
1145 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1148 ; VI-LABEL: v3i32_arg:
1149 ; VI: ; %bb.0: ; %entry
1150 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
1151 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1152 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1153 ; VI-NEXT: v_mov_b32_e32 v0, s4
1154 ; VI-NEXT: v_mov_b32_e32 v4, s1
1155 ; VI-NEXT: v_mov_b32_e32 v1, s5
1156 ; VI-NEXT: v_mov_b32_e32 v2, s6
1157 ; VI-NEXT: v_mov_b32_e32 v3, s0
1158 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
1161 ; GFX9-LABEL: v3i32_arg:
1162 ; GFX9: ; %bb.0: ; %entry
1163 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1164 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1165 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1166 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1167 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1168 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1169 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1170 ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
1171 ; GFX9-NEXT: s_endpgm
1173 ; EG-LABEL: v3i32_arg:
1174 ; EG: ; %bb.0: ; %entry
1175 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1176 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1177 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1179 ; EG-NEXT: ALU clause starting at 4:
1180 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1181 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1182 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1183 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1184 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1185 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1186 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1187 ; EG-NEXT: MOV * T3.X, KC0[3].W,
1188 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1190 ; CM-LABEL: v3i32_arg:
1191 ; CM: ; %bb.0: ; %entry
1192 ; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1193 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1194 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1196 ; CM-NEXT: ALU clause starting at 4:
1197 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1198 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1199 ; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
1200 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1201 ; CM-NEXT: MOV T1.X, KC0[3].W,
1202 ; CM-NEXT: MOV * T2.Y, KC0[3].Z,
1203 ; CM-NEXT: MOV * T2.X, KC0[3].Y,
1204 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
1205 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1207 store <3 x i32> %in, ptr addrspace(1) %out, align 4
; v3f32_arg: kernel that stores its <3 x float> argument %in to the
; addrspace(1) pointer %out with 4-byte alignment.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI expects the store to be split into a dword store at offset:8 and a
;    dwordx2 store at the base.
;  - VI and GFX9 expect a single dwordx3 store.
;  - EG and CM expect two MEM_RAT_CACHELESS stores whose data is copied
;    from KC0[3].Y, KC0[3].Z and KC0[3].W.
; The float elements are only moved and stored by the expected code; no
; floating-point arithmetic instruction appears in any of the check lines.
1211 define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
1212 ; SI-LABEL: v3f32_arg:
1213 ; SI: ; %bb.0: ; %entry
1214 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1215 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1216 ; SI-NEXT: s_mov_b32 s3, 0xf000
1217 ; SI-NEXT: s_mov_b32 s2, -1
1218 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1219 ; SI-NEXT: v_mov_b32_e32 v0, s6
1220 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
1221 ; SI-NEXT: s_waitcnt expcnt(0)
1222 ; SI-NEXT: v_mov_b32_e32 v0, s4
1223 ; SI-NEXT: v_mov_b32_e32 v1, s5
1224 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1227 ; VI-LABEL: v3f32_arg:
1228 ; VI: ; %bb.0: ; %entry
1229 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
1230 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1231 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1232 ; VI-NEXT: v_mov_b32_e32 v0, s4
1233 ; VI-NEXT: v_mov_b32_e32 v4, s1
1234 ; VI-NEXT: v_mov_b32_e32 v1, s5
1235 ; VI-NEXT: v_mov_b32_e32 v2, s6
1236 ; VI-NEXT: v_mov_b32_e32 v3, s0
1237 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
1240 ; GFX9-LABEL: v3f32_arg:
1241 ; GFX9: ; %bb.0: ; %entry
1242 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1243 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1244 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1245 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1246 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1247 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1248 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1249 ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
1250 ; GFX9-NEXT: s_endpgm
1252 ; EG-LABEL: v3f32_arg:
1253 ; EG: ; %bb.0: ; %entry
1254 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1255 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1256 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1258 ; EG-NEXT: ALU clause starting at 4:
1259 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1260 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1261 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1262 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1263 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1264 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1265 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1266 ; EG-NEXT: MOV * T3.X, KC0[3].W,
1267 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1269 ; CM-LABEL: v3f32_arg:
1270 ; CM: ; %bb.0: ; %entry
1271 ; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1272 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1273 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1275 ; CM-NEXT: ALU clause starting at 4:
1276 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1277 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1278 ; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
1279 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1280 ; CM-NEXT: MOV T1.X, KC0[3].W,
1281 ; CM-NEXT: MOV * T2.Y, KC0[3].Z,
1282 ; CM-NEXT: MOV * T2.X, KC0[3].Y,
1283 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
1284 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1286 store <3 x float> %in, ptr addrspace(1) %out, align 4
; v4i8_arg: kernel that stores its <4 x i8> argument %in to the
; addrspace(1) pointer %out. The store carries no explicit alignment, and
; this kernel is declared without the nocapture/nounwind attributes.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI, VI and GFX9 expect the whole vector to be read with one
;    s_load_dword and written with one dword store.
;  - EG and CM expect four VTX_READ_8 fetches that are masked, shifted by
;    8/16/24 and OR-ed together before one cacheless dword store.
1290 define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
1291 ; SI-LABEL: v4i8_arg:
1292 ; SI: ; %bb.0: ; %entry
1293 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1294 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1295 ; SI-NEXT: s_mov_b32 s3, 0xf000
1296 ; SI-NEXT: s_mov_b32 s2, -1
1297 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1298 ; SI-NEXT: v_mov_b32_e32 v0, s4
1299 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1302 ; VI-LABEL: v4i8_arg:
1303 ; VI: ; %bb.0: ; %entry
1304 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1305 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
1306 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1307 ; VI-NEXT: v_mov_b32_e32 v0, s2
1308 ; VI-NEXT: v_mov_b32_e32 v1, s3
1309 ; VI-NEXT: v_mov_b32_e32 v2, s0
1310 ; VI-NEXT: flat_store_dword v[0:1], v2
1313 ; GFX9-LABEL: v4i8_arg:
1314 ; GFX9: ; %bb.0: ; %entry
1315 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
1316 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1317 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1318 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1319 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1320 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1321 ; GFX9-NEXT: s_endpgm
1323 ; EG-LABEL: v4i8_arg:
1324 ; EG: ; %bb.0: ; %entry
1325 ; EG-NEXT: ALU 0, @14, KC0[], KC1[]
1327 ; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
1328 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
1331 ; EG-NEXT: Fetch clause starting at 6:
1332 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3
1333 ; EG-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3
1334 ; EG-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3
1335 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3
1336 ; EG-NEXT: ALU clause starting at 14:
1337 ; EG-NEXT: MOV * T4.X, 0.0,
1338 ; EG-NEXT: ALU clause starting at 15:
1339 ; EG-NEXT: AND_INT * T0.W, T5.X, literal.x,
1340 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1341 ; EG-NEXT: AND_INT T0.Z, T4.X, literal.x,
1342 ; EG-NEXT: LSHL T0.W, PV.W, literal.y,
1343 ; EG-NEXT: LSHL * T1.W, T7.X, literal.z,
1344 ; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
1345 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1346 ; EG-NEXT: OR_INT T0.W, PS, PV.W,
1347 ; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
1348 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1349 ; EG-NEXT: OR_INT T0.W, PV.W, PS,
1350 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.x,
1351 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1352 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
1353 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
1354 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1356 ; CM-LABEL: v4i8_arg:
1357 ; CM: ; %bb.0: ; %entry
1358 ; CM-NEXT: ALU 0, @14, KC0[], KC1[]
1360 ; CM-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
1361 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
1364 ; CM-NEXT: Fetch clause starting at 6:
1365 ; CM-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3
1366 ; CM-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3
1367 ; CM-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3
1368 ; CM-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3
1369 ; CM-NEXT: ALU clause starting at 14:
1370 ; CM-NEXT: MOV * T4.X, 0.0,
1371 ; CM-NEXT: ALU clause starting at 15:
1372 ; CM-NEXT: AND_INT * T0.W, T5.X, literal.x,
1373 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1374 ; CM-NEXT: AND_INT T0.Y, T4.X, literal.x,
1375 ; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
1376 ; CM-NEXT: LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212
1377 ; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
1378 ; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1379 ; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
1380 ; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
1381 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1382 ; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
1383 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
1384 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1385 ; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W,
1386 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
1387 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1389 store <4 x i8> %in, ptr addrspace(1) %out
; v4i16_arg: kernel that stores its <4 x i16> argument %in to the
; addrspace(1) pointer %out. The store carries no explicit alignment, and
; this kernel is declared without the nocapture/nounwind attributes.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI, VI and GFX9 expect one s_load_dwordx4 covering pointer and
;    argument, followed by a single dwordx2 store.
;  - EG and CM expect four separate fetch clauses, each holding one
;    VTX_READ_16, interleaved with ALU clauses that merge the halves with
;    AND/LSHL/OR, and then one cacheless store of a two-component value.
1393 define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
1394 ; SI-LABEL: v4i16_arg:
1395 ; SI: ; %bb.0: ; %entry
1396 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1397 ; SI-NEXT: s_mov_b32 s7, 0xf000
1398 ; SI-NEXT: s_mov_b32 s6, -1
1399 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1400 ; SI-NEXT: s_mov_b32 s4, s0
1401 ; SI-NEXT: s_mov_b32 s5, s1
1402 ; SI-NEXT: v_mov_b32_e32 v0, s2
1403 ; SI-NEXT: v_mov_b32_e32 v1, s3
1404 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1407 ; VI-LABEL: v4i16_arg:
1408 ; VI: ; %bb.0: ; %entry
1409 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1410 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1411 ; VI-NEXT: v_mov_b32_e32 v0, s0
1412 ; VI-NEXT: v_mov_b32_e32 v2, s2
1413 ; VI-NEXT: v_mov_b32_e32 v1, s1
1414 ; VI-NEXT: v_mov_b32_e32 v3, s3
1415 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1418 ; GFX9-LABEL: v4i16_arg:
1419 ; GFX9: ; %bb.0: ; %entry
1420 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1421 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1422 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1423 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1424 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1425 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1426 ; GFX9-NEXT: s_endpgm
1428 ; EG-LABEL: v4i16_arg:
1429 ; EG: ; %bb.0: ; %entry
1430 ; EG-NEXT: ALU 1, @20, KC0[], KC1[]
1431 ; EG-NEXT: TEX 0 @12
1432 ; EG-NEXT: ALU 5, @22, KC0[], KC1[]
1433 ; EG-NEXT: TEX 0 @14
1434 ; EG-NEXT: ALU 5, @28, KC0[], KC1[]
1435 ; EG-NEXT: TEX 0 @16
1436 ; EG-NEXT: ALU 5, @34, KC0[], KC1[]
1437 ; EG-NEXT: TEX 0 @18
1438 ; EG-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[]
1439 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
1442 ; EG-NEXT: Fetch clause starting at 12:
1443 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
1444 ; EG-NEXT: Fetch clause starting at 14:
1445 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
1446 ; EG-NEXT: Fetch clause starting at 16:
1447 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
1448 ; EG-NEXT: Fetch clause starting at 18:
1449 ; EG-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3
1450 ; EG-NEXT: ALU clause starting at 20:
1451 ; EG-NEXT: MOV * T0.Y, T3.X,
1452 ; EG-NEXT: MOV * T5.X, 0.0,
1453 ; EG-NEXT: ALU clause starting at 22:
1454 ; EG-NEXT: LSHL T0.W, T6.X, literal.x,
1455 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
1456 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
1457 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
1458 ; EG-NEXT: MOV * T3.X, PV.W,
1459 ; EG-NEXT: MOV * T0.Y, PV.X,
1460 ; EG-NEXT: ALU clause starting at 28:
1461 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
1462 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
1463 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
1464 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
1465 ; EG-NEXT: MOV T3.X, PV.W,
1466 ; EG-NEXT: MOV * T0.Y, T2.X,
1467 ; EG-NEXT: ALU clause starting at 34:
1468 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
1469 ; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
1470 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
1471 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
1472 ; EG-NEXT: MOV * T2.X, PV.W,
1473 ; EG-NEXT: MOV * T0.Y, PV.X,
1474 ; EG-NEXT: ALU clause starting at 40:
1475 ; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
1476 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
1477 ; EG-NEXT: AND_INT * T1.W, T5.X, literal.z,
1478 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
1479 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1480 ; EG-NEXT: OR_INT * T5.X, PV.W, PS,
1481 ; EG-NEXT: MOV T2.X, PV.X,
1482 ; EG-NEXT: MOV * T5.Y, T3.X,
1484 ; CM-LABEL: v4i16_arg:
1485 ; CM: ; %bb.0: ; %entry
1486 ; CM-NEXT: ALU 1, @20, KC0[], KC1[]
1487 ; CM-NEXT: TEX 0 @12
1488 ; CM-NEXT: ALU 5, @22, KC0[], KC1[]
1489 ; CM-NEXT: TEX 0 @14
1490 ; CM-NEXT: ALU 5, @28, KC0[], KC1[]
1491 ; CM-NEXT: TEX 0 @16
1492 ; CM-NEXT: ALU 5, @34, KC0[], KC1[]
1493 ; CM-NEXT: TEX 0 @18
1494 ; CM-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[]
1495 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
1498 ; CM-NEXT: Fetch clause starting at 12:
1499 ; CM-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
1500 ; CM-NEXT: Fetch clause starting at 14:
1501 ; CM-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
1502 ; CM-NEXT: Fetch clause starting at 16:
1503 ; CM-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
1504 ; CM-NEXT: Fetch clause starting at 18:
1505 ; CM-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3
1506 ; CM-NEXT: ALU clause starting at 20:
1507 ; CM-NEXT: MOV * T0.Y, T3.X,
1508 ; CM-NEXT: MOV * T5.X, 0.0,
1509 ; CM-NEXT: ALU clause starting at 22:
1510 ; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
1511 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
1512 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
1513 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
1514 ; CM-NEXT: MOV * T3.X, PV.W,
1515 ; CM-NEXT: MOV * T0.Y, PV.X,
1516 ; CM-NEXT: ALU clause starting at 28:
1517 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
1518 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
1519 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
1520 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
1521 ; CM-NEXT: MOV T3.X, PV.W,
1522 ; CM-NEXT: MOV * T0.Y, T2.X,
1523 ; CM-NEXT: ALU clause starting at 34:
1524 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
1525 ; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
1526 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
1527 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
1528 ; CM-NEXT: MOV * T2.X, PV.W,
1529 ; CM-NEXT: MOV * T0.Y, PV.X,
1530 ; CM-NEXT: ALU clause starting at 40:
1531 ; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
1532 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
1533 ; CM-NEXT: AND_INT * T0.W, T5.X, literal.z,
1534 ; CM-NEXT: 2(2.802597e-45), -65536(nan)
1535 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1536 ; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W,
1537 ; CM-NEXT: MOV T2.X, PV.X,
1538 ; CM-NEXT: MOV * T5.Y, T3.X,
1540 store <4 x i16> %in, ptr addrspace(1) %out
; v4i32_arg: kernel that stores its <4 x i32> argument %in to the
; addrspace(1) pointer %out with 4-byte alignment.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI, VI and GFX9 expect one s_load_dwordx4 for the vector, one
;    s_load_dwordx2 for the pointer, and a single dwordx4 store.
;  - EG and CM expect four MOVs from KC0[3].Y, KC0[3].Z, KC0[3].W and
;    KC0[4].X feeding a single MEM_RAT_CACHELESS store.
1544 define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind {
1545 ; SI-LABEL: v4i32_arg:
1546 ; SI: ; %bb.0: ; %entry
1547 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1548 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1549 ; SI-NEXT: s_mov_b32 s3, 0xf000
1550 ; SI-NEXT: s_mov_b32 s2, -1
1551 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1552 ; SI-NEXT: v_mov_b32_e32 v0, s4
1553 ; SI-NEXT: v_mov_b32_e32 v1, s5
1554 ; SI-NEXT: v_mov_b32_e32 v2, s6
1555 ; SI-NEXT: v_mov_b32_e32 v3, s7
1556 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1559 ; VI-LABEL: v4i32_arg:
1560 ; VI: ; %bb.0: ; %entry
1561 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1562 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
1563 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1564 ; VI-NEXT: v_mov_b32_e32 v4, s4
1565 ; VI-NEXT: v_mov_b32_e32 v0, s0
1566 ; VI-NEXT: v_mov_b32_e32 v5, s5
1567 ; VI-NEXT: v_mov_b32_e32 v1, s1
1568 ; VI-NEXT: v_mov_b32_e32 v2, s2
1569 ; VI-NEXT: v_mov_b32_e32 v3, s3
1570 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1573 ; GFX9-LABEL: v4i32_arg:
1574 ; GFX9: ; %bb.0: ; %entry
1575 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1576 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1577 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1578 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1579 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1580 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1581 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1582 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1583 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
1584 ; GFX9-NEXT: s_endpgm
1586 ; EG-LABEL: v4i32_arg:
1587 ; EG: ; %bb.0: ; %entry
1588 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1589 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1592 ; EG-NEXT: ALU clause starting at 4:
1593 ; EG-NEXT: MOV * T0.W, KC0[4].X,
1594 ; EG-NEXT: MOV * T0.Z, KC0[3].W,
1595 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1596 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1597 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1598 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1600 ; CM-LABEL: v4i32_arg:
1601 ; CM: ; %bb.0: ; %entry
1602 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1603 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1606 ; CM-NEXT: ALU clause starting at 4:
1607 ; CM-NEXT: MOV * T0.W, KC0[4].X,
1608 ; CM-NEXT: MOV * T0.Z, KC0[3].W,
1609 ; CM-NEXT: MOV * T0.Y, KC0[3].Z,
1610 ; CM-NEXT: MOV * T0.X, KC0[3].Y,
1611 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1612 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1614 store <4 x i32> %in, ptr addrspace(1) %out, align 4
; v4f32_arg: kernel that stores its <4 x float> argument %in to the
; addrspace(1) pointer %out with 4-byte alignment.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI, VI and GFX9 expect one s_load_dwordx4 for the vector, one
;    s_load_dwordx2 for the pointer, and a single dwordx4 store.
;  - EG and CM expect four MOVs from KC0[3].Y, KC0[3].Z, KC0[3].W and
;    KC0[4].X feeding a single MEM_RAT_CACHELESS store.
; The float elements are only moved and stored by the expected code; no
; floating-point arithmetic instruction appears in any of the check lines.
1618 define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind {
1619 ; SI-LABEL: v4f32_arg:
1620 ; SI: ; %bb.0: ; %entry
1621 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1622 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1623 ; SI-NEXT: s_mov_b32 s3, 0xf000
1624 ; SI-NEXT: s_mov_b32 s2, -1
1625 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1626 ; SI-NEXT: v_mov_b32_e32 v0, s4
1627 ; SI-NEXT: v_mov_b32_e32 v1, s5
1628 ; SI-NEXT: v_mov_b32_e32 v2, s6
1629 ; SI-NEXT: v_mov_b32_e32 v3, s7
1630 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1633 ; VI-LABEL: v4f32_arg:
1634 ; VI: ; %bb.0: ; %entry
1635 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1636 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
1637 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1638 ; VI-NEXT: v_mov_b32_e32 v4, s4
1639 ; VI-NEXT: v_mov_b32_e32 v0, s0
1640 ; VI-NEXT: v_mov_b32_e32 v5, s5
1641 ; VI-NEXT: v_mov_b32_e32 v1, s1
1642 ; VI-NEXT: v_mov_b32_e32 v2, s2
1643 ; VI-NEXT: v_mov_b32_e32 v3, s3
1644 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1647 ; GFX9-LABEL: v4f32_arg:
1648 ; GFX9: ; %bb.0: ; %entry
1649 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1650 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1651 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1652 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1653 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1654 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1655 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1656 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1657 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
1658 ; GFX9-NEXT: s_endpgm
1660 ; EG-LABEL: v4f32_arg:
1661 ; EG: ; %bb.0: ; %entry
1662 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1663 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1666 ; EG-NEXT: ALU clause starting at 4:
1667 ; EG-NEXT: MOV * T0.W, KC0[4].X,
1668 ; EG-NEXT: MOV * T0.Z, KC0[3].W,
1669 ; EG-NEXT: MOV * T0.Y, KC0[3].Z,
1670 ; EG-NEXT: MOV T0.X, KC0[3].Y,
1671 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1672 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1674 ; CM-LABEL: v4f32_arg:
1675 ; CM: ; %bb.0: ; %entry
1676 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1677 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1680 ; CM-NEXT: ALU clause starting at 4:
1681 ; CM-NEXT: MOV * T0.W, KC0[4].X,
1682 ; CM-NEXT: MOV * T0.Z, KC0[3].W,
1683 ; CM-NEXT: MOV * T0.Y, KC0[3].Z,
1684 ; CM-NEXT: MOV * T0.X, KC0[3].Y,
1685 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1686 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1688 store <4 x float> %in, ptr addrspace(1) %out, align 4
; v5i8_arg: kernel that stores its <5 x i8> argument %in to the
; addrspace(1) pointer %out with 4-byte alignment.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; What the checks pin down:
;  - SI, VI and GFX9 expect a byte store 4 bytes past %out plus a dword
;    store at %out. VI forms the +4 address with s_add_u32/s_addc_u32, the
;    other two use an "offset:4" immediate on the store.
;  - EG and CM expect five VTX_READ_8 fetches, one MEM_RAT MSKOR store and
;    one MEM_RAT_CACHELESS dword store assembled with AND/LSHL/OR.
1692 define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
1693 ; SI-LABEL: v5i8_arg:
1694 ; SI: ; %bb.0: ; %entry
1695 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1696 ; SI-NEXT: s_mov_b32 s7, 0xf000
1697 ; SI-NEXT: s_mov_b32 s6, -1
1698 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1699 ; SI-NEXT: s_mov_b32 s4, s0
1700 ; SI-NEXT: s_mov_b32 s5, s1
1701 ; SI-NEXT: v_mov_b32_e32 v0, s3
1702 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4
1703 ; SI-NEXT: s_waitcnt expcnt(0)
1704 ; SI-NEXT: v_mov_b32_e32 v0, s2
1705 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1708 ; VI-LABEL: v5i8_arg:
1709 ; VI: ; %bb.0: ; %entry
1710 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1711 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1712 ; VI-NEXT: s_add_u32 s4, s0, 4
1713 ; VI-NEXT: s_addc_u32 s5, s1, 0
1714 ; VI-NEXT: v_mov_b32_e32 v2, s4
1715 ; VI-NEXT: v_mov_b32_e32 v4, s3
1716 ; VI-NEXT: v_mov_b32_e32 v0, s0
1717 ; VI-NEXT: v_mov_b32_e32 v3, s5
1718 ; VI-NEXT: v_mov_b32_e32 v1, s1
1719 ; VI-NEXT: v_mov_b32_e32 v5, s2
1720 ; VI-NEXT: flat_store_byte v[2:3], v4
1721 ; VI-NEXT: flat_store_dword v[0:1], v5
1724 ; GFX9-LABEL: v5i8_arg:
1725 ; GFX9: ; %bb.0: ; %entry
1726 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1727 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1728 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1729 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1730 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1731 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4
1732 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
1733 ; GFX9-NEXT: s_endpgm
1735 ; EG-LABEL: v5i8_arg:
1736 ; EG: ; %bb.0: ; %entry
1737 ; EG-NEXT: ALU 0, @16, KC0[], KC1[]
1739 ; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
1740 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
1741 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
1743 ; EG-NEXT: Fetch clause starting at 6:
1744 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
1745 ; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
1746 ; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
1747 ; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
1748 ; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
1749 ; EG-NEXT: ALU clause starting at 16:
1750 ; EG-NEXT: MOV * T5.X, 0.0,
1751 ; EG-NEXT: ALU clause starting at 17:
1752 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1753 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1754 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
1755 ; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
1756 ; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
1757 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1758 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1759 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
1760 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
1761 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1762 ; EG-NEXT: MOV T5.Y, 0.0,
1763 ; EG-NEXT: MOV T5.Z, 0.0,
1764 ; EG-NEXT: AND_INT T1.W, T9.X, literal.x,
1765 ; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x,
1766 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1767 ; EG-NEXT: LSHL T1.W, PV.W, literal.x,
1768 ; EG-NEXT: LSHL * T2.W, T7.X, literal.y,
1769 ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
1770 ; EG-NEXT: OR_INT T1.W, PS, PV.W,
1771 ; EG-NEXT: LSHL * T2.W, T0.Z, literal.x,
1772 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1773 ; EG-NEXT: OR_INT T1.W, PV.W, PS,
1774 ; EG-NEXT: AND_INT * T2.W, T6.X, literal.x,
1775 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1776 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1777 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1778 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1779 ; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
1780 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1782 ; CM-LABEL: v5i8_arg:
1783 ; CM: ; %bb.0: ; %entry
1784 ; CM-NEXT: ALU 0, @16, KC0[], KC1[]
1786 ; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
1787 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
1788 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X
1790 ; CM-NEXT: Fetch clause starting at 6:
1791 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
1792 ; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
1793 ; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
1794 ; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
1795 ; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
1796 ; CM-NEXT: ALU clause starting at 16:
1797 ; CM-NEXT: MOV * T5.X, 0.0,
1798 ; CM-NEXT: ALU clause starting at 17:
1799 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1800 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1801 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
1802 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1803 ; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
1804 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1805 ; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1806 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
1807 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
1808 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1809 ; CM-NEXT: MOV T5.Y, 0.0,
1810 ; CM-NEXT: MOV T5.Z, 0.0,
1811 ; CM-NEXT: AND_INT * T1.W, T9.X, literal.x,
1812 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1813 ; CM-NEXT: AND_INT T0.Y, T8.X, literal.x,
1814 ; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
1815 ; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
1816 ; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
1817 ; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1818 ; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
1819 ; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
1820 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1821 ; CM-NEXT: LSHR T7.X, T0.W, literal.x,
1822 ; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
1823 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
1824 ; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43)
1825 ; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
1826 ; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
1827 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1829 store <5 x i8> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <5 x i16> by-value argument and storing it to %out (align 4).
; The SI, VI and GFX9 checks expect the vector to be written with one dwordx2
; store plus one short store at byte offset 8 (on VI the +8 address is built
; with s_add_u32/s_addc_u32). The EG and CM checks expect five VTX_READ_16
; fetches and five MEM_RAT MSKOR stores.
1833 define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind {
1834 ; SI-LABEL: v5i16_arg:
1835 ; SI: ; %bb.0: ; %entry
1836 ; SI-NEXT: s_load_dword s2, s[0:1], 0xf
1837 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1838 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1839 ; SI-NEXT: s_mov_b32 s7, 0xf000
1840 ; SI-NEXT: s_mov_b32 s6, -1
1841 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1842 ; SI-NEXT: v_mov_b32_e32 v0, s2
1843 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8
1844 ; SI-NEXT: s_waitcnt expcnt(0)
1845 ; SI-NEXT: v_mov_b32_e32 v0, s0
1846 ; SI-NEXT: v_mov_b32_e32 v1, s1
1847 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1850 ; VI-LABEL: v5i16_arg:
1851 ; VI: ; %bb.0: ; %entry
1852 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1853 ; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
1854 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1855 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1856 ; VI-NEXT: s_add_u32 s4, s2, 8
1857 ; VI-NEXT: v_mov_b32_e32 v4, s5
1858 ; VI-NEXT: s_addc_u32 s5, s3, 0
1859 ; VI-NEXT: v_mov_b32_e32 v2, s4
1860 ; VI-NEXT: v_mov_b32_e32 v3, s5
1861 ; VI-NEXT: v_mov_b32_e32 v0, s2
1862 ; VI-NEXT: flat_store_short v[2:3], v4
1863 ; VI-NEXT: v_mov_b32_e32 v3, s1
1864 ; VI-NEXT: v_mov_b32_e32 v1, s3
1865 ; VI-NEXT: v_mov_b32_e32 v2, s0
1866 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1869 ; GFX9-LABEL: v5i16_arg:
1870 ; GFX9: ; %bb.0: ; %entry
1871 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1872 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1873 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1874 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1875 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1876 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1877 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1878 ; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8
1879 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1880 ; GFX9-NEXT: s_endpgm
1882 ; EG-LABEL: v5i16_arg:
1883 ; EG: ; %bb.0: ; %entry
1884 ; EG-NEXT: ALU 0, @20, KC0[], KC1[]
1885 ; EG-NEXT: TEX 4 @10
1886 ; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]
1887 ; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X
1888 ; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X
1889 ; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X
1890 ; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X
1891 ; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X
1894 ; EG-NEXT: Fetch clause starting at 10:
1895 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3
1896 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3
1897 ; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3
1898 ; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3
1899 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
1900 ; EG-NEXT: ALU clause starting at 20:
1901 ; EG-NEXT: MOV * T0.X, 0.0,
1902 ; EG-NEXT: ALU clause starting at 21:
1903 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1904 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1905 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
1906 ; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
1907 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1908 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1909 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1910 ; EG-NEXT: LSHL T5.X, T2.W, PV.W,
1911 ; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
1912 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1913 ; EG-NEXT: MOV T5.Y, 0.0,
1914 ; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
1915 ; EG-NEXT: AND_INT * T2.W, T4.X, literal.y,
1916 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1917 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1918 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1919 ; EG-NEXT: LSHL T4.X, T2.W, PV.W,
1920 ; EG-NEXT: LSHL * T4.W, literal.x, PV.W,
1921 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1922 ; EG-NEXT: MOV T4.Y, 0.0,
1923 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
1924 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1925 ; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
1926 ; EG-NEXT: AND_INT * T3.W, T3.X, literal.y,
1927 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1928 ; EG-NEXT: LSHL * T2.W, PV.W, literal.x,
1929 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1930 ; EG-NEXT: LSHL T3.X, T3.W, PV.W,
1931 ; EG-NEXT: LSHL * T3.W, literal.x, PV.W,
1932 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1933 ; EG-NEXT: MOV T3.Y, 0.0,
1934 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
1935 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1936 ; EG-NEXT: AND_INT T6.W, PV.W, literal.x,
1937 ; EG-NEXT: AND_INT * T7.W, T2.X, literal.y,
1938 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1939 ; EG-NEXT: LSHL * T6.W, PV.W, literal.x,
1940 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1941 ; EG-NEXT: LSHL T6.X, T7.W, PV.W,
1942 ; EG-NEXT: LSHL * T6.W, literal.x, PV.W,
1943 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1944 ; EG-NEXT: MOV T6.Y, 0.0,
1945 ; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
1946 ; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
1947 ; EG-NEXT: AND_INT T8.W, PV.W, literal.x,
1948 ; EG-NEXT: AND_INT * T9.W, T1.X, literal.y,
1949 ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
1950 ; EG-NEXT: LSHL * T8.W, PV.W, literal.x,
1951 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1952 ; EG-NEXT: LSHL T8.X, T9.W, PV.W,
1953 ; EG-NEXT: LSHL * T8.W, literal.x, PV.W,
1954 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1955 ; EG-NEXT: MOV T8.Y, 0.0,
1956 ; EG-NEXT: MOV T5.Z, 0.0,
1957 ; EG-NEXT: MOV * T4.Z, 0.0,
1958 ; EG-NEXT: MOV T3.Z, 0.0,
1959 ; EG-NEXT: MOV * T6.Z, 0.0,
1960 ; EG-NEXT: MOV * T8.Z, 0.0,
1961 ; EG-NEXT: LSHR T0.X, T7.W, literal.x,
1962 ; EG-NEXT: LSHR * T1.X, T2.W, literal.x,
1963 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1964 ; EG-NEXT: LSHR T2.X, T1.W, literal.x,
1965 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
1966 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1967 ; EG-NEXT: LSHR * T9.X, T0.W, literal.x,
1968 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1970 ; CM-LABEL: v5i16_arg:
1971 ; CM: ; %bb.0: ; %entry
1972 ; CM-NEXT: ALU 0, @20, KC0[], KC1[]
1973 ; CM-NEXT: TEX 4 @10
1974 ; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[]
1975 ; CM-NEXT: MEM_RAT MSKOR T5.XW, T9.X
1976 ; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X
1977 ; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X
1978 ; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X
1979 ; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X
1982 ; CM-NEXT: Fetch clause starting at 10:
1983 ; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3
1984 ; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3
1985 ; CM-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3
1986 ; CM-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3
1987 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
1988 ; CM-NEXT: ALU clause starting at 20:
1989 ; CM-NEXT: MOV * T0.X, 0.0,
1990 ; CM-NEXT: ALU clause starting at 21:
1991 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
1992 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1993 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
1994 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1995 ; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
1996 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
1997 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1998 ; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
1999 ; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
2000 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2001 ; CM-NEXT: MOV T5.Y, 0.0,
2002 ; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
2003 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2004 ; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
2005 ; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
2006 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
2007 ; CM-NEXT: LSHL T4.X, PV.Z, PV.W,
2008 ; CM-NEXT: LSHL * T4.W, literal.x, PV.W,
2009 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2010 ; CM-NEXT: MOV T4.Y, 0.0,
2011 ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2012 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2013 ; CM-NEXT: AND_INT * T2.W, PV.W, literal.x,
2014 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2015 ; CM-NEXT: AND_INT T0.Z, T3.X, literal.x,
2016 ; CM-NEXT: LSHL * T2.W, PV.W, literal.y,
2017 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
2018 ; CM-NEXT: LSHL T3.X, PV.Z, PV.W,
2019 ; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
2020 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2021 ; CM-NEXT: MOV T3.Y, 0.0,
2022 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2023 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
2024 ; CM-NEXT: AND_INT * T6.W, PV.W, literal.x,
2025 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2026 ; CM-NEXT: AND_INT T0.Z, T2.X, literal.x,
2027 ; CM-NEXT: LSHL * T6.W, PV.W, literal.y,
2028 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
2029 ; CM-NEXT: LSHL T6.X, PV.Z, PV.W,
2030 ; CM-NEXT: LSHL * T6.W, literal.x, PV.W,
2031 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2032 ; CM-NEXT: MOV T6.Y, 0.0,
2033 ; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x,
2034 ; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00)
2035 ; CM-NEXT: AND_INT * T8.W, PV.W, literal.x,
2036 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2037 ; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
2038 ; CM-NEXT: LSHL * T8.W, PV.W, literal.y,
2039 ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
2040 ; CM-NEXT: LSHL T8.X, PV.Z, PV.W,
2041 ; CM-NEXT: LSHL * T8.W, literal.x, PV.W,
2042 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2043 ; CM-NEXT: MOV T8.Y, 0.0,
2044 ; CM-NEXT: MOV * T5.Z, 0.0,
2045 ; CM-NEXT: MOV * T4.Z, 0.0,
2046 ; CM-NEXT: MOV * T3.Z, 0.0,
2047 ; CM-NEXT: MOV * T6.Z, 0.0,
2048 ; CM-NEXT: MOV * T8.Z, 0.0,
2049 ; CM-NEXT: LSHR * T0.X, T7.W, literal.x,
2050 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2051 ; CM-NEXT: LSHR * T1.X, T2.W, literal.x,
2052 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2053 ; CM-NEXT: LSHR * T2.X, T1.W, literal.x,
2054 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2055 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
2056 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2057 ; CM-NEXT: LSHR * T9.X, T0.W, literal.x,
2058 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2060 store <5 x i16> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <5 x i32> by-value argument and storing it to %out (align 4).
; The SI, VI and GFX9 checks expect one dwordx4 store plus one dword store at
; byte offset 16 (on VI the +16 address is built with s_add_u32/s_addc_u32).
; The EG and CM checks expect the elements to be moved out of KC0 constant
; registers, with no fetch clause, and written by two MEM_RAT_CACHELESS stores.
2064 define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind {
2065 ; SI-LABEL: v5i32_arg:
2066 ; SI: ; %bb.0: ; %entry
2067 ; SI-NEXT: s_load_dword s8, s[0:1], 0x15
2068 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2069 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11
2070 ; SI-NEXT: s_mov_b32 s7, 0xf000
2071 ; SI-NEXT: s_mov_b32 s6, -1
2072 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2073 ; SI-NEXT: v_mov_b32_e32 v0, s8
2074 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16
2075 ; SI-NEXT: s_waitcnt expcnt(0)
2076 ; SI-NEXT: v_mov_b32_e32 v0, s0
2077 ; SI-NEXT: v_mov_b32_e32 v1, s1
2078 ; SI-NEXT: v_mov_b32_e32 v2, s2
2079 ; SI-NEXT: v_mov_b32_e32 v3, s3
2080 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2083 ; VI-LABEL: v5i32_arg:
2084 ; VI: ; %bb.0: ; %entry
2085 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2086 ; VI-NEXT: s_load_dword s7, s[0:1], 0x54
2087 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
2088 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2089 ; VI-NEXT: s_add_u32 s6, s4, 16
2090 ; VI-NEXT: v_mov_b32_e32 v2, s7
2091 ; VI-NEXT: s_addc_u32 s7, s5, 0
2092 ; VI-NEXT: v_mov_b32_e32 v0, s6
2093 ; VI-NEXT: v_mov_b32_e32 v1, s7
2094 ; VI-NEXT: v_mov_b32_e32 v4, s4
2095 ; VI-NEXT: flat_store_dword v[0:1], v2
2096 ; VI-NEXT: v_mov_b32_e32 v0, s0
2097 ; VI-NEXT: v_mov_b32_e32 v5, s5
2098 ; VI-NEXT: v_mov_b32_e32 v1, s1
2099 ; VI-NEXT: v_mov_b32_e32 v2, s2
2100 ; VI-NEXT: v_mov_b32_e32 v3, s3
2101 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2104 ; GFX9-LABEL: v5i32_arg:
2105 ; GFX9: ; %bb.0: ; %entry
2106 ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30
2107 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
2108 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
2109 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2111 ; GFX9-NEXT: v_mov_b32_e32 v5, s8
2112 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2113 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2114 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2115 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2116 ; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16
2117 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
2118 ; GFX9-NEXT: s_endpgm
2120 ; EG-LABEL: v5i32_arg:
2121 ; EG: ; %bb.0: ; %entry
2122 ; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
2123 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2124 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2126 ; EG-NEXT: ALU clause starting at 4:
2127 ; EG-NEXT: MOV * T0.W, KC0[5].X,
2128 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
2129 ; EG-NEXT: MOV * T0.Y, KC0[4].Z,
2130 ; EG-NEXT: MOV T0.X, KC0[4].Y,
2131 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2132 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2133 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2134 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2135 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
2136 ; EG-NEXT: MOV * T3.X, KC0[5].Y,
2137 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2139 ; CM-LABEL: v5i32_arg:
2140 ; CM: ; %bb.0: ; %entry
2141 ; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
2142 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2143 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2145 ; CM-NEXT: ALU clause starting at 4:
2146 ; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
2147 ; CM-NEXT: MOV * T0.W, KC0[5].X,
2148 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2149 ; CM-NEXT: LSHR T1.X, PV.Z, literal.x,
2150 ; CM-NEXT: MOV * T0.Z, KC0[4].W,
2151 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2152 ; CM-NEXT: MOV T2.X, KC0[5].Y,
2153 ; CM-NEXT: MOV * T0.Y, KC0[4].Z,
2154 ; CM-NEXT: MOV * T0.X, KC0[4].Y,
2155 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2156 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2158 store <5 x i32> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <5 x float> by-value argument and storing it to %out
; (align 4). The SI, VI and GFX9 checks expect one dwordx4 store plus one dword
; store at byte offset 16. In the GFX9 checks the dwordx4 store comes first,
; followed by s_nop 0 and then the dword store. The EG and CM checks expect the
; elements to be moved out of KC0 constant registers, with no fetch clause, and
; written by two MEM_RAT_CACHELESS stores.
2162 define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind {
2163 ; SI-LABEL: v5f32_arg:
2164 ; SI: ; %bb.0: ; %entry
2165 ; SI-NEXT: s_load_dword s8, s[0:1], 0x15
2166 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2167 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11
2168 ; SI-NEXT: s_mov_b32 s7, 0xf000
2169 ; SI-NEXT: s_mov_b32 s6, -1
2170 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2171 ; SI-NEXT: v_mov_b32_e32 v0, s8
2172 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16
2173 ; SI-NEXT: s_waitcnt expcnt(0)
2174 ; SI-NEXT: v_mov_b32_e32 v0, s0
2175 ; SI-NEXT: v_mov_b32_e32 v1, s1
2176 ; SI-NEXT: v_mov_b32_e32 v2, s2
2177 ; SI-NEXT: v_mov_b32_e32 v3, s3
2178 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2181 ; VI-LABEL: v5f32_arg:
2182 ; VI: ; %bb.0: ; %entry
2183 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2184 ; VI-NEXT: s_load_dword s7, s[0:1], 0x54
2185 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
2186 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2187 ; VI-NEXT: s_add_u32 s6, s4, 16
2188 ; VI-NEXT: v_mov_b32_e32 v3, s7
2189 ; VI-NEXT: s_addc_u32 s7, s5, 0
2190 ; VI-NEXT: v_mov_b32_e32 v1, s6
2191 ; VI-NEXT: v_mov_b32_e32 v2, s7
2192 ; VI-NEXT: v_mov_b32_e32 v4, s4
2193 ; VI-NEXT: v_mov_b32_e32 v0, s0
2194 ; VI-NEXT: flat_store_dword v[1:2], v3
2195 ; VI-NEXT: v_mov_b32_e32 v1, s1
2196 ; VI-NEXT: v_mov_b32_e32 v2, s2
2197 ; VI-NEXT: v_mov_b32_e32 v3, s3
2198 ; VI-NEXT: v_mov_b32_e32 v5, s5
2199 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2202 ; GFX9-LABEL: v5f32_arg:
2203 ; GFX9: ; %bb.0: ; %entry
2204 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
2205 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
2206 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2207 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
2208 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2209 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2210 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2211 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2212 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2213 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
2214 ; GFX9-NEXT: s_nop 0
2215 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2216 ; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16
2217 ; GFX9-NEXT: s_endpgm
2219 ; EG-LABEL: v5f32_arg:
2220 ; EG: ; %bb.0: ; %entry
2221 ; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
2222 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2223 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2225 ; EG-NEXT: ALU clause starting at 4:
2226 ; EG-NEXT: MOV * T0.W, KC0[5].X,
2227 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
2228 ; EG-NEXT: MOV * T0.Y, KC0[4].Z,
2229 ; EG-NEXT: MOV T0.X, KC0[4].Y,
2230 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2231 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2232 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2233 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2234 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
2235 ; EG-NEXT: MOV * T3.X, KC0[5].Y,
2236 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2238 ; CM-LABEL: v5f32_arg:
2239 ; CM: ; %bb.0: ; %entry
2240 ; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
2241 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2242 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2244 ; CM-NEXT: ALU clause starting at 4:
2245 ; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
2246 ; CM-NEXT: MOV * T0.W, KC0[5].X,
2247 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2248 ; CM-NEXT: LSHR T1.X, PV.Z, literal.x,
2249 ; CM-NEXT: MOV * T0.Z, KC0[4].W,
2250 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2251 ; CM-NEXT: MOV T2.X, KC0[5].Y,
2252 ; CM-NEXT: MOV * T0.Y, KC0[4].Z,
2253 ; CM-NEXT: MOV * T0.X, KC0[4].Y,
2254 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2255 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2257 store <5 x float> %in, ptr addrspace(1) %out, align 4
; Kernel taking a <5 x i64> by-value argument and storing it to %out (align 8).
; The SI and GFX9 checks expect two dwordx4 stores (byte offsets 0 and 16) plus
; one dwordx2 store at byte offset 32. The VI checks expect the same three
; stores, with the +16 and +32 addresses built by s_add_u32/s_addc_u32. The EG
; and CM checks expect the elements to be moved out of KC0 constant registers,
; with no fetch clause, and written by three MEM_RAT_CACHELESS stores.
2261 define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind {
2262 ; SI-LABEL: v5i64_arg:
2263 ; SI: ; %bb.0: ; %entry
2264 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19
2265 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
2266 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21
2267 ; SI-NEXT: s_mov_b32 s15, 0xf000
2268 ; SI-NEXT: s_mov_b32 s14, -1
2269 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2270 ; SI-NEXT: v_mov_b32_e32 v0, s8
2271 ; SI-NEXT: v_mov_b32_e32 v1, s9
2272 ; SI-NEXT: v_mov_b32_e32 v2, s10
2273 ; SI-NEXT: v_mov_b32_e32 v3, s11
2274 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
2275 ; SI-NEXT: s_waitcnt expcnt(0)
2276 ; SI-NEXT: v_mov_b32_e32 v0, s4
2277 ; SI-NEXT: v_mov_b32_e32 v1, s5
2278 ; SI-NEXT: v_mov_b32_e32 v2, s6
2279 ; SI-NEXT: v_mov_b32_e32 v3, s7
2280 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2281 ; SI-NEXT: s_waitcnt expcnt(0)
2282 ; SI-NEXT: v_mov_b32_e32 v0, s0
2283 ; SI-NEXT: v_mov_b32_e32 v1, s1
2284 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
2287 ; VI-LABEL: v5i64_arg:
2288 ; VI: ; %bb.0: ; %entry
2289 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
2290 ; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
2291 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
2292 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2293 ; VI-NEXT: s_add_u32 s12, s8, 32
2294 ; VI-NEXT: v_mov_b32_e32 v1, s10
2295 ; VI-NEXT: s_addc_u32 s13, s9, 0
2296 ; VI-NEXT: v_mov_b32_e32 v3, s12
2297 ; VI-NEXT: v_mov_b32_e32 v2, s11
2298 ; VI-NEXT: v_mov_b32_e32 v0, s4
2299 ; VI-NEXT: v_mov_b32_e32 v4, s13
2300 ; VI-NEXT: s_add_u32 s4, s8, 16
2301 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
2302 ; VI-NEXT: v_mov_b32_e32 v1, s5
2303 ; VI-NEXT: s_addc_u32 s5, s9, 0
2304 ; VI-NEXT: v_mov_b32_e32 v4, s4
2305 ; VI-NEXT: v_mov_b32_e32 v2, s6
2306 ; VI-NEXT: v_mov_b32_e32 v3, s7
2307 ; VI-NEXT: v_mov_b32_e32 v5, s5
2308 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2309 ; VI-NEXT: v_mov_b32_e32 v4, s8
2310 ; VI-NEXT: v_mov_b32_e32 v0, s0
2311 ; VI-NEXT: v_mov_b32_e32 v1, s1
2312 ; VI-NEXT: v_mov_b32_e32 v2, s2
2313 ; VI-NEXT: v_mov_b32_e32 v3, s3
2314 ; VI-NEXT: v_mov_b32_e32 v5, s9
2315 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2318 ; GFX9-LABEL: v5i64_arg:
2319 ; GFX9: ; %bb.0: ; %entry
2320 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
2321 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
2322 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
2323 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2324 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2325 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2326 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
2327 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
2328 ; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
2329 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
2330 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
2331 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
2332 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
2333 ; GFX9-NEXT: s_nop 0
2334 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
2335 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
2336 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
2337 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
2338 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
2339 ; GFX9-NEXT: s_endpgm
2341 ; EG-LABEL: v5i64_arg:
2342 ; EG: ; %bb.0: ; %entry
2343 ; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2344 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2345 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2346 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2349 ; EG-NEXT: ALU clause starting at 6:
2350 ; EG-NEXT: MOV * T0.W, KC0[7].X,
2351 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
2352 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
2353 ; EG-NEXT: MOV * T1.W, KC0[8].X,
2354 ; EG-NEXT: MOV T0.X, KC0[6].Y,
2355 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
2356 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2357 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
2358 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2359 ; EG-NEXT: MOV T1.X, KC0[7].Y,
2360 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2361 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2362 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2363 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
2364 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2365 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
2366 ; EG-NEXT: MOV T5.Y, KC0[8].Z,
2367 ; EG-NEXT: MOV * T5.X, KC0[8].Y,
2368 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2370 ; CM-LABEL: v5i64_arg:
2371 ; CM: ; %bb.0: ; %entry
2372 ; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2373 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2374 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2375 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2378 ; CM-NEXT: ALU clause starting at 6:
2379 ; CM-NEXT: MOV * T0.W, KC0[8].X,
2380 ; CM-NEXT: MOV T1.Y, KC0[8].Z,
2381 ; CM-NEXT: MOV * T0.Z, KC0[7].W,
2382 ; CM-NEXT: MOV T1.X, KC0[8].Y,
2383 ; CM-NEXT: MOV * T0.Y, KC0[7].Z,
2384 ; CM-NEXT: MOV T0.X, KC0[7].Y,
2385 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
2386 ; CM-NEXT: MOV * T2.W, KC0[7].X,
2387 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
2388 ; CM-NEXT: LSHR T3.X, PV.Z, literal.x,
2389 ; CM-NEXT: MOV T2.Z, KC0[6].W,
2390 ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
2391 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2392 ; CM-NEXT: LSHR T4.X, PV.W, literal.x,
2393 ; CM-NEXT: MOV * T2.Y, KC0[6].Z,
2394 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2395 ; CM-NEXT: MOV * T2.X, KC0[6].Y,
2396 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
2397 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2399 store <5 x i64> %in, ptr addrspace(1) %out, align 8
; Kernel taking a <5 x double> by-value argument and storing it to %out
; (align 8). The SI and GFX9 checks expect two dwordx4 stores (byte offsets 0
; and 16) plus one dwordx2 store at byte offset 32. The VI checks expect the
; same three stores, with the +16 and +32 addresses built by
; s_add_u32/s_addc_u32. The EG and CM checks expect the elements to be moved
; out of KC0 constant registers, with no fetch clause, and written by three
; MEM_RAT_CACHELESS stores.
2403 define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
2404 ; SI-LABEL: v5f64_arg:
2405 ; SI: ; %bb.0: ; %entry
2406 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19
2407 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
2408 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21
2409 ; SI-NEXT: s_mov_b32 s15, 0xf000
2410 ; SI-NEXT: s_mov_b32 s14, -1
2411 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2412 ; SI-NEXT: v_mov_b32_e32 v0, s8
2413 ; SI-NEXT: v_mov_b32_e32 v1, s9
2414 ; SI-NEXT: v_mov_b32_e32 v2, s10
2415 ; SI-NEXT: v_mov_b32_e32 v3, s11
2416 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
2417 ; SI-NEXT: s_waitcnt expcnt(0)
2418 ; SI-NEXT: v_mov_b32_e32 v0, s4
2419 ; SI-NEXT: v_mov_b32_e32 v1, s5
2420 ; SI-NEXT: v_mov_b32_e32 v2, s6
2421 ; SI-NEXT: v_mov_b32_e32 v3, s7
2422 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2423 ; SI-NEXT: s_waitcnt expcnt(0)
2424 ; SI-NEXT: v_mov_b32_e32 v0, s0
2425 ; SI-NEXT: v_mov_b32_e32 v1, s1
2426 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
2429 ; VI-LABEL: v5f64_arg:
2430 ; VI: ; %bb.0: ; %entry
2431 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
2432 ; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
2433 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
2434 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2435 ; VI-NEXT: s_add_u32 s12, s8, 32
2436 ; VI-NEXT: v_mov_b32_e32 v1, s10
2437 ; VI-NEXT: s_addc_u32 s13, s9, 0
2438 ; VI-NEXT: v_mov_b32_e32 v3, s12
2439 ; VI-NEXT: v_mov_b32_e32 v2, s11
2440 ; VI-NEXT: v_mov_b32_e32 v0, s4
2441 ; VI-NEXT: v_mov_b32_e32 v4, s13
2442 ; VI-NEXT: s_add_u32 s4, s8, 16
2443 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
2444 ; VI-NEXT: v_mov_b32_e32 v1, s5
2445 ; VI-NEXT: s_addc_u32 s5, s9, 0
2446 ; VI-NEXT: v_mov_b32_e32 v4, s4
2447 ; VI-NEXT: v_mov_b32_e32 v2, s6
2448 ; VI-NEXT: v_mov_b32_e32 v3, s7
2449 ; VI-NEXT: v_mov_b32_e32 v5, s5
2450 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2451 ; VI-NEXT: v_mov_b32_e32 v4, s8
2452 ; VI-NEXT: v_mov_b32_e32 v0, s0
2453 ; VI-NEXT: v_mov_b32_e32 v1, s1
2454 ; VI-NEXT: v_mov_b32_e32 v2, s2
2455 ; VI-NEXT: v_mov_b32_e32 v3, s3
2456 ; VI-NEXT: v_mov_b32_e32 v5, s9
2457 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2460 ; GFX9-LABEL: v5f64_arg:
2461 ; GFX9: ; %bb.0: ; %entry
2462 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
2463 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
2464 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
2465 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2466 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2467 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
2468 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
2469 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
2470 ; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32
2471 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
2472 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
2473 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
2474 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
2475 ; GFX9-NEXT: s_nop 0
2476 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
2477 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
2478 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
2479 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
2480 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
2481 ; GFX9-NEXT: s_endpgm
2483 ; EG-LABEL: v5f64_arg:
2484 ; EG: ; %bb.0: ; %entry
2485 ; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2486 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2487 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2488 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2491 ; EG-NEXT: ALU clause starting at 6:
2492 ; EG-NEXT: MOV * T0.W, KC0[7].X,
2493 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
2494 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
2495 ; EG-NEXT: MOV * T1.W, KC0[8].X,
2496 ; EG-NEXT: MOV T0.X, KC0[6].Y,
2497 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
2498 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2499 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
2500 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2501 ; EG-NEXT: MOV T1.X, KC0[7].Y,
2502 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2503 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2504 ; EG-NEXT: LSHR T3.X, PV.W, literal.x,
2505 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
2506 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
2507 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
2508 ; EG-NEXT: MOV T5.Y, KC0[8].Z,
2509 ; EG-NEXT: MOV * T5.X, KC0[8].Y,
2510 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2512 ; CM-LABEL: v5f64_arg:
2513 ; CM: ; %bb.0: ; %entry
2514 ; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[]
2515 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2516 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2517 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2520 ; CM-NEXT: ALU clause starting at 6:
2521 ; CM-NEXT: MOV * T0.W, KC0[8].X,
2522 ; CM-NEXT: MOV T1.Y, KC0[8].Z,
2523 ; CM-NEXT: MOV * T0.Z, KC0[7].W,
2524 ; CM-NEXT: MOV T1.X, KC0[8].Y,
2525 ; CM-NEXT: MOV * T0.Y, KC0[7].Z,
2526 ; CM-NEXT: MOV T0.X, KC0[7].Y,
2527 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
2528 ; CM-NEXT: MOV * T2.W, KC0[7].X,
2529 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
2530 ; CM-NEXT: LSHR T3.X, PV.Z, literal.x,
2531 ; CM-NEXT: MOV T2.Z, KC0[6].W,
2532 ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
2533 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
2534 ; CM-NEXT: LSHR T4.X, PV.W, literal.x,
2535 ; CM-NEXT: MOV * T2.Y, KC0[6].Z,
2536 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2537 ; CM-NEXT: MOV * T2.X, KC0[6].Y,
2538 ; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
2539 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2541 store <5 x double> %in, ptr addrspace(1) %out, align 8
2545 ; FIXME: Lots of unpack and re-pack junk on EG/CM (byte-wise VTX_READ_8 plus mask/shift/or); the VI checks below no longer show it.
2546 define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
2547 ; SI-LABEL: v8i8_arg:
2548 ; SI: ; %bb.0: ; %entry
2549 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2550 ; SI-NEXT: s_mov_b32 s7, 0xf000
2551 ; SI-NEXT: s_mov_b32 s6, -1
2552 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2553 ; SI-NEXT: s_mov_b32 s4, s0
2554 ; SI-NEXT: s_mov_b32 s5, s1
2555 ; SI-NEXT: v_mov_b32_e32 v0, s2
2556 ; SI-NEXT: v_mov_b32_e32 v1, s3
2557 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2560 ; VI-LABEL: v8i8_arg:
2561 ; VI: ; %bb.0: ; %entry
2562 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2563 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2564 ; VI-NEXT: v_mov_b32_e32 v0, s0
2565 ; VI-NEXT: v_mov_b32_e32 v2, s2
2566 ; VI-NEXT: v_mov_b32_e32 v1, s1
2567 ; VI-NEXT: v_mov_b32_e32 v3, s3
2568 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2571 ; GFX9-LABEL: v8i8_arg:
2572 ; GFX9: ; %bb.0: ; %entry
2573 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2574 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2575 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2576 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2577 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2578 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
2579 ; GFX9-NEXT: s_endpgm
2581 ; EG-LABEL: v8i8_arg:
2582 ; EG: ; %bb.0: ; %entry
2583 ; EG-NEXT: ALU 1, @36, KC0[], KC1[]
2584 ; EG-NEXT: TEX 0 @20
2585 ; EG-NEXT: ALU 5, @38, KC0[], KC1[]
2586 ; EG-NEXT: TEX 0 @22
2587 ; EG-NEXT: ALU 5, @44, KC0[], KC1[]
2588 ; EG-NEXT: TEX 0 @24
2589 ; EG-NEXT: ALU 7, @50, KC0[], KC1[]
2590 ; EG-NEXT: TEX 0 @26
2591 ; EG-NEXT: ALU 7, @58, KC0[], KC1[]
2592 ; EG-NEXT: TEX 0 @28
2593 ; EG-NEXT: ALU 7, @66, KC0[], KC1[]
2594 ; EG-NEXT: TEX 0 @30
2595 ; EG-NEXT: ALU 7, @74, KC0[], KC1[]
2596 ; EG-NEXT: TEX 0 @32
2597 ; EG-NEXT: ALU 5, @82, KC0[], KC1[]
2598 ; EG-NEXT: TEX 0 @34
2599 ; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
2600 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
2603 ; EG-NEXT: Fetch clause starting at 20:
2604 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
2605 ; EG-NEXT: Fetch clause starting at 22:
2606 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
2607 ; EG-NEXT: Fetch clause starting at 24:
2608 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
2609 ; EG-NEXT: Fetch clause starting at 26:
2610 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
2611 ; EG-NEXT: Fetch clause starting at 28:
2612 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
2613 ; EG-NEXT: Fetch clause starting at 30:
2614 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
2615 ; EG-NEXT: Fetch clause starting at 32:
2616 ; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
2617 ; EG-NEXT: Fetch clause starting at 34:
2618 ; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
2619 ; EG-NEXT: ALU clause starting at 36:
2620 ; EG-NEXT: MOV * T0.Y, T2.X,
2621 ; EG-NEXT: MOV * T5.X, 0.0,
2622 ; EG-NEXT: ALU clause starting at 38:
2623 ; EG-NEXT: LSHL T0.W, T6.X, literal.x,
2624 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2625 ; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
2626 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
2627 ; EG-NEXT: MOV T2.X, PV.W,
2628 ; EG-NEXT: MOV * T0.Y, T3.X,
2629 ; EG-NEXT: ALU clause starting at 44:
2630 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2631 ; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
2632 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
2633 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2634 ; EG-NEXT: MOV T3.X, PV.W,
2635 ; EG-NEXT: MOV * T0.Y, T2.X,
2636 ; EG-NEXT: ALU clause starting at 50:
2637 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2638 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2639 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
2640 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2641 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2642 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2643 ; EG-NEXT: MOV T2.X, PV.W,
2644 ; EG-NEXT: MOV * T0.Y, T3.X,
2645 ; EG-NEXT: ALU clause starting at 58:
2646 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2647 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2648 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
2649 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2650 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2651 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2652 ; EG-NEXT: MOV T3.X, PV.W,
2653 ; EG-NEXT: MOV * T0.Y, T2.X,
2654 ; EG-NEXT: ALU clause starting at 66:
2655 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2656 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2657 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
2658 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2659 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2660 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2661 ; EG-NEXT: MOV T2.X, PV.W,
2662 ; EG-NEXT: MOV * T0.Y, T3.X,
2663 ; EG-NEXT: ALU clause starting at 74:
2664 ; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
2665 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2666 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
2667 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2668 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2669 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
2670 ; EG-NEXT: MOV T3.X, PV.W,
2671 ; EG-NEXT: MOV * T0.Y, T2.X,
2672 ; EG-NEXT: ALU clause starting at 82:
2673 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2674 ; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
2675 ; EG-NEXT: -256(nan), 255(3.573311e-43)
2676 ; EG-NEXT: OR_INT * T5.Y, PV.W, PS,
2677 ; EG-NEXT: MOV T2.X, PV.Y,
2678 ; EG-NEXT: MOV * T0.Y, T3.X,
2679 ; EG-NEXT: ALU clause starting at 88:
2680 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2681 ; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
2682 ; EG-NEXT: -256(nan), 255(3.573311e-43)
2683 ; EG-NEXT: OR_INT T5.X, PV.W, PS,
2684 ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
2685 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2687 ; CM-LABEL: v8i8_arg:
2688 ; CM: ; %bb.0: ; %entry
2689 ; CM-NEXT: ALU 1, @36, KC0[], KC1[]
2690 ; CM-NEXT: TEX 0 @20
2691 ; CM-NEXT: ALU 5, @38, KC0[], KC1[]
2692 ; CM-NEXT: TEX 0 @22
2693 ; CM-NEXT: ALU 5, @44, KC0[], KC1[]
2694 ; CM-NEXT: TEX 0 @24
2695 ; CM-NEXT: ALU 7, @50, KC0[], KC1[]
2696 ; CM-NEXT: TEX 0 @26
2697 ; CM-NEXT: ALU 7, @58, KC0[], KC1[]
2698 ; CM-NEXT: TEX 0 @28
2699 ; CM-NEXT: ALU 7, @66, KC0[], KC1[]
2700 ; CM-NEXT: TEX 0 @30
2701 ; CM-NEXT: ALU 7, @74, KC0[], KC1[]
2702 ; CM-NEXT: TEX 0 @32
2703 ; CM-NEXT: ALU 5, @82, KC0[], KC1[]
2704 ; CM-NEXT: TEX 0 @34
2705 ; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
2706 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
2709 ; CM-NEXT: Fetch clause starting at 20:
2710 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
2711 ; CM-NEXT: Fetch clause starting at 22:
2712 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
2713 ; CM-NEXT: Fetch clause starting at 24:
2714 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
2715 ; CM-NEXT: Fetch clause starting at 26:
2716 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
2717 ; CM-NEXT: Fetch clause starting at 28:
2718 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
2719 ; CM-NEXT: Fetch clause starting at 30:
2720 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
2721 ; CM-NEXT: Fetch clause starting at 32:
2722 ; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
2723 ; CM-NEXT: Fetch clause starting at 34:
2724 ; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
2725 ; CM-NEXT: ALU clause starting at 36:
2726 ; CM-NEXT: MOV * T0.Y, T2.X,
2727 ; CM-NEXT: MOV * T5.X, 0.0,
2728 ; CM-NEXT: ALU clause starting at 38:
2729 ; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
2730 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
2731 ; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
2732 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
2733 ; CM-NEXT: MOV T2.X, PV.W,
2734 ; CM-NEXT: MOV * T0.Y, T3.X,
2735 ; CM-NEXT: ALU clause starting at 44:
2736 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2737 ; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
2738 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
2739 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2740 ; CM-NEXT: MOV T3.X, PV.W,
2741 ; CM-NEXT: MOV * T0.Y, T2.X,
2742 ; CM-NEXT: ALU clause starting at 50:
2743 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2744 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2745 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2746 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2747 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
2748 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2749 ; CM-NEXT: MOV T2.X, PV.W,
2750 ; CM-NEXT: MOV * T0.Y, T3.X,
2751 ; CM-NEXT: ALU clause starting at 58:
2752 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2753 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2754 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2755 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2756 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
2757 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2758 ; CM-NEXT: MOV T3.X, PV.W,
2759 ; CM-NEXT: MOV * T0.Y, T2.X,
2760 ; CM-NEXT: ALU clause starting at 66:
2761 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2762 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2763 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2764 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2765 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
2766 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2767 ; CM-NEXT: MOV T2.X, PV.W,
2768 ; CM-NEXT: MOV * T0.Y, T3.X,
2769 ; CM-NEXT: ALU clause starting at 74:
2770 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
2771 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2772 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2773 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
2774 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
2775 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2776 ; CM-NEXT: MOV T3.X, PV.W,
2777 ; CM-NEXT: MOV * T0.Y, T2.X,
2778 ; CM-NEXT: ALU clause starting at 82:
2779 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2780 ; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
2781 ; CM-NEXT: -256(nan), 255(3.573311e-43)
2782 ; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W,
2783 ; CM-NEXT: MOV T2.X, PV.Y,
2784 ; CM-NEXT: MOV * T0.Y, T3.X,
2785 ; CM-NEXT: ALU clause starting at 88:
2786 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2787 ; CM-NEXT: AND_INT * T0.W, T5.X, literal.y,
2788 ; CM-NEXT: -256(nan), 255(3.573311e-43)
2789 ; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W,
2790 ; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
2791 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2793 store <8 x i8> %in, ptr addrspace(1) %out
; v8i16_arg stores its <8 x i16> kernel argument %in to the addrspace(1)
; pointer %out.
; The autogenerated checks below expect the SI, VI and GFX9 runs to read the
; whole vector with one s_load_dwordx4 and write it with one dwordx4 store.
; The EG and CM runs are expected to fetch every element with its own
; VTX_READ_16 and combine the values with AND_INT/LSHL/OR_INT before a single
; MEM_RAT_CACHELESS store.
2797 define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
2798 ; SI-LABEL: v8i16_arg:
2799 ; SI: ; %bb.0: ; %entry
2800 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
2801 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2802 ; SI-NEXT: s_mov_b32 s3, 0xf000
2803 ; SI-NEXT: s_mov_b32 s2, -1
2804 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2805 ; SI-NEXT: v_mov_b32_e32 v0, s4
2806 ; SI-NEXT: v_mov_b32_e32 v1, s5
2807 ; SI-NEXT: v_mov_b32_e32 v2, s6
2808 ; SI-NEXT: v_mov_b32_e32 v3, s7
2809 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2812 ; VI-LABEL: v8i16_arg:
2813 ; VI: ; %bb.0: ; %entry
2814 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2815 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
2816 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2817 ; VI-NEXT: v_mov_b32_e32 v4, s4
2818 ; VI-NEXT: v_mov_b32_e32 v0, s0
2819 ; VI-NEXT: v_mov_b32_e32 v5, s5
2820 ; VI-NEXT: v_mov_b32_e32 v1, s1
2821 ; VI-NEXT: v_mov_b32_e32 v2, s2
2822 ; VI-NEXT: v_mov_b32_e32 v3, s3
2823 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2826 ; GFX9-LABEL: v8i16_arg:
2827 ; GFX9: ; %bb.0: ; %entry
2828 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
2829 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
2830 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2831 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2832 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2833 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2834 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2835 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2836 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
2837 ; GFX9-NEXT: s_endpgm
2839 ; EG-LABEL: v8i16_arg:
2840 ; EG: ; %bb.0: ; %entry
2841 ; EG-NEXT: ALU 1, @36, KC0[], KC1[]
2842 ; EG-NEXT: TEX 0 @20
2843 ; EG-NEXT: ALU 5, @38, KC0[], KC1[]
2844 ; EG-NEXT: TEX 0 @22
2845 ; EG-NEXT: ALU 5, @44, KC0[], KC1[]
2846 ; EG-NEXT: TEX 0 @24
2847 ; EG-NEXT: ALU 5, @50, KC0[], KC1[]
2848 ; EG-NEXT: TEX 0 @26
2849 ; EG-NEXT: ALU 5, @56, KC0[], KC1[]
2850 ; EG-NEXT: TEX 0 @28
2851 ; EG-NEXT: ALU 5, @62, KC0[], KC1[]
2852 ; EG-NEXT: TEX 0 @30
2853 ; EG-NEXT: ALU 5, @68, KC0[], KC1[]
2854 ; EG-NEXT: TEX 0 @32
2855 ; EG-NEXT: ALU 5, @74, KC0[], KC1[]
2856 ; EG-NEXT: TEX 0 @34
2857 ; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
2858 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2861 ; EG-NEXT: Fetch clause starting at 20:
2862 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2863 ; EG-NEXT: Fetch clause starting at 22:
2864 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
2865 ; EG-NEXT: Fetch clause starting at 24:
2866 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2867 ; EG-NEXT: Fetch clause starting at 26:
2868 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
2869 ; EG-NEXT: Fetch clause starting at 28:
2870 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
2871 ; EG-NEXT: Fetch clause starting at 30:
2872 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
2873 ; EG-NEXT: Fetch clause starting at 32:
2874 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
2875 ; EG-NEXT: Fetch clause starting at 34:
2876 ; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
2877 ; EG-NEXT: ALU clause starting at 36:
2878 ; EG-NEXT: MOV * T0.Y, T3.X,
2879 ; EG-NEXT: MOV * T7.X, 0.0,
2880 ; EG-NEXT: ALU clause starting at 38:
2881 ; EG-NEXT: LSHL T0.W, T8.X, literal.x,
2882 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
2883 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
2884 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
2885 ; EG-NEXT: MOV T3.X, PV.W,
2886 ; EG-NEXT: MOV * T0.Y, T5.X,
2887 ; EG-NEXT: ALU clause starting at 44:
2888 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2889 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
2890 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2891 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2892 ; EG-NEXT: MOV T5.X, PV.W,
2893 ; EG-NEXT: MOV * T0.Y, T3.X,
2894 ; EG-NEXT: ALU clause starting at 50:
2895 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2896 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
2897 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
2898 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2899 ; EG-NEXT: MOV T3.X, PV.W,
2900 ; EG-NEXT: MOV * T0.Y, T5.X,
2901 ; EG-NEXT: ALU clause starting at 56:
2902 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2903 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
2904 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
2905 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2906 ; EG-NEXT: MOV T5.X, PV.W,
2907 ; EG-NEXT: MOV * T0.Y, T2.X,
2908 ; EG-NEXT: ALU clause starting at 62:
2909 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2910 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
2911 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2912 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2913 ; EG-NEXT: MOV T2.X, PV.W,
2914 ; EG-NEXT: MOV * T0.Y, T4.X,
2915 ; EG-NEXT: ALU clause starting at 68:
2916 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2917 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
2918 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2919 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
2920 ; EG-NEXT: MOV T4.X, PV.W,
2921 ; EG-NEXT: MOV * T0.Y, T2.X,
2922 ; EG-NEXT: ALU clause starting at 74:
2923 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
2924 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
2925 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
2926 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
2927 ; EG-NEXT: MOV T2.X, PV.Z,
2928 ; EG-NEXT: MOV * T0.Y, T4.X,
2929 ; EG-NEXT: ALU clause starting at 80:
2930 ; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
2931 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
2932 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.z,
2933 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
2934 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2935 ; EG-NEXT: OR_INT * T7.X, PV.W, PS,
2936 ; EG-NEXT: MOV T4.X, PV.X,
2937 ; EG-NEXT: MOV * T7.W, T3.X,
2938 ; EG-NEXT: MOV * T7.Y, T5.X,
2940 ; CM-LABEL: v8i16_arg:
2941 ; CM: ; %bb.0: ; %entry
2942 ; CM-NEXT: ALU 1, @36, KC0[], KC1[]
2943 ; CM-NEXT: TEX 0 @20
2944 ; CM-NEXT: ALU 5, @38, KC0[], KC1[]
2945 ; CM-NEXT: TEX 0 @22
2946 ; CM-NEXT: ALU 5, @44, KC0[], KC1[]
2947 ; CM-NEXT: TEX 0 @24
2948 ; CM-NEXT: ALU 5, @50, KC0[], KC1[]
2949 ; CM-NEXT: TEX 0 @26
2950 ; CM-NEXT: ALU 5, @56, KC0[], KC1[]
2951 ; CM-NEXT: TEX 0 @28
2952 ; CM-NEXT: ALU 5, @62, KC0[], KC1[]
2953 ; CM-NEXT: TEX 0 @30
2954 ; CM-NEXT: ALU 5, @68, KC0[], KC1[]
2955 ; CM-NEXT: TEX 0 @32
2956 ; CM-NEXT: ALU 5, @74, KC0[], KC1[]
2957 ; CM-NEXT: TEX 0 @34
2958 ; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
2959 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
2962 ; CM-NEXT: Fetch clause starting at 20:
2963 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2964 ; CM-NEXT: Fetch clause starting at 22:
2965 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
2966 ; CM-NEXT: Fetch clause starting at 24:
2967 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2968 ; CM-NEXT: Fetch clause starting at 26:
2969 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
2970 ; CM-NEXT: Fetch clause starting at 28:
2971 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
2972 ; CM-NEXT: Fetch clause starting at 30:
2973 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
2974 ; CM-NEXT: Fetch clause starting at 32:
2975 ; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
2976 ; CM-NEXT: Fetch clause starting at 34:
2977 ; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
2978 ; CM-NEXT: ALU clause starting at 36:
2979 ; CM-NEXT: MOV * T0.Y, T3.X,
2980 ; CM-NEXT: MOV * T7.X, 0.0,
2981 ; CM-NEXT: ALU clause starting at 38:
2982 ; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
2983 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
2984 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
2985 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
2986 ; CM-NEXT: MOV T3.X, PV.W,
2987 ; CM-NEXT: MOV * T0.Y, T5.X,
2988 ; CM-NEXT: ALU clause starting at 44:
2989 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2990 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
2991 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
2992 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
2993 ; CM-NEXT: MOV T5.X, PV.W,
2994 ; CM-NEXT: MOV * T0.Y, T3.X,
2995 ; CM-NEXT: ALU clause starting at 50:
2996 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
2997 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
2998 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
2999 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3000 ; CM-NEXT: MOV T3.X, PV.W,
3001 ; CM-NEXT: MOV * T0.Y, T5.X,
3002 ; CM-NEXT: ALU clause starting at 56:
3003 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3004 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3005 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3006 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3007 ; CM-NEXT: MOV T5.X, PV.W,
3008 ; CM-NEXT: MOV * T0.Y, T2.X,
3009 ; CM-NEXT: ALU clause starting at 62:
3010 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3011 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3012 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3013 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3014 ; CM-NEXT: MOV T2.X, PV.W,
3015 ; CM-NEXT: MOV * T0.Y, T4.X,
3016 ; CM-NEXT: ALU clause starting at 68:
3017 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3018 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3019 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3020 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3021 ; CM-NEXT: MOV T4.X, PV.W,
3022 ; CM-NEXT: MOV * T0.Y, T2.X,
3023 ; CM-NEXT: ALU clause starting at 74:
3024 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3025 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3026 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
3027 ; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
3028 ; CM-NEXT: MOV T2.X, PV.Z,
3029 ; CM-NEXT: MOV * T0.Y, T4.X,
3030 ; CM-NEXT: ALU clause starting at 80:
3031 ; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
3032 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
3033 ; CM-NEXT: AND_INT * T0.W, T7.X, literal.z,
3034 ; CM-NEXT: 2(2.802597e-45), -65536(nan)
3035 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3036 ; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
3037 ; CM-NEXT: MOV T4.X, PV.X,
3038 ; CM-NEXT: MOV * T7.W, T3.X,
3039 ; CM-NEXT: MOV * T7.Y, T5.X,
3041 store <8 x i16> %in, ptr addrspace(1) %out
; v8i32_arg stores its <8 x i32> kernel argument %in to the addrspace(1)
; pointer %out with align 4.
; The autogenerated checks below expect the SI, VI and GFX9 runs to read the
; vector with one s_load_dwordx8 and write it as two dwordx4 stores, one of
; them 16 bytes past %out. The EG and CM runs are expected to copy the
; elements from KC0[...] operands with MOV and to emit two MEM_RAT_CACHELESS
; stores.
3045 define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
3046 ; SI-LABEL: v8i32_arg:
3047 ; SI: ; %bb.0: ; %entry
3048 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
3049 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3050 ; SI-NEXT: s_mov_b32 s3, 0xf000
3051 ; SI-NEXT: s_mov_b32 s2, -1
3052 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3053 ; SI-NEXT: v_mov_b32_e32 v0, s8
3054 ; SI-NEXT: v_mov_b32_e32 v1, s9
3055 ; SI-NEXT: v_mov_b32_e32 v2, s10
3056 ; SI-NEXT: v_mov_b32_e32 v3, s11
3057 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3058 ; SI-NEXT: s_waitcnt expcnt(0)
3059 ; SI-NEXT: v_mov_b32_e32 v0, s4
3060 ; SI-NEXT: v_mov_b32_e32 v1, s5
3061 ; SI-NEXT: v_mov_b32_e32 v2, s6
3062 ; SI-NEXT: v_mov_b32_e32 v3, s7
3063 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3066 ; VI-LABEL: v8i32_arg:
3067 ; VI: ; %bb.0: ; %entry
3068 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
3069 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3070 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3071 ; VI-NEXT: v_mov_b32_e32 v0, s8
3072 ; VI-NEXT: s_add_u32 s2, s0, 16
3073 ; VI-NEXT: s_addc_u32 s3, s1, 0
3074 ; VI-NEXT: v_mov_b32_e32 v5, s3
3075 ; VI-NEXT: v_mov_b32_e32 v1, s9
3076 ; VI-NEXT: v_mov_b32_e32 v2, s10
3077 ; VI-NEXT: v_mov_b32_e32 v3, s11
3078 ; VI-NEXT: v_mov_b32_e32 v4, s2
3079 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3080 ; VI-NEXT: v_mov_b32_e32 v5, s1
3081 ; VI-NEXT: v_mov_b32_e32 v0, s4
3082 ; VI-NEXT: v_mov_b32_e32 v1, s5
3083 ; VI-NEXT: v_mov_b32_e32 v2, s6
3084 ; VI-NEXT: v_mov_b32_e32 v3, s7
3085 ; VI-NEXT: v_mov_b32_e32 v4, s0
3086 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3089 ; GFX9-LABEL: v8i32_arg:
3090 ; GFX9: ; %bb.0: ; %entry
3091 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
3092 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3093 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3094 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3095 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
3096 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
3097 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
3098 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
3099 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3100 ; GFX9-NEXT: s_nop 0
3101 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
3102 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
3103 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
3104 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
3105 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3106 ; GFX9-NEXT: s_endpgm
3108 ; EG-LABEL: v8i32_arg:
3109 ; EG: ; %bb.0: ; %entry
3110 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
3111 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3112 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3114 ; EG-NEXT: ALU clause starting at 4:
3115 ; EG-NEXT: MOV * T0.W, KC0[5].X,
3116 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
3117 ; EG-NEXT: MOV T0.Y, KC0[4].Z,
3118 ; EG-NEXT: MOV * T1.W, KC0[6].X,
3119 ; EG-NEXT: MOV T0.X, KC0[4].Y,
3120 ; EG-NEXT: MOV * T1.Z, KC0[5].W,
3121 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
3122 ; EG-NEXT: MOV * T1.Y, KC0[5].Z,
3123 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3124 ; EG-NEXT: MOV T1.X, KC0[5].Y,
3125 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
3126 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3127 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
3128 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3130 ; CM-LABEL: v8i32_arg:
3131 ; CM: ; %bb.0: ; %entry
3132 ; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
3133 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3134 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3136 ; CM-NEXT: ALU clause starting at 4:
3137 ; CM-NEXT: MOV * T0.W, KC0[6].X,
3138 ; CM-NEXT: MOV * T0.Z, KC0[5].W,
3139 ; CM-NEXT: MOV * T0.Y, KC0[5].Z,
3140 ; CM-NEXT: MOV T0.X, KC0[5].Y,
3141 ; CM-NEXT: MOV * T1.W, KC0[5].X,
3142 ; CM-NEXT: MOV T1.Z, KC0[4].W,
3143 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
3144 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3145 ; CM-NEXT: LSHR T2.X, PV.W, literal.x,
3146 ; CM-NEXT: MOV * T1.Y, KC0[4].Z,
3147 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3148 ; CM-NEXT: MOV * T1.X, KC0[4].Y,
3149 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
3150 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3152 store <8 x i32> %in, ptr addrspace(1) %out, align 4
; v8f32_arg stores its <8 x float> kernel argument %in to the addrspace(1)
; pointer %out with align 4.
; The autogenerated checks below expect the same instruction sequences as the
; integer variant v8i32_arg for every run line: one s_load_dwordx8 plus two
; dwordx4 stores on SI, VI and GFX9, and MOV copies from KC0[...] operands
; plus two MEM_RAT_CACHELESS stores on EG and CM.
3156 define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind {
3157 ; SI-LABEL: v8f32_arg:
3158 ; SI: ; %bb.0: ; %entry
3159 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
3160 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3161 ; SI-NEXT: s_mov_b32 s3, 0xf000
3162 ; SI-NEXT: s_mov_b32 s2, -1
3163 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3164 ; SI-NEXT: v_mov_b32_e32 v0, s8
3165 ; SI-NEXT: v_mov_b32_e32 v1, s9
3166 ; SI-NEXT: v_mov_b32_e32 v2, s10
3167 ; SI-NEXT: v_mov_b32_e32 v3, s11
3168 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3169 ; SI-NEXT: s_waitcnt expcnt(0)
3170 ; SI-NEXT: v_mov_b32_e32 v0, s4
3171 ; SI-NEXT: v_mov_b32_e32 v1, s5
3172 ; SI-NEXT: v_mov_b32_e32 v2, s6
3173 ; SI-NEXT: v_mov_b32_e32 v3, s7
3174 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3177 ; VI-LABEL: v8f32_arg:
3178 ; VI: ; %bb.0: ; %entry
3179 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
3180 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3181 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3182 ; VI-NEXT: v_mov_b32_e32 v0, s8
3183 ; VI-NEXT: s_add_u32 s2, s0, 16
3184 ; VI-NEXT: s_addc_u32 s3, s1, 0
3185 ; VI-NEXT: v_mov_b32_e32 v5, s3
3186 ; VI-NEXT: v_mov_b32_e32 v1, s9
3187 ; VI-NEXT: v_mov_b32_e32 v2, s10
3188 ; VI-NEXT: v_mov_b32_e32 v3, s11
3189 ; VI-NEXT: v_mov_b32_e32 v4, s2
3190 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3191 ; VI-NEXT: v_mov_b32_e32 v5, s1
3192 ; VI-NEXT: v_mov_b32_e32 v0, s4
3193 ; VI-NEXT: v_mov_b32_e32 v1, s5
3194 ; VI-NEXT: v_mov_b32_e32 v2, s6
3195 ; VI-NEXT: v_mov_b32_e32 v3, s7
3196 ; VI-NEXT: v_mov_b32_e32 v4, s0
3197 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3200 ; GFX9-LABEL: v8f32_arg:
3201 ; GFX9: ; %bb.0: ; %entry
3202 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
3203 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3204 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3205 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3206 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
3207 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
3208 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
3209 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
3210 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3211 ; GFX9-NEXT: s_nop 0
3212 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
3213 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
3214 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
3215 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
3216 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3217 ; GFX9-NEXT: s_endpgm
3219 ; EG-LABEL: v8f32_arg:
3220 ; EG: ; %bb.0: ; %entry
3221 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
3222 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3223 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3225 ; EG-NEXT: ALU clause starting at 4:
3226 ; EG-NEXT: MOV * T0.W, KC0[5].X,
3227 ; EG-NEXT: MOV * T0.Z, KC0[4].W,
3228 ; EG-NEXT: MOV T0.Y, KC0[4].Z,
3229 ; EG-NEXT: MOV * T1.W, KC0[6].X,
3230 ; EG-NEXT: MOV T0.X, KC0[4].Y,
3231 ; EG-NEXT: MOV * T1.Z, KC0[5].W,
3232 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
3233 ; EG-NEXT: MOV * T1.Y, KC0[5].Z,
3234 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3235 ; EG-NEXT: MOV T1.X, KC0[5].Y,
3236 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
3237 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3238 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
3239 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3241 ; CM-LABEL: v8f32_arg:
3242 ; CM: ; %bb.0: ; %entry
3243 ; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
3244 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3245 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3247 ; CM-NEXT: ALU clause starting at 4:
3248 ; CM-NEXT: MOV * T0.W, KC0[6].X,
3249 ; CM-NEXT: MOV * T0.Z, KC0[5].W,
3250 ; CM-NEXT: MOV * T0.Y, KC0[5].Z,
3251 ; CM-NEXT: MOV T0.X, KC0[5].Y,
3252 ; CM-NEXT: MOV * T1.W, KC0[5].X,
3253 ; CM-NEXT: MOV T1.Z, KC0[4].W,
3254 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
3255 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3256 ; CM-NEXT: LSHR T2.X, PV.W, literal.x,
3257 ; CM-NEXT: MOV * T1.Y, KC0[4].Z,
3258 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3259 ; CM-NEXT: MOV * T1.X, KC0[4].Y,
3260 ; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
3261 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3263 store <8 x float> %in, ptr addrspace(1) %out, align 4
3267 ; FIXME: Pack/repack on VI
3268 define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
3269 ; SI-LABEL: v16i8_arg:
3270 ; SI: ; %bb.0: ; %entry
3271 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
3272 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3273 ; SI-NEXT: s_mov_b32 s3, 0xf000
3274 ; SI-NEXT: s_mov_b32 s2, -1
3275 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3276 ; SI-NEXT: v_mov_b32_e32 v0, s4
3277 ; SI-NEXT: v_mov_b32_e32 v1, s5
3278 ; SI-NEXT: v_mov_b32_e32 v2, s6
3279 ; SI-NEXT: v_mov_b32_e32 v3, s7
3280 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3283 ; VI-LABEL: v16i8_arg:
3284 ; VI: ; %bb.0: ; %entry
3285 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3286 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
3287 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3288 ; VI-NEXT: v_mov_b32_e32 v4, s4
3289 ; VI-NEXT: v_mov_b32_e32 v0, s0
3290 ; VI-NEXT: v_mov_b32_e32 v5, s5
3291 ; VI-NEXT: v_mov_b32_e32 v1, s1
3292 ; VI-NEXT: v_mov_b32_e32 v2, s2
3293 ; VI-NEXT: v_mov_b32_e32 v3, s3
3294 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3297 ; GFX9-LABEL: v16i8_arg:
3298 ; GFX9: ; %bb.0: ; %entry
3299 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
3300 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
3301 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3303 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
3304 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
3305 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
3306 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3307 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
3308 ; GFX9-NEXT: s_endpgm
3310 ; EG-LABEL: v16i8_arg:
3311 ; EG: ; %bb.0: ; %entry
3312 ; EG-NEXT: ALU 1, @68, KC0[], KC1[]
3313 ; EG-NEXT: TEX 0 @36
3314 ; EG-NEXT: ALU 5, @70, KC0[], KC1[]
3315 ; EG-NEXT: TEX 0 @38
3316 ; EG-NEXT: ALU 5, @76, KC0[], KC1[]
3317 ; EG-NEXT: TEX 0 @40
3318 ; EG-NEXT: ALU 5, @82, KC0[], KC1[]
3319 ; EG-NEXT: TEX 0 @42
3320 ; EG-NEXT: ALU 5, @88, KC0[], KC1[]
3321 ; EG-NEXT: TEX 0 @44
3322 ; EG-NEXT: ALU 7, @94, KC0[], KC1[]
3323 ; EG-NEXT: TEX 0 @46
3324 ; EG-NEXT: ALU 7, @102, KC0[], KC1[]
3325 ; EG-NEXT: TEX 0 @48
3326 ; EG-NEXT: ALU 7, @110, KC0[], KC1[]
3327 ; EG-NEXT: TEX 0 @50
3328 ; EG-NEXT: ALU 7, @118, KC0[], KC1[]
3329 ; EG-NEXT: TEX 0 @52
3330 ; EG-NEXT: ALU 7, @126, KC0[], KC1[]
3331 ; EG-NEXT: TEX 0 @54
3332 ; EG-NEXT: ALU 7, @134, KC0[], KC1[]
3333 ; EG-NEXT: TEX 0 @56
3334 ; EG-NEXT: ALU 7, @142, KC0[], KC1[]
3335 ; EG-NEXT: TEX 0 @58
3336 ; EG-NEXT: ALU 7, @150, KC0[], KC1[]
3337 ; EG-NEXT: TEX 0 @60
3338 ; EG-NEXT: ALU 5, @158, KC0[], KC1[]
3339 ; EG-NEXT: TEX 0 @62
3340 ; EG-NEXT: ALU 5, @164, KC0[], KC1[]
3341 ; EG-NEXT: TEX 0 @64
3342 ; EG-NEXT: ALU 5, @170, KC0[], KC1[]
3343 ; EG-NEXT: TEX 0 @66
3344 ; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
3345 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
3348 ; EG-NEXT: Fetch clause starting at 36:
3349 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
3350 ; EG-NEXT: Fetch clause starting at 38:
3351 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
3352 ; EG-NEXT: Fetch clause starting at 40:
3353 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
3354 ; EG-NEXT: Fetch clause starting at 42:
3355 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
3356 ; EG-NEXT: Fetch clause starting at 44:
3357 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
3358 ; EG-NEXT: Fetch clause starting at 46:
3359 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
3360 ; EG-NEXT: Fetch clause starting at 48:
3361 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
3362 ; EG-NEXT: Fetch clause starting at 50:
3363 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
3364 ; EG-NEXT: Fetch clause starting at 52:
3365 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
3366 ; EG-NEXT: Fetch clause starting at 54:
3367 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
3368 ; EG-NEXT: Fetch clause starting at 56:
3369 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
3370 ; EG-NEXT: Fetch clause starting at 58:
3371 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
3372 ; EG-NEXT: Fetch clause starting at 60:
3373 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
3374 ; EG-NEXT: Fetch clause starting at 62:
3375 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
3376 ; EG-NEXT: Fetch clause starting at 64:
3377 ; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
3378 ; EG-NEXT: Fetch clause starting at 66:
3379 ; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
3380 ; EG-NEXT: ALU clause starting at 68:
3381 ; EG-NEXT: MOV * T0.Y, T2.X,
3382 ; EG-NEXT: MOV * T7.X, 0.0,
3383 ; EG-NEXT: ALU clause starting at 70:
3384 ; EG-NEXT: LSHL T0.W, T8.X, literal.x,
3385 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3386 ; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
3387 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
3388 ; EG-NEXT: MOV T2.X, PV.W,
3389 ; EG-NEXT: MOV * T0.Y, T3.X,
3390 ; EG-NEXT: ALU clause starting at 76:
3391 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3392 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
3393 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3394 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3395 ; EG-NEXT: MOV T3.X, PV.W,
3396 ; EG-NEXT: MOV * T0.Y, T4.X,
3397 ; EG-NEXT: ALU clause starting at 82:
3398 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3399 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
3400 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3401 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3402 ; EG-NEXT: MOV T4.X, PV.W,
3403 ; EG-NEXT: MOV * T0.Y, T5.X,
3404 ; EG-NEXT: ALU clause starting at 88:
3405 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3406 ; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
3407 ; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3408 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3409 ; EG-NEXT: MOV T5.X, PV.W,
3410 ; EG-NEXT: MOV * T0.Y, T2.X,
3411 ; EG-NEXT: ALU clause starting at 94:
3412 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3413 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3414 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3415 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3416 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3417 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3418 ; EG-NEXT: MOV T2.X, PV.W,
3419 ; EG-NEXT: MOV * T0.Y, T3.X,
3420 ; EG-NEXT: ALU clause starting at 102:
3421 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3422 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3423 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3424 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3425 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3426 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3427 ; EG-NEXT: MOV T3.X, PV.W,
3428 ; EG-NEXT: MOV * T0.Y, T4.X,
3429 ; EG-NEXT: ALU clause starting at 110:
3430 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3431 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3432 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3433 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3434 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3435 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3436 ; EG-NEXT: MOV T4.X, PV.W,
3437 ; EG-NEXT: MOV * T0.Y, T5.X,
3438 ; EG-NEXT: ALU clause starting at 118:
3439 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3440 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3441 ; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
3442 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3443 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3444 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3445 ; EG-NEXT: MOV T5.X, PV.W,
3446 ; EG-NEXT: MOV * T0.Y, T2.X,
3447 ; EG-NEXT: ALU clause starting at 126:
3448 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3449 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3450 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3451 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3452 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3453 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3454 ; EG-NEXT: MOV T2.X, PV.W,
3455 ; EG-NEXT: MOV * T0.Y, T3.X,
3456 ; EG-NEXT: ALU clause starting at 134:
3457 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3458 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3459 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3460 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3461 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3462 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3463 ; EG-NEXT: MOV T3.X, PV.W,
3464 ; EG-NEXT: MOV * T0.Y, T4.X,
3465 ; EG-NEXT: ALU clause starting at 142:
3466 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3467 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3468 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3469 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3470 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3471 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3472 ; EG-NEXT: MOV T4.X, PV.W,
3473 ; EG-NEXT: MOV * T0.Y, T5.X,
3474 ; EG-NEXT: ALU clause starting at 150:
3475 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3476 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3477 ; EG-NEXT: 255(3.573311e-43), -65281(nan)
3478 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3479 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
3480 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3481 ; EG-NEXT: MOV T5.X, PV.W,
3482 ; EG-NEXT: MOV * T0.Y, T2.X,
3483 ; EG-NEXT: ALU clause starting at 158:
3484 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3485 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
3486 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3487 ; EG-NEXT: OR_INT * T7.W, PV.W, PS,
3488 ; EG-NEXT: MOV T2.X, PV.W,
3489 ; EG-NEXT: MOV * T0.Y, T3.X,
3490 ; EG-NEXT: ALU clause starting at 164:
3491 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3492 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
3493 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3494 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
3495 ; EG-NEXT: MOV T3.X, PV.Z,
3496 ; EG-NEXT: MOV * T0.Y, T4.X,
3497 ; EG-NEXT: ALU clause starting at 170:
3498 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3499 ; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
3500 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3501 ; EG-NEXT: OR_INT * T7.Y, PV.W, PS,
3502 ; EG-NEXT: MOV T4.X, PV.Y,
3503 ; EG-NEXT: MOV * T0.Y, T5.X,
3504 ; EG-NEXT: ALU clause starting at 176:
3505 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3506 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.y,
3507 ; EG-NEXT: -256(nan), 255(3.573311e-43)
3508 ; EG-NEXT: OR_INT T7.X, PV.W, PS,
3509 ; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
3510 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3512 ; CM-LABEL: v16i8_arg:
3513 ; CM: ; %bb.0: ; %entry
3514 ; CM-NEXT: ALU 1, @68, KC0[], KC1[]
3515 ; CM-NEXT: TEX 0 @36
3516 ; CM-NEXT: ALU 5, @70, KC0[], KC1[]
3517 ; CM-NEXT: TEX 0 @38
3518 ; CM-NEXT: ALU 5, @76, KC0[], KC1[]
3519 ; CM-NEXT: TEX 0 @40
3520 ; CM-NEXT: ALU 5, @82, KC0[], KC1[]
3521 ; CM-NEXT: TEX 0 @42
3522 ; CM-NEXT: ALU 5, @88, KC0[], KC1[]
3523 ; CM-NEXT: TEX 0 @44
3524 ; CM-NEXT: ALU 7, @94, KC0[], KC1[]
3525 ; CM-NEXT: TEX 0 @46
3526 ; CM-NEXT: ALU 7, @102, KC0[], KC1[]
3527 ; CM-NEXT: TEX 0 @48
3528 ; CM-NEXT: ALU 7, @110, KC0[], KC1[]
3529 ; CM-NEXT: TEX 0 @50
3530 ; CM-NEXT: ALU 7, @118, KC0[], KC1[]
3531 ; CM-NEXT: TEX 0 @52
3532 ; CM-NEXT: ALU 7, @126, KC0[], KC1[]
3533 ; CM-NEXT: TEX 0 @54
3534 ; CM-NEXT: ALU 7, @134, KC0[], KC1[]
3535 ; CM-NEXT: TEX 0 @56
3536 ; CM-NEXT: ALU 7, @142, KC0[], KC1[]
3537 ; CM-NEXT: TEX 0 @58
3538 ; CM-NEXT: ALU 7, @150, KC0[], KC1[]
3539 ; CM-NEXT: TEX 0 @60
3540 ; CM-NEXT: ALU 5, @158, KC0[], KC1[]
3541 ; CM-NEXT: TEX 0 @62
3542 ; CM-NEXT: ALU 5, @164, KC0[], KC1[]
3543 ; CM-NEXT: TEX 0 @64
3544 ; CM-NEXT: ALU 5, @170, KC0[], KC1[]
3545 ; CM-NEXT: TEX 0 @66
3546 ; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
3547 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
3550 ; CM-NEXT: Fetch clause starting at 36:
3551 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
3552 ; CM-NEXT: Fetch clause starting at 38:
3553 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
3554 ; CM-NEXT: Fetch clause starting at 40:
3555 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
3556 ; CM-NEXT: Fetch clause starting at 42:
3557 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
3558 ; CM-NEXT: Fetch clause starting at 44:
3559 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
3560 ; CM-NEXT: Fetch clause starting at 46:
3561 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
3562 ; CM-NEXT: Fetch clause starting at 48:
3563 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
3564 ; CM-NEXT: Fetch clause starting at 50:
3565 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
3566 ; CM-NEXT: Fetch clause starting at 52:
3567 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
3568 ; CM-NEXT: Fetch clause starting at 54:
3569 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
3570 ; CM-NEXT: Fetch clause starting at 56:
3571 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
3572 ; CM-NEXT: Fetch clause starting at 58:
3573 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
3574 ; CM-NEXT: Fetch clause starting at 60:
3575 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
3576 ; CM-NEXT: Fetch clause starting at 62:
3577 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
3578 ; CM-NEXT: Fetch clause starting at 64:
3579 ; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
3580 ; CM-NEXT: Fetch clause starting at 66:
3581 ; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
3582 ; CM-NEXT: ALU clause starting at 68:
3583 ; CM-NEXT: MOV * T0.Y, T2.X,
3584 ; CM-NEXT: MOV * T7.X, 0.0,
3585 ; CM-NEXT: ALU clause starting at 70:
3586 ; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
3587 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
3588 ; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
3589 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
3590 ; CM-NEXT: MOV T2.X, PV.W,
3591 ; CM-NEXT: MOV * T0.Y, T3.X,
3592 ; CM-NEXT: ALU clause starting at 76:
3593 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3594 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3595 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3596 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3597 ; CM-NEXT: MOV T3.X, PV.W,
3598 ; CM-NEXT: MOV * T0.Y, T4.X,
3599 ; CM-NEXT: ALU clause starting at 82:
3600 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3601 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3602 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3603 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3604 ; CM-NEXT: MOV T4.X, PV.W,
3605 ; CM-NEXT: MOV * T0.Y, T5.X,
3606 ; CM-NEXT: ALU clause starting at 88:
3607 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3608 ; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
3609 ; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
3610 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3611 ; CM-NEXT: MOV T5.X, PV.W,
3612 ; CM-NEXT: MOV * T0.Y, T2.X,
3613 ; CM-NEXT: ALU clause starting at 94:
3614 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3615 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3616 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3617 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3618 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3619 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3620 ; CM-NEXT: MOV T2.X, PV.W,
3621 ; CM-NEXT: MOV * T0.Y, T3.X,
3622 ; CM-NEXT: ALU clause starting at 102:
3623 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3624 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3625 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3626 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3627 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3628 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3629 ; CM-NEXT: MOV T3.X, PV.W,
3630 ; CM-NEXT: MOV * T0.Y, T4.X,
3631 ; CM-NEXT: ALU clause starting at 110:
3632 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3633 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3634 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3635 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3636 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3637 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3638 ; CM-NEXT: MOV T4.X, PV.W,
3639 ; CM-NEXT: MOV * T0.Y, T5.X,
3640 ; CM-NEXT: ALU clause starting at 118:
3641 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3642 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3643 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3644 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3645 ; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
3646 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3647 ; CM-NEXT: MOV T5.X, PV.W,
3648 ; CM-NEXT: MOV * T0.Y, T2.X,
3649 ; CM-NEXT: ALU clause starting at 126:
3650 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3651 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3652 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3653 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3654 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3655 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3656 ; CM-NEXT: MOV T2.X, PV.W,
3657 ; CM-NEXT: MOV * T0.Y, T3.X,
3658 ; CM-NEXT: ALU clause starting at 134:
3659 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3660 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3661 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3662 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3663 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3664 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3665 ; CM-NEXT: MOV T3.X, PV.W,
3666 ; CM-NEXT: MOV * T0.Y, T4.X,
3667 ; CM-NEXT: ALU clause starting at 142:
3668 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3669 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3670 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3671 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3672 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3673 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3674 ; CM-NEXT: MOV T4.X, PV.W,
3675 ; CM-NEXT: MOV * T0.Y, T5.X,
3676 ; CM-NEXT: ALU clause starting at 150:
3677 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
3678 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
3679 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3680 ; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
3681 ; CM-NEXT: -65281(nan), 8(1.121039e-44)
3682 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
3683 ; CM-NEXT: MOV T5.X, PV.W,
3684 ; CM-NEXT: MOV * T0.Y, T2.X,
3685 ; CM-NEXT: ALU clause starting at 158:
3686 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3687 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3688 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3689 ; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W,
3690 ; CM-NEXT: MOV T2.X, PV.W,
3691 ; CM-NEXT: MOV * T0.Y, T3.X,
3692 ; CM-NEXT: ALU clause starting at 164:
3693 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3694 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3695 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3696 ; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
3697 ; CM-NEXT: MOV T3.X, PV.Z,
3698 ; CM-NEXT: MOV * T0.Y, T4.X,
3699 ; CM-NEXT: ALU clause starting at 170:
3700 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3701 ; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
3702 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3703 ; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W,
3704 ; CM-NEXT: MOV T4.X, PV.Y,
3705 ; CM-NEXT: MOV * T0.Y, T5.X,
3706 ; CM-NEXT: ALU clause starting at 176:
3707 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
3708 ; CM-NEXT: AND_INT * T0.W, T7.X, literal.y,
3709 ; CM-NEXT: -256(nan), 255(3.573311e-43)
3710 ; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
3711 ; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
3712 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3714 store <16 x i8> %in, ptr addrspace(1) %out
3718 define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
3719 ; SI-LABEL: v16i16_arg:
3720 ; SI: ; %bb.0: ; %entry
3721 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
3722 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3723 ; SI-NEXT: s_mov_b32 s3, 0xf000
3724 ; SI-NEXT: s_mov_b32 s2, -1
3725 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3726 ; SI-NEXT: v_mov_b32_e32 v0, s8
3727 ; SI-NEXT: v_mov_b32_e32 v1, s9
3728 ; SI-NEXT: v_mov_b32_e32 v2, s10
3729 ; SI-NEXT: v_mov_b32_e32 v3, s11
3730 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3731 ; SI-NEXT: s_waitcnt expcnt(0)
3732 ; SI-NEXT: v_mov_b32_e32 v0, s4
3733 ; SI-NEXT: v_mov_b32_e32 v1, s5
3734 ; SI-NEXT: v_mov_b32_e32 v2, s6
3735 ; SI-NEXT: v_mov_b32_e32 v3, s7
3736 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3739 ; VI-LABEL: v16i16_arg:
3740 ; VI: ; %bb.0: ; %entry
3741 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
3742 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3743 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3744 ; VI-NEXT: v_mov_b32_e32 v0, s8
3745 ; VI-NEXT: s_add_u32 s2, s0, 16
3746 ; VI-NEXT: s_addc_u32 s3, s1, 0
3747 ; VI-NEXT: v_mov_b32_e32 v5, s3
3748 ; VI-NEXT: v_mov_b32_e32 v1, s9
3749 ; VI-NEXT: v_mov_b32_e32 v2, s10
3750 ; VI-NEXT: v_mov_b32_e32 v3, s11
3751 ; VI-NEXT: v_mov_b32_e32 v4, s2
3752 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3753 ; VI-NEXT: v_mov_b32_e32 v5, s1
3754 ; VI-NEXT: v_mov_b32_e32 v0, s4
3755 ; VI-NEXT: v_mov_b32_e32 v1, s5
3756 ; VI-NEXT: v_mov_b32_e32 v2, s6
3757 ; VI-NEXT: v_mov_b32_e32 v3, s7
3758 ; VI-NEXT: v_mov_b32_e32 v4, s0
3759 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3762 ; GFX9-LABEL: v16i16_arg:
3763 ; GFX9: ; %bb.0: ; %entry
3764 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
3765 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3766 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3767 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3768 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
3769 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
3770 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
3771 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
3772 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3773 ; GFX9-NEXT: s_nop 0
3774 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
3775 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
3776 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
3777 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
3778 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3779 ; GFX9-NEXT: s_endpgm
3781 ; EG-LABEL: v16i16_arg:
3782 ; EG: ; %bb.0: ; %entry
3783 ; EG-NEXT: ALU 1, @68, KC0[], KC1[]
3784 ; EG-NEXT: TEX 0 @36
3785 ; EG-NEXT: ALU 5, @70, KC0[], KC1[]
3786 ; EG-NEXT: TEX 0 @38
3787 ; EG-NEXT: ALU 5, @76, KC0[], KC1[]
3788 ; EG-NEXT: TEX 0 @40
3789 ; EG-NEXT: ALU 5, @82, KC0[], KC1[]
3790 ; EG-NEXT: TEX 0 @42
3791 ; EG-NEXT: ALU 5, @88, KC0[], KC1[]
3792 ; EG-NEXT: TEX 0 @44
3793 ; EG-NEXT: ALU 5, @94, KC0[], KC1[]
3794 ; EG-NEXT: TEX 0 @46
3795 ; EG-NEXT: ALU 5, @100, KC0[], KC1[]
3796 ; EG-NEXT: TEX 0 @48
3797 ; EG-NEXT: ALU 5, @106, KC0[], KC1[]
3798 ; EG-NEXT: TEX 0 @50
3799 ; EG-NEXT: ALU 5, @112, KC0[], KC1[]
3800 ; EG-NEXT: TEX 0 @52
3801 ; EG-NEXT: ALU 5, @118, KC0[], KC1[]
3802 ; EG-NEXT: TEX 0 @54
3803 ; EG-NEXT: ALU 5, @124, KC0[], KC1[]
3804 ; EG-NEXT: TEX 0 @56
3805 ; EG-NEXT: ALU 5, @130, KC0[], KC1[]
3806 ; EG-NEXT: TEX 0 @58
3807 ; EG-NEXT: ALU 5, @136, KC0[], KC1[]
3808 ; EG-NEXT: TEX 0 @60
3809 ; EG-NEXT: ALU 5, @142, KC0[], KC1[]
3810 ; EG-NEXT: TEX 0 @62
3811 ; EG-NEXT: ALU 5, @148, KC0[], KC1[]
3812 ; EG-NEXT: TEX 0 @64
3813 ; EG-NEXT: ALU 5, @154, KC0[], KC1[]
3814 ; EG-NEXT: TEX 0 @66
3815 ; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[]
3816 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
3817 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
3819 ; EG-NEXT: Fetch clause starting at 36:
3820 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
3821 ; EG-NEXT: Fetch clause starting at 38:
3822 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
3823 ; EG-NEXT: Fetch clause starting at 40:
3824 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
3825 ; EG-NEXT: Fetch clause starting at 42:
3826 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
3827 ; EG-NEXT: Fetch clause starting at 44:
3828 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
3829 ; EG-NEXT: Fetch clause starting at 46:
3830 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
3831 ; EG-NEXT: Fetch clause starting at 48:
3832 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
3833 ; EG-NEXT: Fetch clause starting at 50:
3834 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
3835 ; EG-NEXT: Fetch clause starting at 52:
3836 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
3837 ; EG-NEXT: Fetch clause starting at 54:
3838 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
3839 ; EG-NEXT: Fetch clause starting at 56:
3840 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
3841 ; EG-NEXT: Fetch clause starting at 58:
3842 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
3843 ; EG-NEXT: Fetch clause starting at 60:
3844 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
3845 ; EG-NEXT: Fetch clause starting at 62:
3846 ; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
3847 ; EG-NEXT: Fetch clause starting at 64:
3848 ; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
3849 ; EG-NEXT: Fetch clause starting at 66:
3850 ; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
3851 ; EG-NEXT: ALU clause starting at 68:
3852 ; EG-NEXT: MOV * T0.Y, T3.X,
3853 ; EG-NEXT: MOV * T11.X, 0.0,
3854 ; EG-NEXT: ALU clause starting at 70:
3855 ; EG-NEXT: LSHL T0.W, T12.X, literal.x,
3856 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3857 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
3858 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
3859 ; EG-NEXT: MOV T3.X, PV.W,
3860 ; EG-NEXT: MOV * T0.Y, T5.X,
3861 ; EG-NEXT: ALU clause starting at 76:
3862 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3863 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3864 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3865 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3866 ; EG-NEXT: MOV T5.X, PV.W,
3867 ; EG-NEXT: MOV * T0.Y, T7.X,
3868 ; EG-NEXT: ALU clause starting at 82:
3869 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3870 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3871 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3872 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3873 ; EG-NEXT: MOV T7.X, PV.W,
3874 ; EG-NEXT: MOV * T0.Y, T9.X,
3875 ; EG-NEXT: ALU clause starting at 88:
3876 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3877 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3878 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3879 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3880 ; EG-NEXT: MOV T9.X, PV.W,
3881 ; EG-NEXT: MOV * T0.Y, T3.X,
3882 ; EG-NEXT: ALU clause starting at 94:
3883 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3884 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3885 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3886 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3887 ; EG-NEXT: MOV T3.X, PV.W,
3888 ; EG-NEXT: MOV * T0.Y, T5.X,
3889 ; EG-NEXT: ALU clause starting at 100:
3890 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3891 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3892 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3893 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3894 ; EG-NEXT: MOV T5.X, PV.W,
3895 ; EG-NEXT: MOV * T0.Y, T7.X,
3896 ; EG-NEXT: ALU clause starting at 106:
3897 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3898 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3899 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3900 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3901 ; EG-NEXT: MOV T7.X, PV.W,
3902 ; EG-NEXT: MOV * T0.Y, T9.X,
3903 ; EG-NEXT: ALU clause starting at 112:
3904 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3905 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3906 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3907 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3908 ; EG-NEXT: MOV T9.X, PV.W,
3909 ; EG-NEXT: MOV * T0.Y, T2.X,
3910 ; EG-NEXT: ALU clause starting at 118:
3911 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3912 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3913 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3914 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3915 ; EG-NEXT: MOV T2.X, PV.W,
3916 ; EG-NEXT: MOV * T0.Y, T4.X,
3917 ; EG-NEXT: ALU clause starting at 124:
3918 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3919 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3920 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3921 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3922 ; EG-NEXT: MOV T4.X, PV.W,
3923 ; EG-NEXT: MOV * T0.Y, T6.X,
3924 ; EG-NEXT: ALU clause starting at 130:
3925 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3926 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3927 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3928 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3929 ; EG-NEXT: MOV T6.X, PV.W,
3930 ; EG-NEXT: MOV * T0.Y, T8.X,
3931 ; EG-NEXT: ALU clause starting at 136:
3932 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3933 ; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
3934 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
3935 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3936 ; EG-NEXT: MOV T8.X, PV.W,
3937 ; EG-NEXT: MOV * T0.Y, T2.X,
3938 ; EG-NEXT: ALU clause starting at 142:
3939 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3940 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3941 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3942 ; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
3943 ; EG-NEXT: MOV T2.X, PV.Z,
3944 ; EG-NEXT: MOV * T0.Y, T4.X,
3945 ; EG-NEXT: ALU clause starting at 148:
3946 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3947 ; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
3948 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3949 ; EG-NEXT: OR_INT * T12.X, PV.W, PS,
3950 ; EG-NEXT: MOV T4.X, PV.X,
3951 ; EG-NEXT: MOV * T0.Y, T6.X,
3952 ; EG-NEXT: ALU clause starting at 154:
3953 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
3954 ; EG-NEXT: AND_INT * T1.W, T13.X, literal.y,
3955 ; EG-NEXT: -65536(nan), 65535(9.183409e-41)
3956 ; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
3957 ; EG-NEXT: MOV T6.X, PV.Z,
3958 ; EG-NEXT: MOV * T0.Y, T8.X,
3959 ; EG-NEXT: ALU clause starting at 160:
3960 ; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
3961 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
3962 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
3963 ; EG-NEXT: LSHR T14.X, PV.W, literal.x,
3964 ; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
3965 ; EG-NEXT: AND_INT * T1.W, T11.X, literal.z,
3966 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
3967 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3968 ; EG-NEXT: OR_INT * T11.X, PV.W, PS,
3969 ; EG-NEXT: MOV T8.X, PV.X,
3970 ; EG-NEXT: MOV * T12.W, T3.X,
3971 ; EG-NEXT: MOV T12.Y, T5.X,
3972 ; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212
3973 ; EG-NEXT: MOV * T11.Y, T9.X,
3975 ; CM-LABEL: v16i16_arg:
3976 ; CM: ; %bb.0: ; %entry
3977 ; CM-NEXT: ALU 1, @68, KC0[], KC1[]
3978 ; CM-NEXT: TEX 0 @36
3979 ; CM-NEXT: ALU 5, @70, KC0[], KC1[]
3980 ; CM-NEXT: TEX 0 @38
3981 ; CM-NEXT: ALU 5, @76, KC0[], KC1[]
3982 ; CM-NEXT: TEX 0 @40
3983 ; CM-NEXT: ALU 5, @82, KC0[], KC1[]
3984 ; CM-NEXT: TEX 0 @42
3985 ; CM-NEXT: ALU 5, @88, KC0[], KC1[]
3986 ; CM-NEXT: TEX 0 @44
3987 ; CM-NEXT: ALU 5, @94, KC0[], KC1[]
3988 ; CM-NEXT: TEX 0 @46
3989 ; CM-NEXT: ALU 5, @100, KC0[], KC1[]
3990 ; CM-NEXT: TEX 0 @48
3991 ; CM-NEXT: ALU 5, @106, KC0[], KC1[]
3992 ; CM-NEXT: TEX 0 @50
3993 ; CM-NEXT: ALU 5, @112, KC0[], KC1[]
3994 ; CM-NEXT: TEX 0 @52
3995 ; CM-NEXT: ALU 5, @118, KC0[], KC1[]
3996 ; CM-NEXT: TEX 0 @54
3997 ; CM-NEXT: ALU 5, @124, KC0[], KC1[]
3998 ; CM-NEXT: TEX 0 @56
3999 ; CM-NEXT: ALU 5, @130, KC0[], KC1[]
4000 ; CM-NEXT: TEX 0 @58
4001 ; CM-NEXT: ALU 5, @136, KC0[], KC1[]
4002 ; CM-NEXT: TEX 0 @60
4003 ; CM-NEXT: ALU 5, @142, KC0[], KC1[]
4004 ; CM-NEXT: TEX 0 @62
4005 ; CM-NEXT: ALU 5, @148, KC0[], KC1[]
4006 ; CM-NEXT: TEX 0 @64
4007 ; CM-NEXT: ALU 5, @154, KC0[], KC1[]
4008 ; CM-NEXT: TEX 0 @66
4009 ; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[]
4010 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
4011 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
4013 ; CM-NEXT: Fetch clause starting at 36:
4014 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
4015 ; CM-NEXT: Fetch clause starting at 38:
4016 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
4017 ; CM-NEXT: Fetch clause starting at 40:
4018 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
4019 ; CM-NEXT: Fetch clause starting at 42:
4020 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
4021 ; CM-NEXT: Fetch clause starting at 44:
4022 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
4023 ; CM-NEXT: Fetch clause starting at 46:
4024 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
4025 ; CM-NEXT: Fetch clause starting at 48:
4026 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
4027 ; CM-NEXT: Fetch clause starting at 50:
4028 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
4029 ; CM-NEXT: Fetch clause starting at 52:
4030 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
4031 ; CM-NEXT: Fetch clause starting at 54:
4032 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
4033 ; CM-NEXT: Fetch clause starting at 56:
4034 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
4035 ; CM-NEXT: Fetch clause starting at 58:
4036 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
4037 ; CM-NEXT: Fetch clause starting at 60:
4038 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
4039 ; CM-NEXT: Fetch clause starting at 62:
4040 ; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
4041 ; CM-NEXT: Fetch clause starting at 64:
4042 ; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
4043 ; CM-NEXT: Fetch clause starting at 66:
4044 ; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
4045 ; CM-NEXT: ALU clause starting at 68:
4046 ; CM-NEXT: MOV * T0.Y, T3.X,
4047 ; CM-NEXT: MOV * T11.X, 0.0,
4048 ; CM-NEXT: ALU clause starting at 70:
4049 ; CM-NEXT: LSHL T0.Z, T12.X, literal.x,
4050 ; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
4051 ; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
4052 ; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
4053 ; CM-NEXT: MOV T3.X, PV.W,
4054 ; CM-NEXT: MOV * T0.Y, T5.X,
4055 ; CM-NEXT: ALU clause starting at 76:
4056 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4057 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4058 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4059 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4060 ; CM-NEXT: MOV T5.X, PV.W,
4061 ; CM-NEXT: MOV * T0.Y, T7.X,
4062 ; CM-NEXT: ALU clause starting at 82:
4063 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4064 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4065 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4066 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4067 ; CM-NEXT: MOV T7.X, PV.W,
4068 ; CM-NEXT: MOV * T0.Y, T9.X,
4069 ; CM-NEXT: ALU clause starting at 88:
4070 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4071 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4072 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4073 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4074 ; CM-NEXT: MOV T9.X, PV.W,
4075 ; CM-NEXT: MOV * T0.Y, T3.X,
4076 ; CM-NEXT: ALU clause starting at 94:
4077 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4078 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
4079 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4080 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4081 ; CM-NEXT: MOV T3.X, PV.W,
4082 ; CM-NEXT: MOV * T0.Y, T5.X,
4083 ; CM-NEXT: ALU clause starting at 100:
4084 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4085 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
4086 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4087 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4088 ; CM-NEXT: MOV T5.X, PV.W,
4089 ; CM-NEXT: MOV * T0.Y, T7.X,
4090 ; CM-NEXT: ALU clause starting at 106:
4091 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4092 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
4093 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4094 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4095 ; CM-NEXT: MOV T7.X, PV.W,
4096 ; CM-NEXT: MOV * T0.Y, T9.X,
4097 ; CM-NEXT: ALU clause starting at 112:
4098 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4099 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
4100 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4101 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4102 ; CM-NEXT: MOV T9.X, PV.W,
4103 ; CM-NEXT: MOV * T0.Y, T2.X,
4104 ; CM-NEXT: ALU clause starting at 118:
4105 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4106 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4107 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4108 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4109 ; CM-NEXT: MOV T2.X, PV.W,
4110 ; CM-NEXT: MOV * T0.Y, T4.X,
4111 ; CM-NEXT: ALU clause starting at 124:
4112 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4113 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4114 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4115 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4116 ; CM-NEXT: MOV T4.X, PV.W,
4117 ; CM-NEXT: MOV * T0.Y, T6.X,
4118 ; CM-NEXT: ALU clause starting at 130:
4119 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4120 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4121 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4122 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4123 ; CM-NEXT: MOV T6.X, PV.W,
4124 ; CM-NEXT: MOV * T0.Y, T8.X,
4125 ; CM-NEXT: ALU clause starting at 136:
4126 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4127 ; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
4128 ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
4129 ; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
4130 ; CM-NEXT: MOV T8.X, PV.W,
4131 ; CM-NEXT: MOV * T0.Y, T2.X,
4132 ; CM-NEXT: ALU clause starting at 142:
4133 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4134 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
4135 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4136 ; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W,
4137 ; CM-NEXT: MOV T2.X, PV.Z,
4138 ; CM-NEXT: MOV * T0.Y, T4.X,
4139 ; CM-NEXT: ALU clause starting at 148:
4140 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4141 ; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
4142 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4143 ; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W,
4144 ; CM-NEXT: MOV T4.X, PV.X,
4145 ; CM-NEXT: MOV * T0.Y, T6.X,
4146 ; CM-NEXT: ALU clause starting at 154:
4147 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
4148 ; CM-NEXT: AND_INT * T0.W, T13.X, literal.y,
4149 ; CM-NEXT: -65536(nan), 65535(9.183409e-41)
4150 ; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W,
4151 ; CM-NEXT: MOV T6.X, PV.Z,
4152 ; CM-NEXT: MOV * T0.Y, T8.X,
4153 ; CM-NEXT: ALU clause starting at 160:
4154 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4155 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4156 ; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
4157 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4158 ; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
4159 ; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
4160 ; CM-NEXT: AND_INT * T0.W, T11.X, literal.z,
4161 ; CM-NEXT: 2(2.802597e-45), -65536(nan)
4162 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
4163 ; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W,
4164 ; CM-NEXT: MOV T8.X, PV.X,
4165 ; CM-NEXT: MOV * T12.W, T3.X,
4166 ; CM-NEXT: MOV T12.Y, T5.X,
4167 ; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212
4168 ; CM-NEXT: MOV * T11.Y, T9.X,
4170 store <16 x i16> %in, ptr addrspace(1) %out
4174 define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind {
4175 ; SI-LABEL: v16i32_arg:
4176 ; SI: ; %bb.0: ; %entry
4177 ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
4178 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4179 ; SI-NEXT: s_mov_b32 s3, 0xf000
4180 ; SI-NEXT: s_mov_b32 s2, -1
4181 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4182 ; SI-NEXT: v_mov_b32_e32 v0, s16
4183 ; SI-NEXT: v_mov_b32_e32 v1, s17
4184 ; SI-NEXT: v_mov_b32_e32 v2, s18
4185 ; SI-NEXT: v_mov_b32_e32 v3, s19
4186 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4187 ; SI-NEXT: s_waitcnt expcnt(0)
4188 ; SI-NEXT: v_mov_b32_e32 v0, s12
4189 ; SI-NEXT: v_mov_b32_e32 v1, s13
4190 ; SI-NEXT: v_mov_b32_e32 v2, s14
4191 ; SI-NEXT: v_mov_b32_e32 v3, s15
4192 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4193 ; SI-NEXT: s_waitcnt expcnt(0)
4194 ; SI-NEXT: v_mov_b32_e32 v0, s8
4195 ; SI-NEXT: v_mov_b32_e32 v1, s9
4196 ; SI-NEXT: v_mov_b32_e32 v2, s10
4197 ; SI-NEXT: v_mov_b32_e32 v3, s11
4198 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4199 ; SI-NEXT: s_waitcnt expcnt(0)
4200 ; SI-NEXT: v_mov_b32_e32 v0, s4
4201 ; SI-NEXT: v_mov_b32_e32 v1, s5
4202 ; SI-NEXT: v_mov_b32_e32 v2, s6
4203 ; SI-NEXT: v_mov_b32_e32 v3, s7
4204 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4207 ; VI-LABEL: v16i32_arg:
4208 ; VI: ; %bb.0: ; %entry
4209 ; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
4210 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4211 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4212 ; VI-NEXT: v_mov_b32_e32 v0, s16
4213 ; VI-NEXT: s_add_u32 s2, s0, 48
4214 ; VI-NEXT: s_addc_u32 s3, s1, 0
4215 ; VI-NEXT: v_mov_b32_e32 v5, s3
4216 ; VI-NEXT: v_mov_b32_e32 v4, s2
4217 ; VI-NEXT: s_add_u32 s2, s0, 32
4218 ; VI-NEXT: v_mov_b32_e32 v1, s17
4219 ; VI-NEXT: v_mov_b32_e32 v2, s18
4220 ; VI-NEXT: v_mov_b32_e32 v3, s19
4221 ; VI-NEXT: s_addc_u32 s3, s1, 0
4222 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4223 ; VI-NEXT: v_mov_b32_e32 v5, s3
4224 ; VI-NEXT: v_mov_b32_e32 v4, s2
4225 ; VI-NEXT: s_add_u32 s2, s0, 16
4226 ; VI-NEXT: v_mov_b32_e32 v0, s12
4227 ; VI-NEXT: v_mov_b32_e32 v1, s13
4228 ; VI-NEXT: v_mov_b32_e32 v2, s14
4229 ; VI-NEXT: v_mov_b32_e32 v3, s15
4230 ; VI-NEXT: s_addc_u32 s3, s1, 0
4231 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4232 ; VI-NEXT: v_mov_b32_e32 v5, s3
4233 ; VI-NEXT: v_mov_b32_e32 v0, s8
4234 ; VI-NEXT: v_mov_b32_e32 v1, s9
4235 ; VI-NEXT: v_mov_b32_e32 v2, s10
4236 ; VI-NEXT: v_mov_b32_e32 v3, s11
4237 ; VI-NEXT: v_mov_b32_e32 v4, s2
4238 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4239 ; VI-NEXT: v_mov_b32_e32 v5, s1
4240 ; VI-NEXT: v_mov_b32_e32 v0, s4
4241 ; VI-NEXT: v_mov_b32_e32 v1, s5
4242 ; VI-NEXT: v_mov_b32_e32 v2, s6
4243 ; VI-NEXT: v_mov_b32_e32 v3, s7
4244 ; VI-NEXT: v_mov_b32_e32 v4, s0
4245 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4248 ; GFX9-LABEL: v16i32_arg:
4249 ; GFX9: ; %bb.0: ; %entry
4250 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
4251 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4252 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4253 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4254 ; GFX9-NEXT: v_mov_b32_e32 v0, s20
4255 ; GFX9-NEXT: v_mov_b32_e32 v1, s21
4256 ; GFX9-NEXT: v_mov_b32_e32 v2, s22
4257 ; GFX9-NEXT: v_mov_b32_e32 v3, s23
4258 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4259 ; GFX9-NEXT: s_nop 0
4260 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
4261 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
4262 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
4263 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
4264 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4265 ; GFX9-NEXT: s_nop 0
4266 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
4267 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
4268 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
4269 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
4270 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4271 ; GFX9-NEXT: s_nop 0
4272 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
4273 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
4274 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
4275 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
4276 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
4277 ; GFX9-NEXT: s_endpgm
4279 ; EG-LABEL: v16i32_arg:
4280 ; EG: ; %bb.0: ; %entry
4281 ; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[]
4282 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4283 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4284 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4285 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4287 ; EG-NEXT: ALU clause starting at 6:
4288 ; EG-NEXT: MOV * T0.W, KC0[7].X,
4289 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
4290 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
4291 ; EG-NEXT: MOV * T1.W, KC0[8].X,
4292 ; EG-NEXT: MOV T0.X, KC0[6].Y,
4293 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
4294 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
4295 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
4296 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4297 ; EG-NEXT: MOV * T3.W, KC0[9].X,
4298 ; EG-NEXT: MOV T1.X, KC0[7].Y,
4299 ; EG-NEXT: MOV * T3.Z, KC0[8].W,
4300 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4301 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4302 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
4303 ; EG-NEXT: MOV T3.Y, KC0[8].Z,
4304 ; EG-NEXT: MOV * T5.W, KC0[10].X,
4305 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4306 ; EG-NEXT: MOV T3.X, KC0[8].Y,
4307 ; EG-NEXT: MOV * T5.Z, KC0[9].W,
4308 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4309 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4310 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
4311 ; EG-NEXT: MOV T5.Y, KC0[9].Z,
4312 ; EG-NEXT: MOV * T5.X, KC0[9].Y,
4313 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4314 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4315 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4316 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4317 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4319 ; CM-LABEL: v16i32_arg:
4320 ; CM: ; %bb.0: ; %entry
4321 ; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[]
4322 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4323 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4324 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4325 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4327 ; CM-NEXT: ALU clause starting at 6:
4328 ; CM-NEXT: MOV * T0.W, KC0[10].X,
4329 ; CM-NEXT: MOV * T0.Z, KC0[9].W,
4330 ; CM-NEXT: MOV * T0.Y, KC0[9].Z,
4331 ; CM-NEXT: MOV T0.X, KC0[9].Y,
4332 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
4333 ; CM-NEXT: MOV * T2.W, KC0[9].X,
4334 ; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4335 ; CM-NEXT: MOV T2.Z, KC0[8].W,
4336 ; CM-NEXT: MOV * T1.W, KC0[8].X,
4337 ; CM-NEXT: LSHR T3.X, T1.Z, literal.x,
4338 ; CM-NEXT: MOV T2.Y, KC0[8].Z,
4339 ; CM-NEXT: MOV * T1.Z, KC0[7].W,
4340 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4341 ; CM-NEXT: MOV T2.X, KC0[8].Y,
4342 ; CM-NEXT: MOV * T1.Y, KC0[7].Z,
4343 ; CM-NEXT: MOV T1.X, KC0[7].Y,
4344 ; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x,
4345 ; CM-NEXT: MOV * T4.W, KC0[7].X,
4346 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4347 ; CM-NEXT: LSHR T5.X, PV.Z, literal.x,
4348 ; CM-NEXT: MOV T4.Z, KC0[6].W,
4349 ; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
4350 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
4351 ; CM-NEXT: LSHR T6.X, PV.W, literal.x,
4352 ; CM-NEXT: MOV * T4.Y, KC0[6].Z,
4353 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4354 ; CM-NEXT: MOV * T4.X, KC0[6].Y,
4355 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
4356 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4358 store <16 x i32> %in, ptr addrspace(1) %out, align 4
; Stores a <16 x float> kernel argument to %out with 4-byte alignment.
; The GCN checks (SI, VI, GFX9) expect the whole vector to be fetched by one
; s_load_dwordx16 and written back as four dwordx4 stores at byte offsets
; 48, 32, 16 and 0 from %out. The R600 checks (EG, CM) expect the elements to
; be read from KC0 constants and written with four MEM_RAT_CACHELESS stores.
4362 define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind {
4363 ; SI-LABEL: v16f32_arg:
4364 ; SI: ; %bb.0: ; %entry
4365 ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
4366 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4367 ; SI-NEXT: s_mov_b32 s3, 0xf000
4368 ; SI-NEXT: s_mov_b32 s2, -1
4369 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4370 ; SI-NEXT: v_mov_b32_e32 v0, s16
4371 ; SI-NEXT: v_mov_b32_e32 v1, s17
4372 ; SI-NEXT: v_mov_b32_e32 v2, s18
4373 ; SI-NEXT: v_mov_b32_e32 v3, s19
4374 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4375 ; SI-NEXT: s_waitcnt expcnt(0)
4376 ; SI-NEXT: v_mov_b32_e32 v0, s12
4377 ; SI-NEXT: v_mov_b32_e32 v1, s13
4378 ; SI-NEXT: v_mov_b32_e32 v2, s14
4379 ; SI-NEXT: v_mov_b32_e32 v3, s15
4380 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4381 ; SI-NEXT: s_waitcnt expcnt(0)
4382 ; SI-NEXT: v_mov_b32_e32 v0, s8
4383 ; SI-NEXT: v_mov_b32_e32 v1, s9
4384 ; SI-NEXT: v_mov_b32_e32 v2, s10
4385 ; SI-NEXT: v_mov_b32_e32 v3, s11
4386 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4387 ; SI-NEXT: s_waitcnt expcnt(0)
4388 ; SI-NEXT: v_mov_b32_e32 v0, s4
4389 ; SI-NEXT: v_mov_b32_e32 v1, s5
4390 ; SI-NEXT: v_mov_b32_e32 v2, s6
4391 ; SI-NEXT: v_mov_b32_e32 v3, s7
4392 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4395 ; VI-LABEL: v16f32_arg:
4396 ; VI: ; %bb.0: ; %entry
4397 ; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
4398 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4399 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4400 ; VI-NEXT: v_mov_b32_e32 v0, s16
4401 ; VI-NEXT: s_add_u32 s2, s0, 48
4402 ; VI-NEXT: s_addc_u32 s3, s1, 0
4403 ; VI-NEXT: v_mov_b32_e32 v5, s3
4404 ; VI-NEXT: v_mov_b32_e32 v4, s2
4405 ; VI-NEXT: s_add_u32 s2, s0, 32
4406 ; VI-NEXT: v_mov_b32_e32 v1, s17
4407 ; VI-NEXT: v_mov_b32_e32 v2, s18
4408 ; VI-NEXT: v_mov_b32_e32 v3, s19
4409 ; VI-NEXT: s_addc_u32 s3, s1, 0
4410 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4411 ; VI-NEXT: v_mov_b32_e32 v5, s3
4412 ; VI-NEXT: v_mov_b32_e32 v4, s2
4413 ; VI-NEXT: s_add_u32 s2, s0, 16
4414 ; VI-NEXT: v_mov_b32_e32 v0, s12
4415 ; VI-NEXT: v_mov_b32_e32 v1, s13
4416 ; VI-NEXT: v_mov_b32_e32 v2, s14
4417 ; VI-NEXT: v_mov_b32_e32 v3, s15
4418 ; VI-NEXT: s_addc_u32 s3, s1, 0
4419 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4420 ; VI-NEXT: v_mov_b32_e32 v5, s3
4421 ; VI-NEXT: v_mov_b32_e32 v0, s8
4422 ; VI-NEXT: v_mov_b32_e32 v1, s9
4423 ; VI-NEXT: v_mov_b32_e32 v2, s10
4424 ; VI-NEXT: v_mov_b32_e32 v3, s11
4425 ; VI-NEXT: v_mov_b32_e32 v4, s2
4426 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4427 ; VI-NEXT: v_mov_b32_e32 v5, s1
4428 ; VI-NEXT: v_mov_b32_e32 v0, s4
4429 ; VI-NEXT: v_mov_b32_e32 v1, s5
4430 ; VI-NEXT: v_mov_b32_e32 v2, s6
4431 ; VI-NEXT: v_mov_b32_e32 v3, s7
4432 ; VI-NEXT: v_mov_b32_e32 v4, s0
4433 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
4436 ; GFX9-LABEL: v16f32_arg:
4437 ; GFX9: ; %bb.0: ; %entry
4438 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
4439 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4440 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4441 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4442 ; GFX9-NEXT: v_mov_b32_e32 v0, s20
4443 ; GFX9-NEXT: v_mov_b32_e32 v1, s21
4444 ; GFX9-NEXT: v_mov_b32_e32 v2, s22
4445 ; GFX9-NEXT: v_mov_b32_e32 v3, s23
4446 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4447 ; GFX9-NEXT: s_nop 0
4448 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
4449 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
4450 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
4451 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
4452 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4453 ; GFX9-NEXT: s_nop 0
4454 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
4455 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
4456 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
4457 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
4458 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4459 ; GFX9-NEXT: s_nop 0
4460 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
4461 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
4462 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
4463 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
4464 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
4465 ; GFX9-NEXT: s_endpgm
4467 ; EG-LABEL: v16f32_arg:
4468 ; EG: ; %bb.0: ; %entry
4469 ; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[]
4470 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4471 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4472 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4473 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4475 ; EG-NEXT: ALU clause starting at 6:
4476 ; EG-NEXT: MOV * T0.W, KC0[7].X,
4477 ; EG-NEXT: MOV * T0.Z, KC0[6].W,
4478 ; EG-NEXT: MOV T0.Y, KC0[6].Z,
4479 ; EG-NEXT: MOV * T1.W, KC0[8].X,
4480 ; EG-NEXT: MOV T0.X, KC0[6].Y,
4481 ; EG-NEXT: MOV * T1.Z, KC0[7].W,
4482 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
4483 ; EG-NEXT: MOV * T1.Y, KC0[7].Z,
4484 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4485 ; EG-NEXT: MOV * T3.W, KC0[9].X,
4486 ; EG-NEXT: MOV T1.X, KC0[7].Y,
4487 ; EG-NEXT: MOV * T3.Z, KC0[8].W,
4488 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4489 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4490 ; EG-NEXT: LSHR T4.X, PV.W, literal.x,
4491 ; EG-NEXT: MOV T3.Y, KC0[8].Z,
4492 ; EG-NEXT: MOV * T5.W, KC0[10].X,
4493 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4494 ; EG-NEXT: MOV T3.X, KC0[8].Y,
4495 ; EG-NEXT: MOV * T5.Z, KC0[9].W,
4496 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4497 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4498 ; EG-NEXT: LSHR T6.X, PV.W, literal.x,
4499 ; EG-NEXT: MOV T5.Y, KC0[9].Z,
4500 ; EG-NEXT: MOV * T5.X, KC0[9].Y,
4501 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4502 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4503 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4504 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4505 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4507 ; CM-LABEL: v16f32_arg:
4508 ; CM: ; %bb.0: ; %entry
4509 ; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[]
4510 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4511 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4512 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4513 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4515 ; CM-NEXT: ALU clause starting at 6:
4516 ; CM-NEXT: MOV * T0.W, KC0[10].X,
4517 ; CM-NEXT: MOV * T0.Z, KC0[9].W,
4518 ; CM-NEXT: MOV * T0.Y, KC0[9].Z,
4519 ; CM-NEXT: MOV T0.X, KC0[9].Y,
4520 ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
4521 ; CM-NEXT: MOV * T2.W, KC0[9].X,
4522 ; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
4523 ; CM-NEXT: MOV T2.Z, KC0[8].W,
4524 ; CM-NEXT: MOV * T1.W, KC0[8].X,
4525 ; CM-NEXT: LSHR T3.X, T1.Z, literal.x,
4526 ; CM-NEXT: MOV T2.Y, KC0[8].Z,
4527 ; CM-NEXT: MOV * T1.Z, KC0[7].W,
4528 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4529 ; CM-NEXT: MOV T2.X, KC0[8].Y,
4530 ; CM-NEXT: MOV * T1.Y, KC0[7].Z,
4531 ; CM-NEXT: MOV T1.X, KC0[7].Y,
4532 ; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x,
4533 ; CM-NEXT: MOV * T4.W, KC0[7].X,
4534 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
4535 ; CM-NEXT: LSHR T5.X, PV.Z, literal.x,
4536 ; CM-NEXT: MOV T4.Z, KC0[6].W,
4537 ; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
4538 ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
4539 ; CM-NEXT: LSHR T6.X, PV.W, literal.x,
4540 ; CM-NEXT: MOV * T4.Y, KC0[6].Z,
4541 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4542 ; CM-NEXT: MOV * T4.X, KC0[6].Y,
4543 ; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
4544 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4546 store <16 x float> %in, ptr addrspace(1) %out, align 4
; Stores an i64 kernel argument to %out with 8-byte alignment.
; The GCN checks expect the pointer and the i64 to be fetched together by a
; single s_load_dwordx4 and the value to be written with one dwordx2 store.
; The R600 checks expect the two halves to be moved from KC0[2].W and KC0[3].X.
4550 define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind {
4551 ; SI-LABEL: kernel_arg_i64:
4553 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4554 ; SI-NEXT: s_mov_b32 s7, 0xf000
4555 ; SI-NEXT: s_mov_b32 s6, -1
4556 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4557 ; SI-NEXT: s_mov_b32 s4, s0
4558 ; SI-NEXT: s_mov_b32 s5, s1
4559 ; SI-NEXT: v_mov_b32_e32 v0, s2
4560 ; SI-NEXT: v_mov_b32_e32 v1, s3
4561 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4564 ; VI-LABEL: kernel_arg_i64:
4566 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4567 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4568 ; VI-NEXT: v_mov_b32_e32 v0, s0
4569 ; VI-NEXT: v_mov_b32_e32 v1, s1
4570 ; VI-NEXT: v_mov_b32_e32 v2, s2
4571 ; VI-NEXT: v_mov_b32_e32 v3, s3
4572 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4575 ; GFX9-LABEL: kernel_arg_i64:
4577 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4578 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4579 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4580 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4581 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4582 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
4583 ; GFX9-NEXT: s_endpgm
4585 ; EG-LABEL: kernel_arg_i64:
4587 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4588 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4591 ; EG-NEXT: ALU clause starting at 4:
4592 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
4593 ; EG-NEXT: MOV T0.X, KC0[2].W,
4594 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4595 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4597 ; CM-LABEL: kernel_arg_i64:
4599 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4600 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4603 ; CM-NEXT: ALU clause starting at 4:
4604 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
4605 ; CM-NEXT: MOV * T0.X, KC0[2].W,
4606 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4607 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4608 store i64 %a, ptr addrspace(1) %out, align 8
; Stores a double kernel argument to %out (no explicit alignment on the store).
; The GCN checks expect the pointer and the double to be fetched together by a
; single s_load_dwordx4 and the value to be written with one dwordx2 store.
; The R600 checks expect the two halves to be moved from KC0[2].W and KC0[3].X.
4612 define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
4613 ; SI-LABEL: f64_kernel_arg:
4614 ; SI: ; %bb.0: ; %entry
4615 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4616 ; SI-NEXT: s_mov_b32 s7, 0xf000
4617 ; SI-NEXT: s_mov_b32 s6, -1
4618 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4619 ; SI-NEXT: s_mov_b32 s4, s0
4620 ; SI-NEXT: s_mov_b32 s5, s1
4621 ; SI-NEXT: v_mov_b32_e32 v0, s2
4622 ; SI-NEXT: v_mov_b32_e32 v1, s3
4623 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4626 ; VI-LABEL: f64_kernel_arg:
4627 ; VI: ; %bb.0: ; %entry
4628 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4629 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4630 ; VI-NEXT: v_mov_b32_e32 v0, s0
4631 ; VI-NEXT: v_mov_b32_e32 v1, s1
4632 ; VI-NEXT: v_mov_b32_e32 v2, s2
4633 ; VI-NEXT: v_mov_b32_e32 v3, s3
4634 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4637 ; GFX9-LABEL: f64_kernel_arg:
4638 ; GFX9: ; %bb.0: ; %entry
4639 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4641 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4642 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4643 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4644 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
4645 ; GFX9-NEXT: s_endpgm
4647 ; EG-LABEL: f64_kernel_arg:
4648 ; EG: ; %bb.0: ; %entry
4649 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4650 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4653 ; EG-NEXT: ALU clause starting at 4:
4654 ; EG-NEXT: MOV * T0.Y, KC0[3].X,
4655 ; EG-NEXT: MOV T0.X, KC0[2].W,
4656 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4657 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4659 ; CM-LABEL: f64_kernel_arg:
4660 ; CM: ; %bb.0: ; %entry
4661 ; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
4662 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4665 ; CM-NEXT: ALU clause starting at 4:
4666 ; CM-NEXT: MOV * T0.Y, KC0[3].X,
4667 ; CM-NEXT: MOV * T0.X, KC0[2].W,
4668 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4669 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4671 store double %in, ptr addrspace(1) %out
4675 ; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
4676 ; XGCN: s_load_dwordx2
4677 ; XGCN: s_load_dwordx2
4678 ; XGCN: buffer_store_dwordx2
4679 ; define amdgpu_kernel void @kernel_arg_v1i64(ptr addrspace(1) %out, <1 x i64> %a) nounwind {
4680 ; store <1 x i64> %a, ptr addrspace(1) %out, align 8
; Stores an i65 kernel argument to %out with 4-byte alignment.
; The checks expect the value to be split into two pieces. The low 64 bits are
; written as one dwordx2 store on GCN (two single-channel stores on R600).
; The remaining dword is ANDed with 1 and written as a byte at offset 8
; (a MEM_RAT MSKOR store on R600).
4684 define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
4685 ; SI-LABEL: i65_arg:
4686 ; SI: ; %bb.0: ; %entry
4687 ; SI-NEXT: s_load_dword s4, s[0:1], 0xd
4688 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4689 ; SI-NEXT: s_mov_b32 s7, 0xf000
4690 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4691 ; SI-NEXT: s_and_b32 s8, s4, 1
4692 ; SI-NEXT: s_mov_b32 s6, -1
4693 ; SI-NEXT: s_mov_b32 s4, s0
4694 ; SI-NEXT: s_mov_b32 s5, s1
4695 ; SI-NEXT: v_mov_b32_e32 v0, s2
4696 ; SI-NEXT: v_mov_b32_e32 v1, s3
4697 ; SI-NEXT: v_mov_b32_e32 v2, s8
4698 ; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:8
4699 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4702 ; VI-LABEL: i65_arg:
4703 ; VI: ; %bb.0: ; %entry
4704 ; VI-NEXT: s_load_dword s4, s[0:1], 0x34
4705 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4706 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4707 ; VI-NEXT: s_and_b32 s4, s4, 1
4708 ; VI-NEXT: v_mov_b32_e32 v0, s0
4709 ; VI-NEXT: v_mov_b32_e32 v1, s1
4710 ; VI-NEXT: s_add_u32 s0, s0, 8
4711 ; VI-NEXT: s_addc_u32 s1, s1, 0
4712 ; VI-NEXT: v_mov_b32_e32 v5, s1
4713 ; VI-NEXT: v_mov_b32_e32 v2, s2
4714 ; VI-NEXT: v_mov_b32_e32 v6, s4
4715 ; VI-NEXT: v_mov_b32_e32 v4, s0
4716 ; VI-NEXT: v_mov_b32_e32 v3, s3
4717 ; VI-NEXT: flat_store_byte v[4:5], v6
4718 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
4721 ; GFX9-LABEL: i65_arg:
4722 ; GFX9: ; %bb.0: ; %entry
4723 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
4724 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4725 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4726 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4727 ; GFX9-NEXT: s_and_b32 s4, s6, 1
4728 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4729 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
4730 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4731 ; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8
4732 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
4733 ; GFX9-NEXT: s_endpgm
4735 ; EG-LABEL: i65_arg:
4736 ; EG: ; %bb.0: ; %entry
4737 ; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[]
4738 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
4739 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
4740 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
4743 ; EG-NEXT: ALU clause starting at 6:
4744 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4745 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
4746 ; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
4747 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4748 ; EG-NEXT: LSHL T1.W, PV.W, literal.x,
4749 ; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1,
4750 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4751 ; EG-NEXT: LSHL T1.X, PS, PV.W,
4752 ; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
4753 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4754 ; EG-NEXT: MOV T1.Y, 0.0,
4755 ; EG-NEXT: MOV * T1.Z, 0.0,
4756 ; EG-NEXT: LSHR T0.X, T0.W, literal.x,
4757 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
4758 ; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
4759 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
4760 ; EG-NEXT: MOV * T3.X, KC0[3].X,
4761 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4762 ; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
4763 ; EG-NEXT: MOV * T5.X, KC0[2].W,
4764 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4766 ; CM-LABEL: i65_arg:
4767 ; CM: ; %bb.0: ; %entry
4768 ; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[]
4769 ; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X
4770 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
4771 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
4774 ; CM-NEXT: ALU clause starting at 6:
4775 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
4776 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
4777 ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
4778 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4779 ; CM-NEXT: LSHL T0.Z, PV.W, literal.x,
4780 ; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1,
4781 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4782 ; CM-NEXT: LSHL T1.X, PV.W, PV.Z,
4783 ; CM-NEXT: LSHL * T1.W, literal.x, PV.Z,
4784 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4785 ; CM-NEXT: MOV T1.Y, 0.0,
4786 ; CM-NEXT: MOV * T1.Z, 0.0,
4787 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
4788 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4789 ; CM-NEXT: MOV T2.X, KC0[2].W,
4790 ; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
4791 ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
4792 ; CM-NEXT: LSHR * T3.X, PV.W, literal.x,
4793 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4794 ; CM-NEXT: MOV * T4.X, KC0[3].X,
4795 ; CM-NEXT: LSHR * T5.X, T0.W, literal.x,
4796 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4798 store i65 %in, ptr addrspace(1) %out, align 4
; Stores an i1 kernel argument to %out with 1-byte alignment.
; The GCN checks expect the loaded dword to be ANDed with 1 and written with a
; byte store. The R600 checks expect a VTX_READ_8 fetch, an AND with 1, and a
; MEM_RAT MSKOR store.
4802 define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
4805 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
4806 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4807 ; SI-NEXT: s_mov_b32 s3, 0xf000
4808 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4809 ; SI-NEXT: s_and_b32 s4, s2, 1
4810 ; SI-NEXT: s_mov_b32 s2, -1
4811 ; SI-NEXT: v_mov_b32_e32 v0, s4
4812 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
4817 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
4818 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4819 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4820 ; VI-NEXT: s_and_b32 s2, s2, 1
4821 ; VI-NEXT: v_mov_b32_e32 v0, s0
4822 ; VI-NEXT: v_mov_b32_e32 v1, s1
4823 ; VI-NEXT: v_mov_b32_e32 v2, s2
4824 ; VI-NEXT: flat_store_byte v[0:1], v2
4827 ; GFX9-LABEL: i1_arg:
4829 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
4830 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4831 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4832 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4833 ; GFX9-NEXT: s_and_b32 s2, s2, 1
4834 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4835 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
4836 ; GFX9-NEXT: s_endpgm
4840 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4842 ; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
4843 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
4846 ; EG-NEXT: Fetch clause starting at 6:
4847 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4848 ; EG-NEXT: ALU clause starting at 8:
4849 ; EG-NEXT: MOV * T0.X, 0.0,
4850 ; EG-NEXT: ALU clause starting at 9:
4851 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
4852 ; EG-NEXT: AND_INT * T1.W, T0.X, 1,
4853 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4854 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
4855 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4856 ; EG-NEXT: LSHL T0.X, T1.W, PV.W,
4857 ; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
4858 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4859 ; EG-NEXT: MOV T0.Y, 0.0,
4860 ; EG-NEXT: MOV * T0.Z, 0.0,
4861 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4862 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4866 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
4868 ; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
4869 ; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
4872 ; CM-NEXT: Fetch clause starting at 6:
4873 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4874 ; CM-NEXT: ALU clause starting at 8:
4875 ; CM-NEXT: MOV * T0.X, 0.0,
4876 ; CM-NEXT: ALU clause starting at 9:
4877 ; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
4878 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4879 ; CM-NEXT: AND_INT T0.Z, T0.X, 1,
4880 ; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
4881 ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
4882 ; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
4883 ; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
4884 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
4885 ; CM-NEXT: MOV T0.Y, 0.0,
4886 ; CM-NEXT: MOV * T0.Z, 0.0,
4887 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4888 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4889 store i1 %x, ptr addrspace(1) %out, align 1
; Zero-extends an i1 kernel argument to i32 and stores it to %out (align 4).
; The GCN checks expect the loaded dword to be ANDed with 1 and written with a
; dword store. The R600 checks expect a VTX_READ_8 fetch followed by a
; MEM_RAT_CACHELESS store.
4893 define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
4894 ; SI-LABEL: i1_arg_zext_i32:
4896 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
4897 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4898 ; SI-NEXT: s_mov_b32 s3, 0xf000
4899 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4900 ; SI-NEXT: s_and_b32 s4, s2, 1
4901 ; SI-NEXT: s_mov_b32 s2, -1
4902 ; SI-NEXT: v_mov_b32_e32 v0, s4
4903 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4906 ; VI-LABEL: i1_arg_zext_i32:
4908 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
4909 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4910 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4911 ; VI-NEXT: s_and_b32 s2, s2, 1
4912 ; VI-NEXT: v_mov_b32_e32 v0, s0
4913 ; VI-NEXT: v_mov_b32_e32 v1, s1
4914 ; VI-NEXT: v_mov_b32_e32 v2, s2
4915 ; VI-NEXT: flat_store_dword v[0:1], v2
4918 ; GFX9-LABEL: i1_arg_zext_i32:
4920 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
4921 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4922 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4923 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4924 ; GFX9-NEXT: s_and_b32 s2, s2, 1
4925 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4926 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4927 ; GFX9-NEXT: s_endpgm
4929 ; EG-LABEL: i1_arg_zext_i32:
4931 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
4933 ; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
4934 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4937 ; EG-NEXT: Fetch clause starting at 6:
4938 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4939 ; EG-NEXT: ALU clause starting at 8:
4940 ; EG-NEXT: MOV * T0.X, 0.0,
4941 ; EG-NEXT: ALU clause starting at 9:
4942 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4943 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4945 ; CM-LABEL: i1_arg_zext_i32:
4947 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
4949 ; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
4950 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4953 ; CM-NEXT: Fetch clause starting at 6:
4954 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
4955 ; CM-NEXT: ALU clause starting at 8:
4956 ; CM-NEXT: MOV * T0.X, 0.0,
4957 ; CM-NEXT: ALU clause starting at 9:
4958 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
4959 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4960 %ext = zext i1 %x to i32
4961 store i32 %ext, ptr addrspace(1) %out, align 4
; Zero-extends an i1 kernel argument to i64 and stores it to %out (align 8).
; The GCN checks expect the loaded dword to be ANDed with 1 for the low half,
; a constant 0 to be moved into the high half, and one dwordx2 store.
; The R600 checks expect T0.Y to be set to 0.0 before a two-channel store.
4965 define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
4966 ; SI-LABEL: i1_arg_zext_i64:
4968 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
4969 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4970 ; SI-NEXT: s_mov_b32 s3, 0xf000
4971 ; SI-NEXT: s_mov_b32 s2, -1
4972 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4973 ; SI-NEXT: s_and_b32 s4, s4, 1
4974 ; SI-NEXT: v_mov_b32_e32 v1, 0
4975 ; SI-NEXT: v_mov_b32_e32 v0, s4
4976 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4979 ; VI-LABEL: i1_arg_zext_i64:
4981 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
4982 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4983 ; VI-NEXT: v_mov_b32_e32 v1, 0
4984 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4985 ; VI-NEXT: s_and_b32 s2, s2, 1
4986 ; VI-NEXT: v_mov_b32_e32 v3, s1
4987 ; VI-NEXT: v_mov_b32_e32 v0, s2
4988 ; VI-NEXT: v_mov_b32_e32 v2, s0
4989 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
4992 ; GFX9-LABEL: i1_arg_zext_i64:
4994 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
4995 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4996 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4997 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4998 ; GFX9-NEXT: s_and_b32 s2, s2, 1
4999 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5000 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
5001 ; GFX9-NEXT: s_endpgm
5003 ; EG-LABEL: i1_arg_zext_i64:
5005 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
5007 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
5008 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5011 ; EG-NEXT: Fetch clause starting at 6:
5012 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5013 ; EG-NEXT: ALU clause starting at 8:
5014 ; EG-NEXT: MOV * T0.X, 0.0,
5015 ; EG-NEXT: ALU clause starting at 9:
5016 ; EG-NEXT: MOV * T0.Y, 0.0,
5017 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
5018 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5020 ; CM-LABEL: i1_arg_zext_i64:
5022 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
5024 ; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
5025 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5028 ; CM-NEXT: Fetch clause starting at 6:
5029 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5030 ; CM-NEXT: ALU clause starting at 8:
5031 ; CM-NEXT: MOV * T0.X, 0.0,
5032 ; CM-NEXT: ALU clause starting at 9:
5033 ; CM-NEXT: MOV * T0.Y, 0.0,
5034 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
5035 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5036 %ext = zext i1 %x to i64
5037 store i64 %ext, ptr addrspace(1) %out, align 8
; Sign-extends an i1 kernel argument to i32 and stores it to %out (align 4).
; The GCN checks expect the extension to be done by s_bfe_i32 with the
; immediate 0x10000, followed by a dword store. The R600 checks expect a
; BFE_INT on the fetched byte.
5041 define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
5042 ; SI-LABEL: i1_arg_sext_i32:
5044 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
5045 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
5046 ; SI-NEXT: s_mov_b32 s3, 0xf000
5047 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5048 ; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
5049 ; SI-NEXT: s_mov_b32 s2, -1
5050 ; SI-NEXT: v_mov_b32_e32 v0, s4
5051 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5054 ; VI-LABEL: i1_arg_sext_i32:
5056 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
5057 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
5058 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5059 ; VI-NEXT: s_bfe_i32 s2, s2, 0x10000
5060 ; VI-NEXT: v_mov_b32_e32 v0, s0
5061 ; VI-NEXT: v_mov_b32_e32 v1, s1
5062 ; VI-NEXT: v_mov_b32_e32 v2, s2
5063 ; VI-NEXT: flat_store_dword v[0:1], v2
5066 ; GFX9-LABEL: i1_arg_sext_i32:
5068 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
5069 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5070 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5071 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5072 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000
5073 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5074 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5075 ; GFX9-NEXT: s_endpgm
5077 ; EG-LABEL: i1_arg_sext_i32:
5079 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
5081 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
5082 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
5085 ; EG-NEXT: Fetch clause starting at 6:
5086 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5087 ; EG-NEXT: ALU clause starting at 8:
5088 ; EG-NEXT: MOV * T0.X, 0.0,
5089 ; EG-NEXT: ALU clause starting at 9:
5090 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
5091 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
5092 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5094 ; CM-LABEL: i1_arg_sext_i32:
5096 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
5098 ; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
5099 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
5102 ; CM-NEXT: Fetch clause starting at 6:
5103 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5104 ; CM-NEXT: ALU clause starting at 8:
5105 ; CM-NEXT: MOV * T0.X, 0.0,
5106 ; CM-NEXT: ALU clause starting at 9:
5107 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1,
5108 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
5109 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5110 %ext = sext i1 %x to i32
5111 store i32 %ext, ptr addrspace(1) %out, align 4
; Sign-extends an i1 kernel argument to i64 and stores it to %out (align 8).
; The GCN checks expect the extension to be done by s_bfe_i64 with the
; immediate 0x10000, followed by one dwordx2 store. The R600 checks expect a
; BFE_INT whose result is also copied into T0.Y before the two-channel store.
5115 define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
5116 ; SI-LABEL: i1_arg_sext_i64:
5118 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
5119 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
5120 ; SI-NEXT: s_mov_b32 s3, 0xf000
5121 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5122 ; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
5123 ; SI-NEXT: s_mov_b32 s2, -1
5124 ; SI-NEXT: v_mov_b32_e32 v0, s4
5125 ; SI-NEXT: v_mov_b32_e32 v1, s5
5126 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5129 ; VI-LABEL: i1_arg_sext_i64:
5131 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
5132 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
5133 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5134 ; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
5135 ; VI-NEXT: v_mov_b32_e32 v0, s0
5136 ; VI-NEXT: v_mov_b32_e32 v2, s2
5137 ; VI-NEXT: v_mov_b32_e32 v1, s1
5138 ; VI-NEXT: v_mov_b32_e32 v3, s3
5139 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
5142 ; GFX9-LABEL: i1_arg_sext_i64:
5144 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8
5145 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
5146 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5147 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5148 ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
5149 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5150 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5151 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5152 ; GFX9-NEXT: s_endpgm
5154 ; EG-LABEL: i1_arg_sext_i64:
5156 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
5158 ; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
5159 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5162 ; EG-NEXT: Fetch clause starting at 6:
5163 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5164 ; EG-NEXT: ALU clause starting at 8:
5165 ; EG-NEXT: MOV * T0.X, 0.0,
5166 ; EG-NEXT: ALU clause starting at 9:
5167 ; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1,
5168 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
5169 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5170 ; EG-NEXT: MOV * T0.Y, PV.X,
5172 ; CM-LABEL: i1_arg_sext_i64:
5174 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
5176 ; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
5177 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5180 ; CM-NEXT: Fetch clause starting at 6:
5181 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
5182 ; CM-NEXT: ALU clause starting at 8:
5183 ; CM-NEXT: MOV * T0.X, 0.0,
5184 ; CM-NEXT: ALU clause starting at 9:
5185 ; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1,
5186 ; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
5187 ; CM-NEXT: MOV * T0.Y, PV.X,
5188 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
5189 %ext = sext i1 %x to i64
5190 store i64 %ext, ptr addrspace(1) %out, align 8
; Kernel whose only argument is an empty struct ({}). The check lines below
; record the code expected for it on each tested target.
5194 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
5195 ; SI-LABEL: empty_struct_arg:
5199 ; VI-LABEL: empty_struct_arg:
5203 ; GFX9-LABEL: empty_struct_arg:
5205 ; GFX9-NEXT: s_endpgm
5207 ; EGCM-LABEL: empty_struct_arg:
5214 ; The correct load offsets for these:
5220 ; With the SelectionDAG argument lowering, the alignment of the
5221 ; struct members is not properly considered, making these wrong.
5223 ; FIXME: Total argument size is computed wrong
; Kernel taking two {i32, i64} struct arguments separated by an unnamed i8.
; Both members of each struct are extracted and written out with volatile
; stores, so the expected output contains one argument load per member (in the
; GFX9 checks these are at offsets 0x0, 0x8, 0x18 and 0x20).
5224 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
5225 ; SI-LABEL: struct_argument_alignment:
5227 ; SI-NEXT: s_load_dword s8, s[0:1], 0x9
5228 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
5229 ; SI-NEXT: s_load_dword s9, s[0:1], 0xf
5230 ; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11
5231 ; SI-NEXT: s_mov_b32 s0, 0
5232 ; SI-NEXT: s_mov_b32 s3, 0xf000
5233 ; SI-NEXT: s_mov_b32 s2, -1
5234 ; SI-NEXT: s_mov_b32 s1, s0
5235 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5236 ; SI-NEXT: v_mov_b32_e32 v0, s8
5237 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5238 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5239 ; SI-NEXT: v_mov_b32_e32 v0, s4
5240 ; SI-NEXT: v_mov_b32_e32 v1, s5
5241 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5242 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5243 ; SI-NEXT: v_mov_b32_e32 v0, s9
5244 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5245 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5246 ; SI-NEXT: v_mov_b32_e32 v0, s6
5247 ; SI-NEXT: v_mov_b32_e32 v1, s7
5248 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5249 ; SI-NEXT: s_waitcnt vmcnt(0)
5252 ; VI-LABEL: struct_argument_alignment:
5254 ; VI-NEXT: s_load_dword s4, s[0:1], 0x24
5255 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5256 ; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
5257 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
5258 ; VI-NEXT: v_mov_b32_e32 v0, 0
5259 ; VI-NEXT: v_mov_b32_e32 v1, 0
5260 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5261 ; VI-NEXT: v_mov_b32_e32 v2, s4
5262 ; VI-NEXT: flat_store_dword v[0:1], v2
5263 ; VI-NEXT: s_waitcnt vmcnt(0)
5264 ; VI-NEXT: v_mov_b32_e32 v2, s2
5265 ; VI-NEXT: v_mov_b32_e32 v3, s3
5266 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
5267 ; VI-NEXT: s_waitcnt vmcnt(0)
5268 ; VI-NEXT: v_mov_b32_e32 v2, s5
5269 ; VI-NEXT: flat_store_dword v[0:1], v2
5270 ; VI-NEXT: s_waitcnt vmcnt(0)
5271 ; VI-NEXT: v_mov_b32_e32 v3, s1
5272 ; VI-NEXT: v_mov_b32_e32 v2, s0
5273 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
5274 ; VI-NEXT: s_waitcnt vmcnt(0)
5277 ; GFX9-LABEL: struct_argument_alignment:
5279 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
5280 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
5281 ; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18
5282 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20
5283 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5284 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5285 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5286 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
5287 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
5288 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5289 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5290 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
5291 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
5292 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5293 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
5294 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
5295 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5296 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5297 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
5298 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
5299 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5300 ; GFX9-NEXT: s_endpgm
5302 ; EG-LABEL: struct_argument_alignment:
5304 ; EG-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
5305 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
5306 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
5307 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
5308 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
5309 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
5310 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
5312 ; EG-NEXT: ALU clause starting at 8:
5313 ; EG-NEXT: MOV T0.X, KC0[4].Y,
5314 ; EG-NEXT: MOV * T1.X, KC0[4].Z,
5315 ; EG-NEXT: MOV T2.X, KC0[3].W,
5316 ; EG-NEXT: MOV * T3.X, KC0[2].W,
5317 ; EG-NEXT: MOV T4.X, literal.x,
5318 ; EG-NEXT: MOV * T5.X, KC0[3].X,
5319 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5320 ; EG-NEXT: MOV T6.X, literal.x,
5321 ; EG-NEXT: MOV * T7.X, KC0[2].Y,
5322 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5324 ; CM-LABEL: struct_argument_alignment:
5326 ; CM-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
5327 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
5328 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
5329 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
5330 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
5331 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5332 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
5334 ; CM-NEXT: ALU clause starting at 8:
5335 ; CM-NEXT: MOV * T0.X, KC0[4].Y,
5336 ; CM-NEXT: MOV * T1.X, KC0[4].Z,
5337 ; CM-NEXT: MOV * T2.X, KC0[3].W,
5338 ; CM-NEXT: MOV * T3.X, KC0[2].W,
5339 ; CM-NEXT: MOV * T4.X, literal.x,
5340 ; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5341 ; CM-NEXT: MOV * T5.X, KC0[3].X,
5342 ; CM-NEXT: MOV * T6.X, literal.x,
5343 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5344 ; CM-NEXT: MOV * T7.X, KC0[2].Y,
; Pull both members out of each struct argument.
5345 %val0 = extractvalue {i32, i64} %arg0, 0
5346 %val1 = extractvalue {i32, i64} %arg0, 1
5347 %val2 = extractvalue {i32, i64} %arg1, 0
5348 %val3 = extractvalue {i32, i64} %arg1, 1
; Volatile stores to a null global pointer give every extracted member a use.
5349 store volatile i32 %val0, ptr addrspace(1) null
5350 store volatile i64 %val1, ptr addrspace(1) null
5351 store volatile i32 %val2, ptr addrspace(1) null
5352 store volatile i64 %val3, ptr addrspace(1) null
5356 ; No padding between i8 and next struct, but round up at end to 4 byte
; Packed-struct variant: two <{i32, i64}> arguments separated by an unnamed i8.
; Both members of each struct are extracted and written out with volatile
; stores. In the GFX9 checks the first struct is read with scalar loads at
; offsets 0x0 and 0x4, and the second with global loads at byte offsets 13
; and 17.
5358 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
5359 ; SI-LABEL: packed_struct_argument_alignment:
5361 ; SI-NEXT: s_mov_b32 s3, 0xf000
5362 ; SI-NEXT: s_mov_b32 s2, -1
5363 ; SI-NEXT: s_load_dword s6, s[0:1], 0x9
5364 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xa
5365 ; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:49
5366 ; SI-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:50
5367 ; SI-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:51
5368 ; SI-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:52
5369 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53
5370 ; SI-NEXT: s_mov_b32 s0, 0
5371 ; SI-NEXT: s_mov_b32 s1, s0
5372 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5373 ; SI-NEXT: v_mov_b32_e32 v2, s6
5374 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
5375 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5376 ; SI-NEXT: v_mov_b32_e32 v2, s4
5377 ; SI-NEXT: v_mov_b32_e32 v3, s5
5378 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
5379 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5380 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
5381 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
5382 ; SI-NEXT: v_or_b32_e32 v2, v2, v4
5383 ; SI-NEXT: v_or_b32_e32 v3, v3, v6
5384 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
5385 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
5386 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
5387 ; SI-NEXT: s_waitcnt vmcnt(0)
5388 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5389 ; SI-NEXT: s_waitcnt vmcnt(0)
5392 ; VI-LABEL: packed_struct_argument_alignment:
5394 ; VI-NEXT: s_add_u32 s2, s0, 49
5395 ; VI-NEXT: s_addc_u32 s3, s1, 0
5396 ; VI-NEXT: s_add_u32 s4, s0, 50
5397 ; VI-NEXT: s_addc_u32 s5, s1, 0
5398 ; VI-NEXT: v_mov_b32_e32 v2, s2
5399 ; VI-NEXT: v_mov_b32_e32 v3, s3
5400 ; VI-NEXT: s_add_u32 s2, s2, 3
5401 ; VI-NEXT: s_addc_u32 s3, s3, 0
5402 ; VI-NEXT: v_mov_b32_e32 v5, s3
5403 ; VI-NEXT: v_mov_b32_e32 v4, s2
5404 ; VI-NEXT: s_add_u32 s2, s0, 51
5405 ; VI-NEXT: s_addc_u32 s3, s1, 0
5406 ; VI-NEXT: v_mov_b32_e32 v0, s4
5407 ; VI-NEXT: v_mov_b32_e32 v7, s3
5408 ; VI-NEXT: v_mov_b32_e32 v1, s5
5409 ; VI-NEXT: v_mov_b32_e32 v6, s2
5410 ; VI-NEXT: flat_load_ubyte v8, v[0:1]
5411 ; VI-NEXT: flat_load_ubyte v9, v[2:3]
5412 ; VI-NEXT: flat_load_ubyte v10, v[4:5]
5413 ; VI-NEXT: flat_load_ubyte v6, v[6:7]
5414 ; VI-NEXT: s_add_u32 s2, s0, 53
5415 ; VI-NEXT: s_addc_u32 s3, s1, 0
5416 ; VI-NEXT: v_mov_b32_e32 v0, s2
5417 ; VI-NEXT: v_mov_b32_e32 v1, s3
5418 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5419 ; VI-NEXT: s_load_dword s2, s[0:1], 0x24
5420 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28
5421 ; VI-NEXT: v_mov_b32_e32 v2, 0
5422 ; VI-NEXT: v_mov_b32_e32 v3, 0
5423 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5424 ; VI-NEXT: v_mov_b32_e32 v7, s2
5425 ; VI-NEXT: v_mov_b32_e32 v5, s1
5426 ; VI-NEXT: v_mov_b32_e32 v4, s0
5427 ; VI-NEXT: flat_store_dword v[2:3], v7
5428 ; VI-NEXT: s_waitcnt vmcnt(0)
5429 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
5430 ; VI-NEXT: s_waitcnt vmcnt(0)
5431 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
5432 ; VI-NEXT: v_or_b32_e32 v4, v4, v9
5433 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
5434 ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5435 ; VI-NEXT: v_or_b32_e32 v4, v5, v4
5436 ; VI-NEXT: flat_store_dword v[2:3], v4
5437 ; VI-NEXT: s_waitcnt vmcnt(0)
5438 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5439 ; VI-NEXT: s_waitcnt vmcnt(0)
5442 ; GFX9-LABEL: packed_struct_argument_alignment:
5444 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5445 ; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13
5446 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17
5447 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
5448 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
5449 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5450 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
5451 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5452 ; GFX9-NEXT: v_mov_b32_e32 v7, s2
5453 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
5454 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
5455 ; GFX9-NEXT: global_store_dword v[2:3], v7, off
5456 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5457 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
5458 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5459 ; GFX9-NEXT: global_store_dword v[2:3], v6, off
5460 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5461 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
5462 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5463 ; GFX9-NEXT: s_endpgm
5465 ; EG-LABEL: packed_struct_argument_alignment:
5467 ; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
5468 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
5469 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5470 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5471 ; EG-NEXT: ALU 2, @25, KC0[], KC1[]
5472 ; EG-NEXT: TEX 0 @12
5473 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5474 ; EG-NEXT: TEX 0 @14
5475 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5476 ; EG-NEXT: TEX 0 @16
5477 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
5479 ; EG-NEXT: Fetch clause starting at 12:
5480 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
5481 ; EG-NEXT: Fetch clause starting at 14:
5482 ; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
5483 ; EG-NEXT: Fetch clause starting at 16:
5484 ; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
5485 ; EG-NEXT: ALU clause starting at 18:
5486 ; EG-NEXT: MOV T0.X, KC0[2].Z,
5487 ; EG-NEXT: MOV * T1.X, literal.x,
5488 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5489 ; EG-NEXT: MOV T2.X, KC0[2].W,
5490 ; EG-NEXT: MOV * T3.X, literal.x,
5491 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5492 ; EG-NEXT: MOV * T4.X, KC0[2].Y,
5493 ; EG-NEXT: ALU clause starting at 25:
5494 ; EG-NEXT: MOV T0.X, 0.0,
5495 ; EG-NEXT: MOV * T2.X, 0.0,
5496 ; EG-NEXT: MOV * T4.X, 0.0,
5498 ; CM-LABEL: packed_struct_argument_alignment:
5500 ; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
5501 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5502 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5503 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5504 ; CM-NEXT: ALU 2, @25, KC0[], KC1[]
5505 ; CM-NEXT: TEX 0 @12
5506 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5507 ; CM-NEXT: TEX 0 @14
5508 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5509 ; CM-NEXT: TEX 0 @16
5510 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5512 ; CM-NEXT: Fetch clause starting at 12:
5513 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
5514 ; CM-NEXT: Fetch clause starting at 14:
5515 ; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
5516 ; CM-NEXT: Fetch clause starting at 16:
5517 ; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
5518 ; CM-NEXT: ALU clause starting at 18:
5519 ; CM-NEXT: MOV * T0.X, KC0[2].Z,
5520 ; CM-NEXT: MOV * T1.X, literal.x,
5521 ; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5522 ; CM-NEXT: MOV * T2.X, KC0[2].W,
5523 ; CM-NEXT: MOV * T3.X, literal.x,
5524 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5525 ; CM-NEXT: MOV * T4.X, KC0[2].Y,
5526 ; CM-NEXT: ALU clause starting at 25:
5527 ; CM-NEXT: MOV * T0.X, 0.0,
5528 ; CM-NEXT: MOV * T2.X, 0.0,
5529 ; CM-NEXT: MOV * T4.X, 0.0,
; Pull both members out of each packed struct argument.
5530 %val0 = extractvalue <{i32, i64}> %arg0, 0
5531 %val1 = extractvalue <{i32, i64}> %arg0, 1
5532 %val2 = extractvalue <{i32, i64}> %arg1, 0
5533 %val3 = extractvalue <{i32, i64}> %arg1, 1
; Volatile stores to a null global pointer give every extracted member a use.
5534 store volatile i32 %val0, ptr addrspace(1) null
5535 store volatile i64 %val1, ptr addrspace(1) null
5536 store volatile i32 %val2, ptr addrspace(1) null
5537 store volatile i64 %val3, ptr addrspace(1) null
; Kernel with two {i32, i64} struct arguments, each followed by an unnamed i8,
; and a trailing <4 x i32> argument. The struct members and the vector are all
; written out with volatile stores. In the GFX9 checks the struct members are
; loaded from offsets 0x0, 0x8, 0x18 and 0x20, and the vector from 0x30.
5541 define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
5542 ; SI-LABEL: struct_argument_alignment_after:
5544 ; SI-NEXT: s_load_dword s12, s[0:1], 0x9
5545 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
5546 ; SI-NEXT: s_load_dword s13, s[0:1], 0xf
5547 ; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11
5548 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15
5549 ; SI-NEXT: s_mov_b32 s4, 0
5550 ; SI-NEXT: s_mov_b32 s7, 0xf000
5551 ; SI-NEXT: s_mov_b32 s6, -1
5552 ; SI-NEXT: s_mov_b32 s5, s4
5553 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5554 ; SI-NEXT: v_mov_b32_e32 v0, s12
5555 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5556 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5557 ; SI-NEXT: v_mov_b32_e32 v0, s8
5558 ; SI-NEXT: v_mov_b32_e32 v1, s9
5559 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5560 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5561 ; SI-NEXT: v_mov_b32_e32 v0, s13
5562 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5563 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5564 ; SI-NEXT: v_mov_b32_e32 v0, s10
5565 ; SI-NEXT: v_mov_b32_e32 v1, s11
5566 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5567 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5568 ; SI-NEXT: v_mov_b32_e32 v0, s0
5569 ; SI-NEXT: v_mov_b32_e32 v1, s1
5570 ; SI-NEXT: v_mov_b32_e32 v2, s2
5571 ; SI-NEXT: v_mov_b32_e32 v3, s3
5572 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5573 ; SI-NEXT: s_waitcnt vmcnt(0)
5576 ; VI-LABEL: struct_argument_alignment_after:
5578 ; VI-NEXT: s_load_dword s8, s[0:1], 0x24
5579 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
5580 ; VI-NEXT: s_load_dword s9, s[0:1], 0x3c
5581 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44
5582 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
5583 ; VI-NEXT: v_mov_b32_e32 v4, 0
5584 ; VI-NEXT: v_mov_b32_e32 v5, 0
5585 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5586 ; VI-NEXT: v_mov_b32_e32 v0, s8
5587 ; VI-NEXT: flat_store_dword v[4:5], v0
5588 ; VI-NEXT: s_waitcnt vmcnt(0)
5589 ; VI-NEXT: v_mov_b32_e32 v0, s4
5590 ; VI-NEXT: v_mov_b32_e32 v1, s5
5591 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
5592 ; VI-NEXT: s_waitcnt vmcnt(0)
5593 ; VI-NEXT: v_mov_b32_e32 v0, s9
5594 ; VI-NEXT: flat_store_dword v[4:5], v0
5595 ; VI-NEXT: s_waitcnt vmcnt(0)
5596 ; VI-NEXT: v_mov_b32_e32 v0, s6
5597 ; VI-NEXT: v_mov_b32_e32 v1, s7
5598 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
5599 ; VI-NEXT: s_waitcnt vmcnt(0)
5600 ; VI-NEXT: v_mov_b32_e32 v0, s0
5601 ; VI-NEXT: v_mov_b32_e32 v1, s1
5602 ; VI-NEXT: v_mov_b32_e32 v2, s2
5603 ; VI-NEXT: v_mov_b32_e32 v3, s3
5604 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
5605 ; VI-NEXT: s_waitcnt vmcnt(0)
5608 ; GFX9-LABEL: struct_argument_alignment_after:
5610 ; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0
5611 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
5612 ; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18
5613 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20
5614 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30
5615 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5616 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
5617 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5618 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
5619 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
5620 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5621 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
5622 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
5623 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
5624 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5625 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
5626 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
5627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5628 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
5629 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
5630 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
5631 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5632 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5633 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5634 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5635 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
5636 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
5637 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5638 ; GFX9-NEXT: s_endpgm
5640 ; EG-LABEL: struct_argument_alignment_after:
5642 ; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
5643 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
5644 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
5645 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
5646 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
5647 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
5648 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
5649 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
5652 ; EG-NEXT: ALU clause starting at 10:
5653 ; EG-NEXT: MOV * T0.W, KC0[6].X,
5654 ; EG-NEXT: MOV * T0.Z, KC0[5].W,
5655 ; EG-NEXT: MOV * T0.Y, KC0[5].Z,
5656 ; EG-NEXT: MOV T0.X, KC0[5].Y,
5657 ; EG-NEXT: MOV * T1.X, KC0[4].Y,
5658 ; EG-NEXT: MOV T2.X, KC0[4].Z,
5659 ; EG-NEXT: MOV * T3.X, KC0[3].W,
5660 ; EG-NEXT: MOV T4.X, KC0[2].W,
5661 ; EG-NEXT: MOV * T5.X, literal.x,
5662 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5663 ; EG-NEXT: MOV T6.X, KC0[3].X,
5664 ; EG-NEXT: MOV * T7.X, literal.x,
5665 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5666 ; EG-NEXT: MOV * T8.X, KC0[2].Y,
5668 ; CM-LABEL: struct_argument_alignment_after:
5670 ; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
5671 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
5672 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
5673 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
5674 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
5675 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
5676 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
5677 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
5680 ; CM-NEXT: ALU clause starting at 10:
5681 ; CM-NEXT: MOV * T0.W, KC0[6].X,
5682 ; CM-NEXT: MOV * T0.Z, KC0[5].W,
5683 ; CM-NEXT: MOV * T0.Y, KC0[5].Z,
5684 ; CM-NEXT: MOV * T0.X, KC0[5].Y,
5685 ; CM-NEXT: MOV * T1.X, KC0[4].Y,
5686 ; CM-NEXT: MOV * T2.X, KC0[4].Z,
5687 ; CM-NEXT: MOV * T3.X, KC0[3].W,
5688 ; CM-NEXT: MOV * T4.X, KC0[2].W,
5689 ; CM-NEXT: MOV * T5.X, literal.x,
5690 ; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
5691 ; CM-NEXT: MOV * T6.X, KC0[3].X,
5692 ; CM-NEXT: MOV * T7.X, literal.x,
5693 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5694 ; CM-NEXT: MOV * T8.X, KC0[2].Y,
; Pull both members out of each struct argument.
5695 %val0 = extractvalue {i32, i64} %arg0, 0
5696 %val1 = extractvalue {i32, i64} %arg0, 1
5697 %val2 = extractvalue {i32, i64} %arg2, 0
5698 %val3 = extractvalue {i32, i64} %arg2, 1
; Volatile stores to a null global pointer give the struct members and the
; trailing vector argument a use.
5699 store volatile i32 %val0, ptr addrspace(1) null
5700 store volatile i64 %val1, ptr addrspace(1) null
5701 store volatile i32 %val2, ptr addrspace(1) null
5702 store volatile i64 %val3, ptr addrspace(1) null
5703 store volatile <4 x i32> %arg4, ptr addrspace(1) null
; Kernel with an i16 argument followed by a [3 x i32] array argument; both are
; written out with volatile stores. In the SI, VI and GFX9 checks a single
; s_load_dwordx4 fetches the i16 and all three array elements.
5707 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
5708 ; SI-LABEL: array_3xi32:
5710 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5711 ; SI-NEXT: s_mov_b32 s7, 0xf000
5712 ; SI-NEXT: s_mov_b32 s6, -1
5713 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5714 ; SI-NEXT: v_mov_b32_e32 v0, s0
5715 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
5716 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5717 ; SI-NEXT: v_mov_b32_e32 v0, s3
5718 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5719 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5720 ; SI-NEXT: v_mov_b32_e32 v0, s2
5721 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5722 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5723 ; SI-NEXT: v_mov_b32_e32 v0, s1
5724 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5725 ; SI-NEXT: s_waitcnt vmcnt(0)
5728 ; VI-LABEL: array_3xi32:
5730 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5731 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5732 ; VI-NEXT: v_mov_b32_e32 v0, s0
5733 ; VI-NEXT: v_mov_b32_e32 v1, s3
5734 ; VI-NEXT: v_mov_b32_e32 v2, s2
5735 ; VI-NEXT: flat_store_short v[0:1], v0
5736 ; VI-NEXT: s_waitcnt vmcnt(0)
5737 ; VI-NEXT: flat_store_dword v[0:1], v1
5738 ; VI-NEXT: s_waitcnt vmcnt(0)
5739 ; VI-NEXT: flat_store_dword v[0:1], v2
5740 ; VI-NEXT: s_waitcnt vmcnt(0)
5741 ; VI-NEXT: v_mov_b32_e32 v0, s1
5742 ; VI-NEXT: flat_store_dword v[0:1], v0
5743 ; VI-NEXT: s_waitcnt vmcnt(0)
5746 ; GFX9-LABEL: array_3xi32:
5748 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
5749 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5750 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5751 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5752 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5753 ; GFX9-NEXT: global_store_short v[0:1], v0, off
5754 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5755 ; GFX9-NEXT: global_store_dword v[0:1], v1, off
5756 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5757 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
5758 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5759 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
5760 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
5761 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5762 ; GFX9-NEXT: s_endpgm
5764 ; EG-LABEL: array_3xi32:
5766 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
5768 ; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
5769 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X
5770 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
5771 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
5772 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
5774 ; EG-NEXT: Fetch clause starting at 8:
5775 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
5776 ; EG-NEXT: ALU clause starting at 10:
5777 ; EG-NEXT: MOV * T0.X, 0.0,
5778 ; EG-NEXT: ALU clause starting at 11:
5779 ; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
5780 ; EG-NEXT: MOV * T0.W, literal.x,
5781 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5782 ; EG-NEXT: MOV T0.Y, 0.0,
5783 ; EG-NEXT: MOV * T0.Z, 0.0,
5784 ; EG-NEXT: MOV T1.X, KC0[2].Z,
5785 ; EG-NEXT: MOV * T2.X, KC0[2].W,
5786 ; EG-NEXT: MOV T3.X, KC0[3].X,
5787 ; EG-NEXT: MOV * T4.X, literal.x,
5788 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5790 ; CM-LABEL: array_3xi32:
5792 ; CM-NEXT: ALU 0, @10, KC0[], KC1[]
5794 ; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
5795 ; CM-NEXT: MEM_RAT MSKOR T0.XW, T4.X
5796 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
5797 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
5798 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5800 ; CM-NEXT: Fetch clause starting at 8:
5801 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
5802 ; CM-NEXT: ALU clause starting at 10:
5803 ; CM-NEXT: MOV * T0.X, 0.0,
5804 ; CM-NEXT: ALU clause starting at 11:
5805 ; CM-NEXT: AND_INT T0.X, T0.X, literal.x,
5806 ; CM-NEXT: MOV * T0.W, literal.x,
5807 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5808 ; CM-NEXT: MOV T0.Y, 0.0,
5809 ; CM-NEXT: MOV * T0.Z, 0.0,
5810 ; CM-NEXT: MOV * T1.X, KC0[2].Z,
5811 ; CM-NEXT: MOV * T2.X, KC0[2].W,
5812 ; CM-NEXT: MOV * T3.X, KC0[3].X,
5813 ; CM-NEXT: MOV * T4.X, literal.x,
5814 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; Volatile stores of both arguments to an undef global pointer.
5815 store volatile i16 %arg0, ptr addrspace(1) undef
5816 store volatile [3 x i32] %arg1, ptr addrspace(1) undef
5820 ; FIXME: Why not all scalar loads?
; Kernel with an i8 argument followed by a [3 x i16] array argument; both are
; written out with volatile stores. In the GFX9 checks the i8 comes from a
; scalar dword load at 0x0, while the three i16 elements are read with
; global_load_ushort at offsets 2, 4 and 6.
5821 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
5822 ; SI-LABEL: array_3xi16:
5824 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
5825 ; SI-NEXT: s_mov_b32 s3, 0xf000
5826 ; SI-NEXT: s_mov_b32 s2, -1
5827 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42
5828 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:40
5829 ; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:38
5830 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5831 ; SI-NEXT: v_mov_b32_e32 v3, s4
5832 ; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0
5833 ; SI-NEXT: s_waitcnt vmcnt(0)
5834 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
5835 ; SI-NEXT: s_waitcnt vmcnt(0)
5836 ; SI-NEXT: buffer_store_short v1, off, s[0:3], 0
5837 ; SI-NEXT: s_waitcnt vmcnt(0)
5838 ; SI-NEXT: buffer_store_short v2, off, s[0:3], 0
5839 ; SI-NEXT: s_waitcnt vmcnt(0)
5842 ; VI-LABEL: array_3xi16:
5844 ; VI-NEXT: s_add_u32 s2, s0, 38
5845 ; VI-NEXT: s_addc_u32 s3, s1, 0
5846 ; VI-NEXT: s_add_u32 s4, s2, 2
5847 ; VI-NEXT: s_addc_u32 s5, s3, 0
5848 ; VI-NEXT: v_mov_b32_e32 v0, s2
5849 ; VI-NEXT: v_mov_b32_e32 v1, s3
5850 ; VI-NEXT: s_add_u32 s2, s0, 42
5851 ; VI-NEXT: s_addc_u32 s3, s1, 0
5852 ; VI-NEXT: v_mov_b32_e32 v2, s2
5853 ; VI-NEXT: v_mov_b32_e32 v3, s3
5854 ; VI-NEXT: flat_load_ushort v4, v[0:1]
5855 ; VI-NEXT: flat_load_ushort v2, v[2:3]
5856 ; VI-NEXT: v_mov_b32_e32 v0, s4
5857 ; VI-NEXT: v_mov_b32_e32 v1, s5
5858 ; VI-NEXT: flat_load_ushort v0, v[0:1]
5859 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
5860 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5861 ; VI-NEXT: v_mov_b32_e32 v1, s0
5862 ; VI-NEXT: s_waitcnt vmcnt(0)
5863 ; VI-NEXT: flat_store_byte v[0:1], v1
5864 ; VI-NEXT: s_waitcnt vmcnt(0)
5865 ; VI-NEXT: flat_store_short v[0:1], v2
5866 ; VI-NEXT: s_waitcnt vmcnt(0)
5867 ; VI-NEXT: flat_store_short v[0:1], v4
5868 ; VI-NEXT: s_waitcnt vmcnt(0)
5869 ; VI-NEXT: flat_store_short v[0:1], v0
5870 ; VI-NEXT: s_waitcnt vmcnt(0)
5873 ; GFX9-LABEL: array_3xi16:
5875 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5876 ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6
5877 ; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4
5878 ; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2
5879 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
5880 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5881 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5882 ; GFX9-NEXT: s_waitcnt vmcnt(2)
5883 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
5884 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5885 ; GFX9-NEXT: global_store_short v[0:1], v1, off
5886 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5887 ; GFX9-NEXT: global_store_short v[0:1], v2, off
5888 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5889 ; GFX9-NEXT: global_store_short v[0:1], v3, off
5890 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5891 ; GFX9-NEXT: s_endpgm
5893 ; EG-LABEL: array_3xi16:
5895 ; EG-NEXT: ALU 0, @20, KC0[], KC1[]
5896 ; EG-NEXT: TEX 1 @12
5897 ; EG-NEXT: ALU 11, @21, KC0[], KC1[]
5898 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
5899 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5900 ; EG-NEXT: TEX 0 @16
5901 ; EG-NEXT: ALU 3, @33, KC0[], KC1[]
5902 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5903 ; EG-NEXT: TEX 0 @18
5904 ; EG-NEXT: ALU 3, @37, KC0[], KC1[]
5905 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5907 ; EG-NEXT: Fetch clause starting at 12:
5908 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
5909 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
5910 ; EG-NEXT: Fetch clause starting at 16:
5911 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
5912 ; EG-NEXT: Fetch clause starting at 18:
5913 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
5914 ; EG-NEXT: ALU clause starting at 20:
5915 ; EG-NEXT: MOV * T0.X, 0.0,
5916 ; EG-NEXT: ALU clause starting at 21:
5917 ; EG-NEXT: AND_INT T1.X, T1.X, literal.x,
5918 ; EG-NEXT: MOV * T1.W, literal.x,
5919 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
5920 ; EG-NEXT: MOV * T1.Y, 0.0,
5921 ; EG-NEXT: AND_INT T2.X, T2.X, literal.x,
5922 ; EG-NEXT: MOV * T2.W, literal.x,
5923 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5924 ; EG-NEXT: MOV T2.Y, 0.0,
5925 ; EG-NEXT: MOV T1.Z, 0.0,
5926 ; EG-NEXT: MOV * T2.Z, 0.0,
5927 ; EG-NEXT: MOV * T3.X, literal.x,
5928 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5929 ; EG-NEXT: ALU clause starting at 33:
5930 ; EG-NEXT: AND_INT T2.X, T1.X, literal.x,
5931 ; EG-NEXT: MOV T2.Y, 0.0,
5932 ; EG-NEXT: MOV * T2.Z, 0.0,
5933 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5934 ; EG-NEXT: ALU clause starting at 37:
5935 ; EG-NEXT: AND_INT T2.X, T0.X, literal.x,
5936 ; EG-NEXT: MOV T2.Y, 0.0,
5937 ; EG-NEXT: MOV * T2.Z, 0.0,
5938 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5940 ; CM-LABEL: array_3xi16:
5942 ; CM-NEXT: ALU 0, @20, KC0[], KC1[]
5943 ; CM-NEXT: TEX 1 @12
5944 ; CM-NEXT: ALU 11, @21, KC0[], KC1[]
5945 ; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X
5946 ; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5947 ; CM-NEXT: TEX 0 @16
5948 ; CM-NEXT: ALU 3, @33, KC0[], KC1[]
5949 ; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5950 ; CM-NEXT: TEX 0 @18
5951 ; CM-NEXT: ALU 3, @37, KC0[], KC1[]
5952 ; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
5954 ; CM-NEXT: Fetch clause starting at 12:
5955 ; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
5956 ; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
5957 ; CM-NEXT: Fetch clause starting at 16:
5958 ; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
5959 ; CM-NEXT: Fetch clause starting at 18:
5960 ; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
5961 ; CM-NEXT: ALU clause starting at 20:
5962 ; CM-NEXT: MOV * T0.X, 0.0,
5963 ; CM-NEXT: ALU clause starting at 21:
5964 ; CM-NEXT: AND_INT T1.X, T1.X, literal.x,
5965 ; CM-NEXT: MOV * T1.W, literal.x,
5966 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
5967 ; CM-NEXT: MOV * T1.Y, 0.0,
5968 ; CM-NEXT: AND_INT T2.X, T2.X, literal.x,
5969 ; CM-NEXT: MOV * T2.W, literal.x,
5970 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5971 ; CM-NEXT: MOV T2.Y, 0.0,
5972 ; CM-NEXT: MOV * T1.Z, 0.0,
5973 ; CM-NEXT: MOV * T2.Z, 0.0,
5974 ; CM-NEXT: MOV * T3.X, literal.x,
5975 ; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
5976 ; CM-NEXT: ALU clause starting at 33:
5977 ; CM-NEXT: AND_INT T2.X, T1.X, literal.x,
5978 ; CM-NEXT: MOV T2.Y, 0.0,
5979 ; CM-NEXT: MOV * T2.Z, 0.0,
5980 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
5981 ; CM-NEXT: ALU clause starting at 37:
5982 ; CM-NEXT: AND_INT T2.X, T0.X, literal.x,
5983 ; CM-NEXT: MOV T2.Y, 0.0,
5984 ; CM-NEXT: MOV * T2.Z, 0.0,
5985 ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; Volatile stores of both arguments to an undef global pointer.
5986 store volatile i8 %arg0, ptr addrspace(1) undef
5987 store volatile [3 x i16] %arg1, ptr addrspace(1) undef
; Kernel with an unnamed i8 argument followed by a [1 x i8] array argument.
; The single array element is extracted and written out with a volatile store.
; The checks expect a one-byte load of that element, at offset 1 in the GFX9
; checks and at 37 in the checks for the other targets.
5991 define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
5992 ; SI-LABEL: small_array_round_down_offset:
5994 ; SI-NEXT: s_mov_b32 s3, 0xf000
5995 ; SI-NEXT: s_mov_b32 s2, -1
5996 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37
5997 ; SI-NEXT: s_waitcnt vmcnt(0)
5998 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
5999 ; SI-NEXT: s_waitcnt vmcnt(0)
6002 ; VI-LABEL: small_array_round_down_offset:
6004 ; VI-NEXT: s_add_u32 s0, s0, 37
6005 ; VI-NEXT: s_addc_u32 s1, s1, 0
6006 ; VI-NEXT: v_mov_b32_e32 v0, s0
6007 ; VI-NEXT: v_mov_b32_e32 v1, s1
6008 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
6009 ; VI-NEXT: s_waitcnt vmcnt(0)
6010 ; VI-NEXT: flat_store_byte v[0:1], v0
6011 ; VI-NEXT: s_waitcnt vmcnt(0)
6014 ; GFX9-LABEL: small_array_round_down_offset:
6016 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6017 ; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1
6018 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6019 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
6020 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6021 ; GFX9-NEXT: s_endpgm
6023 ; EGCM-LABEL: small_array_round_down_offset:
6025 ; EGCM-NEXT: ALU 0, @8, KC0[], KC1[]
6026 ; EGCM-NEXT: TEX 0 @6
6027 ; EGCM-NEXT: ALU 6, @9, KC0[], KC1[]
6028 ; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
6031 ; EGCM-NEXT: Fetch clause starting at 6:
6032 ; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3
6033 ; EGCM-NEXT: ALU clause starting at 8:
6034 ; EGCM-NEXT: MOV * T0.X, 0.0,
6035 ; EGCM-NEXT: ALU clause starting at 9:
6036 ; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x,
6037 ; EGCM-NEXT: MOV * T0.W, literal.x,
6038 ; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
6039 ; EGCM-NEXT: MOV T0.Y, 0.0,
6040 ; EGCM-NEXT: MOV * T0.Z, 0.0,
6041 ; EGCM-NEXT: MOV * T1.X, literal.x,
6042 ; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; Extract the only array element and store it through an undef global pointer.
6043 %val = extractvalue [1 x i8] %arg, 0
6044 store volatile i8 %val, ptr addrspace(1) undef
; Kernel-argument layout test for a byref argument with an explicit, large
; alignment. The arguments are %out, an i32 passed byref in addrspace(4) with
; align(256), and a plain i32 %after.offset. The body loads the i32 through
; %in.byref and volatile-stores it, followed by %after.offset, to %out.
; The three amdgcn check blocks expect the byref value and %after.offset to
; arrive together in a single s_load_dwordx2. Its offset sits 0x100 bytes past
; the offset used for %out in the tonga and gfx900 runs (0x124 vs 0x24, 0x100
; vs 0x0). In the default-CPU run the offsets are 0x49 vs 0x9, presumably
; dword-scaled and so the same 256-byte distance -- confirm. In other words,
; the align(256) padding is honored and %after.offset directly follows the
; 4-byte byref value.
; The r600 check blocks read the byref value through KC0[18].Y and take
; %after.offset from KC0[18].Z, while %out comes from KC0[2].Y.
; The check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
6048 define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
6049 ; SI-LABEL: byref_align_constant_i32_arg:
6051 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49
6052 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6053 ; SI-NEXT: s_mov_b32 s3, 0xf000
6054 ; SI-NEXT: s_mov_b32 s2, -1
6055 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6056 ; SI-NEXT: v_mov_b32_e32 v0, s4
6057 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6058 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6059 ; SI-NEXT: v_mov_b32_e32 v0, s5
6060 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6061 ; SI-NEXT: s_waitcnt vmcnt(0)
6064 ; VI-LABEL: byref_align_constant_i32_arg:
6066 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6067 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124
6068 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6069 ; VI-NEXT: v_mov_b32_e32 v0, s2
6070 ; VI-NEXT: v_mov_b32_e32 v1, s3
6071 ; VI-NEXT: v_mov_b32_e32 v2, s0
6072 ; VI-NEXT: v_mov_b32_e32 v3, s1
6073 ; VI-NEXT: flat_store_dword v[0:1], v2
6074 ; VI-NEXT: s_waitcnt vmcnt(0)
6075 ; VI-NEXT: flat_store_dword v[0:1], v3
6076 ; VI-NEXT: s_waitcnt vmcnt(0)
6079 ; GFX9-LABEL: byref_align_constant_i32_arg:
6081 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
6082 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
6083 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6084 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6085 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
6086 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
6087 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
6088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6089 ; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
6090 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6091 ; GFX9-NEXT: s_endpgm
6093 ; EG-LABEL: byref_align_constant_i32_arg:
6095 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
6097 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
6098 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
6099 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6101 ; EG-NEXT: Fetch clause starting at 6:
6102 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
6103 ; EG-NEXT: ALU clause starting at 8:
6104 ; EG-NEXT: MOV * T0.X, KC0[18].Y,
6105 ; EG-NEXT: ALU clause starting at 9:
6106 ; EG-NEXT: MOV T1.X, KC0[18].Z,
6107 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
6108 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6110 ; CM-LABEL: byref_align_constant_i32_arg:
6112 ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
6114 ; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
6115 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
6116 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6118 ; CM-NEXT: Fetch clause starting at 6:
6119 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
6120 ; CM-NEXT: ALU clause starting at 8:
6121 ; CM-NEXT: MOV * T0.X, KC0[18].Y,
6122 ; CM-NEXT: ALU clause starting at 9:
6123 ; CM-NEXT: MOV * T1.X, KC0[18].Z,
6124 ; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
6125 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Both stores target the same address and are volatile; the check blocks above
; expect two separate stores, emitted in this order.
6126 %in = load i32, ptr addrspace(4) %in.byref
6127 store volatile i32 %in, ptr addrspace(1) %out, align 4
6128 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
6132 define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
6133 ; SI-LABEL: byref_natural_align_constant_v16i32_arg:
6135 ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
6136 ; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
6137 ; SI-NEXT: s_load_dword s0, s[0:1], 0x29
6138 ; SI-NEXT: s_mov_b32 s23, 0xf000
6139 ; SI-NEXT: s_mov_b32 s22, -1
6140 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6141 ; SI-NEXT: v_mov_b32_e32 v0, s16
6142 ; SI-NEXT: v_mov_b32_e32 v1, s17
6143 ; SI-NEXT: v_mov_b32_e32 v2, s18
6144 ; SI-NEXT: v_mov_b32_e32 v3, s19
6145 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48
6146 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6147 ; SI-NEXT: v_mov_b32_e32 v0, s12
6148 ; SI-NEXT: v_mov_b32_e32 v1, s13
6149 ; SI-NEXT: v_mov_b32_e32 v2, s14
6150 ; SI-NEXT: v_mov_b32_e32 v3, s15
6151 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32
6152 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6153 ; SI-NEXT: v_mov_b32_e32 v0, s8
6154 ; SI-NEXT: v_mov_b32_e32 v1, s9
6155 ; SI-NEXT: v_mov_b32_e32 v2, s10
6156 ; SI-NEXT: v_mov_b32_e32 v3, s11
6157 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16
6158 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6159 ; SI-NEXT: v_mov_b32_e32 v0, s4
6160 ; SI-NEXT: v_mov_b32_e32 v1, s5
6161 ; SI-NEXT: v_mov_b32_e32 v2, s6
6162 ; SI-NEXT: v_mov_b32_e32 v3, s7
6163 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
6164 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6165 ; SI-NEXT: v_mov_b32_e32 v0, s0
6166 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
6167 ; SI-NEXT: s_waitcnt vmcnt(0)
6170 ; VI-LABEL: byref_natural_align_constant_v16i32_arg:
6172 ; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
6173 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6174 ; VI-NEXT: s_load_dword s20, s[0:1], 0xa4
6175 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6176 ; VI-NEXT: v_mov_b32_e32 v0, s16
6177 ; VI-NEXT: s_add_u32 s0, s2, 48
6178 ; VI-NEXT: s_addc_u32 s1, s3, 0
6179 ; VI-NEXT: v_mov_b32_e32 v5, s1
6180 ; VI-NEXT: v_mov_b32_e32 v4, s0
6181 ; VI-NEXT: s_add_u32 s0, s2, 32
6182 ; VI-NEXT: v_mov_b32_e32 v1, s17
6183 ; VI-NEXT: v_mov_b32_e32 v2, s18
6184 ; VI-NEXT: v_mov_b32_e32 v3, s19
6185 ; VI-NEXT: s_addc_u32 s1, s3, 0
6186 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6187 ; VI-NEXT: s_waitcnt vmcnt(0)
6188 ; VI-NEXT: v_mov_b32_e32 v5, s1
6189 ; VI-NEXT: v_mov_b32_e32 v4, s0
6190 ; VI-NEXT: s_add_u32 s0, s2, 16
6191 ; VI-NEXT: v_mov_b32_e32 v0, s12
6192 ; VI-NEXT: v_mov_b32_e32 v1, s13
6193 ; VI-NEXT: v_mov_b32_e32 v2, s14
6194 ; VI-NEXT: v_mov_b32_e32 v3, s15
6195 ; VI-NEXT: s_addc_u32 s1, s3, 0
6196 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6197 ; VI-NEXT: s_waitcnt vmcnt(0)
6198 ; VI-NEXT: v_mov_b32_e32 v5, s1
6199 ; VI-NEXT: v_mov_b32_e32 v0, s8
6200 ; VI-NEXT: v_mov_b32_e32 v1, s9
6201 ; VI-NEXT: v_mov_b32_e32 v2, s10
6202 ; VI-NEXT: v_mov_b32_e32 v3, s11
6203 ; VI-NEXT: v_mov_b32_e32 v4, s0
6204 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6205 ; VI-NEXT: s_waitcnt vmcnt(0)
6206 ; VI-NEXT: v_mov_b32_e32 v5, s3
6207 ; VI-NEXT: v_mov_b32_e32 v0, s4
6208 ; VI-NEXT: v_mov_b32_e32 v1, s5
6209 ; VI-NEXT: v_mov_b32_e32 v2, s6
6210 ; VI-NEXT: v_mov_b32_e32 v3, s7
6211 ; VI-NEXT: v_mov_b32_e32 v4, s2
6212 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
6213 ; VI-NEXT: s_waitcnt vmcnt(0)
6214 ; VI-NEXT: v_mov_b32_e32 v0, s20
6215 ; VI-NEXT: flat_store_dword v[4:5], v0
6216 ; VI-NEXT: s_waitcnt vmcnt(0)
6219 ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
6221 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
6222 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6223 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80
6224 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
6225 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6226 ; GFX9-NEXT: v_mov_b32_e32 v0, s20
6227 ; GFX9-NEXT: v_mov_b32_e32 v1, s21
6228 ; GFX9-NEXT: v_mov_b32_e32 v2, s22
6229 ; GFX9-NEXT: v_mov_b32_e32 v3, s23
6230 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
6231 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6232 ; GFX9-NEXT: v_mov_b32_e32 v0, s16
6233 ; GFX9-NEXT: v_mov_b32_e32 v1, s17
6234 ; GFX9-NEXT: v_mov_b32_e32 v2, s18
6235 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
6236 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
6237 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6238 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
6239 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
6240 ; GFX9-NEXT: v_mov_b32_e32 v2, s14
6241 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
6242 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
6243 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6244 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
6245 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
6246 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
6247 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
6248 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
6249 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6250 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6251 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
6252 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6253 ; GFX9-NEXT: s_endpgm
6255 ; EG-LABEL: byref_natural_align_constant_v16i32_arg:
6257 ; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
6258 ; EG-NEXT: TEX 0 @16
6259 ; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
6260 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
6261 ; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
6262 ; EG-NEXT: TEX 0 @18
6263 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6264 ; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
6265 ; EG-NEXT: TEX 0 @20
6266 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6267 ; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
6268 ; EG-NEXT: TEX 0 @22
6269 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
6270 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6273 ; EG-NEXT: Fetch clause starting at 16:
6274 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
6275 ; EG-NEXT: Fetch clause starting at 18:
6276 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
6277 ; EG-NEXT: Fetch clause starting at 20:
6278 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
6279 ; EG-NEXT: Fetch clause starting at 22:
6280 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
6281 ; EG-NEXT: ALU clause starting at 24:
6282 ; EG-NEXT: MOV * T0.X, KC0[6].Y,
6283 ; EG-NEXT: ALU clause starting at 25:
6284 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6285 ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
6286 ; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
6287 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6288 ; EG-NEXT: ALU clause starting at 29:
6289 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6290 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
6291 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
6292 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6293 ; EG-NEXT: ALU clause starting at 33:
6294 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6295 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
6296 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
6297 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6298 ; EG-NEXT: ALU clause starting at 37:
6299 ; EG-NEXT: MOV T1.X, KC0[10].Y,
6300 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
6301 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6303 ; CM-LABEL: byref_natural_align_constant_v16i32_arg:
6305 ; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
6306 ; CM-NEXT: TEX 0 @16
6307 ; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
6308 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
6309 ; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
6310 ; CM-NEXT: TEX 0 @18
6311 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6312 ; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
6313 ; CM-NEXT: TEX 0 @20
6314 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6315 ; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
6316 ; CM-NEXT: TEX 0 @22
6317 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
6318 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6321 ; CM-NEXT: Fetch clause starting at 16:
6322 ; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
6323 ; CM-NEXT: Fetch clause starting at 18:
6324 ; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
6325 ; CM-NEXT: Fetch clause starting at 20:
6326 ; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
6327 ; CM-NEXT: Fetch clause starting at 22:
6328 ; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
6329 ; CM-NEXT: ALU clause starting at 24:
6330 ; CM-NEXT: MOV * T0.X, KC0[6].Y,
6331 ; CM-NEXT: ALU clause starting at 25:
6332 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6333 ; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
6334 ; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
6335 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6336 ; CM-NEXT: ALU clause starting at 29:
6337 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6338 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
6339 ; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
6340 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6341 ; CM-NEXT: ALU clause starting at 33:
6342 ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
6343 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
6344 ; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
6345 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6346 ; CM-NEXT: ALU clause starting at 37:
6347 ; CM-NEXT: MOV * T1.X, KC0[10].Y,
6348 ; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
6349 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
6350 %in = load <16 x i32>, ptr addrspace(4) %in.byref
6351 store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
6352 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4