1 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC
2 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
3 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC
4 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
5 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
7 ; FUNC-LABEL: {{^}}i8_arg:
8 ; HSA-VI: kernarg_segment_alignment = 4
9 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
10 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
11 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
12 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
13 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
14 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
15 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
16 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
17 ; FIXME: Should be using s_load_dword
18 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
20 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
22 %0 = zext i8 %in to i32
23 store i32 %0, i32 addrspace(1)* %out, align 4
27 ; FUNC-LABEL: {{^}}i8_zext_arg:
28 ; HSA-VI: kernarg_segment_alignment = 4
29 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
30 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
31 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
32 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
33 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
34 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
35 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
36 ; FIXME: Should be using s_load_dword
37 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
39 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
41 %0 = zext i8 %in to i32
42 store i32 %0, i32 addrspace(1)* %out, align 4
46 ; FUNC-LABEL: {{^}}i8_sext_arg:
47 ; HSA-VI: kernarg_segment_alignment = 4
48 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
49 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
50 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
51 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
52 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
53 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
54 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
55 ; FIXME: Should be using s_load_dword
56 ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
58 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
60 %0 = sext i8 %in to i32
61 store i32 %0, i32 addrspace(1)* %out, align 4
65 ; FUNC-LABEL: {{^}}i16_arg:
66 ; HSA-VI: kernarg_segment_alignment = 4
67 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
68 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
69 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
70 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
71 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
72 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
73 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
74 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
75 ; FIXME: Should be using s_load_dword
76 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
78 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
80 %0 = zext i16 %in to i32
81 store i32 %0, i32 addrspace(1)* %out, align 4
85 ; FUNC-LABEL: {{^}}i16_zext_arg:
86 ; HSA-VI: kernarg_segment_alignment = 4
87 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
88 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
89 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
90 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
91 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
92 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
93 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
94 ; FIXME: Should be using s_load_dword
95 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
97 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
99 %0 = zext i16 %in to i32
100 store i32 %0, i32 addrspace(1)* %out, align 4
104 ; FUNC-LABEL: {{^}}i16_sext_arg:
105 ; HSA-VI: kernarg_segment_alignment = 4
106 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
107 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
108 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
109 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
110 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
111 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
112 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
113 ; FIXME: Should be using s_load_dword
114 ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
116 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
118 %0 = sext i16 %in to i32
119 store i32 %0, i32 addrspace(1)* %out, align 4
123 ; FUNC-LABEL: {{^}}i32_arg:
124 ; HSA-VI: kernarg_segment_alignment = 4
125 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
126 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
127 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
128 ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
129 define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
131 store i32 %in, i32 addrspace(1)* %out, align 4
135 ; FUNC-LABEL: {{^}}f32_arg:
136 ; HSA-VI: kernarg_segment_alignment = 4
137 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
138 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
139 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
140 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
141 define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
143 store float %in, float addrspace(1)* %out, align 4
147 ; FUNC-LABEL: {{^}}v2i8_arg:
148 ; HSA-VI: kernarg_segment_alignment = 4
151 ; MESA-GCN: buffer_load_ubyte
152 ; MESA-GCN: buffer_load_ubyte
153 ; HSA-VI: flat_load_ubyte
154 ; HSA-VI: flat_load_ubyte
155 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
157 store <2 x i8> %in, <2 x i8> addrspace(1)* %out
161 ; FUNC-LABEL: {{^}}v2i16_arg:
162 ; HSA-VI: kernarg_segment_alignment = 4
165 ; MESA-GCN: buffer_load_ushort
166 ; MESA-GCN: buffer_load_ushort
167 ; HSA-VI: flat_load_ushort
168 ; HSA-VI: flat_load_ushort
169 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
171 store <2 x i16> %in, <2 x i16> addrspace(1)* %out
175 ; FUNC-LABEL: {{^}}v2i32_arg:
176 ; HSA-VI: kernarg_segment_alignment = 4
177 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
178 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
179 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
180 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
181 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
182 define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
184 store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
188 ; FUNC-LABEL: {{^}}v2f32_arg:
189 ; HSA-VI: kernarg_segment_alignment = 4
190 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
191 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
192 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
193 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
194 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
195 define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
197 store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
201 ; FUNC-LABEL: {{^}}v3i8_arg:
202 ; HSA-VI: kernarg_segment_alignment = 4
203 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
204 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
205 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
206 ; MESA-GCN: buffer_load_ubyte
207 ; MESA-GCN: buffer_load_ubyte
208 ; MESA-GCN: buffer_load_ubyte
209 ; HSA-VI: flat_load_ubyte
210 ; HSA-VI: flat_load_ubyte
211 ; HSA-VI: flat_load_ubyte
212 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
214 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
218 ; FUNC-LABEL: {{^}}v3i16_arg:
219 ; HSA-VI: kernarg_segment_alignment = 4
220 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
221 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
222 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
223 ; MESA-GCN: buffer_load_ushort
224 ; MESA-GCN: buffer_load_ushort
225 ; MESA-GCN: buffer_load_ushort
226 ; HSA-VI: flat_load_ushort
227 ; HSA-VI: flat_load_ushort
228 ; HSA-VI: flat_load_ushort
229 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
231 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
234 ; FUNC-LABEL: {{^}}v3i32_arg:
235 ; HSA-VI: kernarg_segment_alignment = 4
236 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
237 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
238 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
239 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
240 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
241 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
242 define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
244 store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
248 ; FUNC-LABEL: {{^}}v3f32_arg:
249 ; HSA-VI: kernarg_segment_alignment = 4
250 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
251 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
252 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
253 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
254 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
255 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
256 define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
258 store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
262 ; FUNC-LABEL: {{^}}v4i8_arg:
263 ; HSA-VI: kernarg_segment_alignment = 4
268 ; MESA-GCN: buffer_load_ubyte
269 ; MESA-GCN: buffer_load_ubyte
270 ; MESA-GCN: buffer_load_ubyte
271 ; MESA-GCN: buffer_load_ubyte
272 ; HSA-VI: flat_load_ubyte
273 ; HSA-VI: flat_load_ubyte
274 ; HSA-VI: flat_load_ubyte
275 ; HSA-VI: flat_load_ubyte
276 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
278 store <4 x i8> %in, <4 x i8> addrspace(1)* %out
282 ; FUNC-LABEL: {{^}}v4i16_arg:
283 ; HSA-VI: kernarg_segment_alignment = 4
288 ; MESA-GCN: buffer_load_ushort
289 ; MESA-GCN: buffer_load_ushort
290 ; MESA-GCN: buffer_load_ushort
291 ; MESA-GCN: buffer_load_ushort
292 ; HSA-GCN: flat_load_ushort
293 ; HSA-GCN: flat_load_ushort
294 ; HSA-GCN: flat_load_ushort
295 ; HSA-GCN: flat_load_ushort
296 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
298 store <4 x i16> %in, <4 x i16> addrspace(1)* %out
302 ; FUNC-LABEL: {{^}}v4i32_arg:
303 ; HSA-VI: kernarg_segment_alignment = 4
304 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
305 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
306 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
307 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
308 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
309 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
310 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
311 define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
313 store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
317 ; FUNC-LABEL: {{^}}v4f32_arg:
318 ; HSA-VI: kernarg_segment_alignment = 4
319 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
320 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
321 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
322 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
323 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
324 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
325 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
326 define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
328 store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
332 ; FUNC-LABEL: {{^}}v8i8_arg:
333 ; HSA-VI: kernarg_segment_alignment = 4
342 ; MESA-GCN: buffer_load_ubyte
343 ; MESA-GCN: buffer_load_ubyte
344 ; MESA-GCN: buffer_load_ubyte
345 ; MESA-GCN: buffer_load_ubyte
346 ; MESA-GCN: buffer_load_ubyte
347 ; MESA-GCN: buffer_load_ubyte
348 ; MESA-GCN: buffer_load_ubyte
349 ; HSA-GCN: float_load_ubyte
350 ; HSA-GCN: float_load_ubyte
351 ; HSA-GCN: float_load_ubyte
352 ; HSA-GCN: float_load_ubyte
353 ; HSA-GCN: float_load_ubyte
354 ; HSA-GCN: float_load_ubyte
355 ; HSA-GCN: float_load_ubyte
356 ; HSA-GCN: float_load_ubyte
357 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
359 store <8 x i8> %in, <8 x i8> addrspace(1)* %out
363 ; FUNC-LABEL: {{^}}v8i16_arg:
364 ; HSA-VI: kernarg_segment_alignment = 4
373 ; MESA-GCN: buffer_load_ushort
374 ; MESA-GCN: buffer_load_ushort
375 ; MESA-GCN: buffer_load_ushort
376 ; MESA-GCN: buffer_load_ushort
377 ; MESA-GCN: buffer_load_ushort
378 ; MESA-GCN: buffer_load_ushort
379 ; MESA-GCN: buffer_load_ushort
380 ; MESA-GCN: buffer_load_ushort
381 ; HSA-VI: flat_load_ushort
382 ; HSA-VI: flat_load_ushort
383 ; HSA-VI: flat_load_ushort
384 ; HSA-VI: flat_load_ushort
385 ; HSA-VI: flat_load_ushort
386 ; HSA-VI: flat_load_ushort
387 ; HSA-VI: flat_load_ushort
388 ; HSA-VI: flat_load_ushort
389 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
391 store <8 x i16> %in, <8 x i16> addrspace(1)* %out
395 ; FUNC-LABEL: {{^}}v8i32_arg:
396 ; HSA-VI: kernarg_segment_alignment = 5
397 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
398 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
399 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
400 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
401 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
402 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
403 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
404 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
405 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
406 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
407 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
408 define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
410 store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
414 ; FUNC-LABEL: {{^}}v8f32_arg:
415 ; HSA-VI: kernarg_segment_alignment = 5
416 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
417 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
418 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
419 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
420 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
421 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
422 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
423 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
424 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
425 define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
427 store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
431 ; FUNC-LABEL: {{^}}v16i8_arg:
432 ; HSA-VI: kernarg_segment_alignment = 4
449 ; MESA-GCN: buffer_load_ubyte
450 ; MESA-GCN: buffer_load_ubyte
451 ; MESA-GCN: buffer_load_ubyte
452 ; MESA-GCN: buffer_load_ubyte
453 ; MESA-GCN: buffer_load_ubyte
454 ; MESA-GCN: buffer_load_ubyte
455 ; MESA-GCN: buffer_load_ubyte
456 ; MESA-GCN: buffer_load_ubyte
457 ; MESA-GCN: buffer_load_ubyte
458 ; MESA-GCN: buffer_load_ubyte
459 ; MESA-GCN: buffer_load_ubyte
460 ; MESA-GCN: buffer_load_ubyte
461 ; MESA-GCN: buffer_load_ubyte
462 ; MESA-GCN: buffer_load_ubyte
463 ; MESA-GCN: buffer_load_ubyte
464 ; MESA-GCN: buffer_load_ubyte
465 ; HSA-VI: flat_load_ubyte
466 ; HSA-VI: flat_load_ubyte
467 ; HSA-VI: flat_load_ubyte
468 ; HSA-VI: flat_load_ubyte
469 ; HSA-VI: flat_load_ubyte
470 ; HSA-VI: flat_load_ubyte
471 ; HSA-VI: flat_load_ubyte
472 ; HSA-VI: flat_load_ubyte
473 ; HSA-VI: flat_load_ubyte
474 ; HSA-VI: flat_load_ubyte
475 ; HSA-VI: flat_load_ubyte
476 ; HSA-VI: flat_load_ubyte
477 ; HSA-VI: flat_load_ubyte
478 ; HSA-VI: flat_load_ubyte
479 ; HSA-VI: flat_load_ubyte
480 ; HSA-VI: flat_load_ubyte
481 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
483 store <16 x i8> %in, <16 x i8> addrspace(1)* %out
487 ; FUNC-LABEL: {{^}}v16i16_arg:
488 ; HSA-VI: kernarg_segment_alignment = 5
505 ; MESA-GCN: buffer_load_ushort
506 ; MESA-GCN: buffer_load_ushort
507 ; MESA-GCN: buffer_load_ushort
508 ; MESA-GCN: buffer_load_ushort
509 ; MESA-GCN: buffer_load_ushort
510 ; MESA-GCN: buffer_load_ushort
511 ; MESA-GCN: buffer_load_ushort
512 ; MESA-GCN: buffer_load_ushort
513 ; MESA-GCN: buffer_load_ushort
514 ; MESA-GCN: buffer_load_ushort
515 ; MESA-GCN: buffer_load_ushort
516 ; MESA-GCN: buffer_load_ushort
517 ; MESA-GCN: buffer_load_ushort
518 ; MESA-GCN: buffer_load_ushort
519 ; MESA-GCN: buffer_load_ushort
520 ; MESA-GCN: buffer_load_ushort
521 ; HSA-VI: flat_load_ushort
522 ; HSA-VI: flat_load_ushort
523 ; HSA-VI: flat_load_ushort
524 ; HSA-VI: flat_load_ushort
525 ; HSA-VI: flat_load_ushort
526 ; HSA-VI: flat_load_ushort
527 ; HSA-VI: flat_load_ushort
528 ; HSA-VI: flat_load_ushort
529 ; HSA-VI: flat_load_ushort
530 ; HSA-VI: flat_load_ushort
531 ; HSA-VI: flat_load_ushort
532 ; HSA-VI: flat_load_ushort
533 ; HSA-VI: flat_load_ushort
534 ; HSA-VI: flat_load_ushort
535 ; HSA-VI: flat_load_ushort
536 ; HSA-VI: flat_load_ushort
537 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
539 store <16 x i16> %in, <16 x i16> addrspace(1)* %out
543 ; FUNC-LABEL: {{^}}v16i32_arg:
544 ; HSA-VI: kernarg_segment_alignment = 6
545 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
546 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
547 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
548 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
549 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
550 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
551 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
552 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
553 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
554 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
555 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
556 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
557 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
558 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
559 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
560 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
561 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
562 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
563 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
564 define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
566 store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
570 ; FUNC-LABEL: {{^}}v16f32_arg:
571 ; HSA-VI: kernarg_segment_alignment = 6
572 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
573 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
574 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
575 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
576 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
577 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
578 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
579 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
580 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
581 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
582 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
583 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
584 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
585 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
586 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
587 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
588 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
589 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
590 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
591 define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
593 store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
597 ; FUNC-LABEL: {{^}}kernel_arg_i64:
598 ; MESA-GCN: s_load_dwordx2
599 ; MESA-GCN: s_load_dwordx2
600 ; MESA-GCN: buffer_store_dwordx2
601 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
602 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
603 store i64 %a, i64 addrspace(1)* %out, align 8
607 ; FUNC-LABEL: {{^}}f64_kernel_arg:
608 ; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
609 ; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
610 ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
611 ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
612 ; MESA-GCN: buffer_store_dwordx2
613 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
614 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
616 store double %in, double addrspace(1)* %out
620 ; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
621 ; XGCN: s_load_dwordx2
622 ; XGCN: s_load_dwordx2
623 ; XGCN: buffer_store_dwordx2
624 ; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
625 ; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
629 ; FUNC-LABEL: {{^}}i1_arg:
630 ; SI: buffer_load_ubyte
632 ; SI: buffer_store_byte
634 define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
635 store i1 %x, i1 addrspace(1)* %out, align 1
639 ; FUNC-LABEL: {{^}}i1_arg_zext_i32:
640 ; SI: buffer_load_ubyte
641 ; SI: buffer_store_dword
643 define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
644 %ext = zext i1 %x to i32
645 store i32 %ext, i32 addrspace(1)* %out, align 4
649 ; FUNC-LABEL: {{^}}i1_arg_zext_i64:
650 ; SI: buffer_load_ubyte
651 ; SI: buffer_store_dwordx2
653 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
654 %ext = zext i1 %x to i64
655 store i64 %ext, i64 addrspace(1)* %out, align 8
659 ; FUNC-LABEL: {{^}}i1_arg_sext_i32:
660 ; SI: buffer_load_ubyte
661 ; SI: buffer_store_dword
663 define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
664 %ext = sext i1 %x to i32
665 store i32 %ext, i32addrspace(1)* %out, align 4
669 ; FUNC-LABEL: {{^}}i1_arg_sext_i64:
670 ; SI: buffer_load_ubyte
673 ; SI: buffer_store_dwordx2
675 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
676 %ext = sext i1 %x to i64
677 store i64 %ext, i64 addrspace(1)* %out, align 8