1 ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
; Test: store the constant `i1 true` through an addrspace(5) pointer.
; The EG and CM checks expect a MOV that reads T(0 + AR.x).X+ followed by a
; MOV that writes T(0 + AR.x).X+; the SI-prefixed check expects a byte store.
6 ; FUNC-LABEL: {{^}}store_i1:
8 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
10 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
13 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
15 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
17 ; SI: buffer_store_byte
18 define amdgpu_kernel void @store_i1(ptr addrspace(5) %out) {
20 store i1 true, ptr addrspace(5) %out
; Test: store an i8 kernel argument through an addrspace(5) pointer.
; The EG checks pin this sequence: an LSHR producing ADDRESS, a MOV reading the
; old value via AR.x (OLD), an AND_INT giving the byte index, an LSHL by the
; literal 3, an LSHL of the literal 255, an AND_INT on OLD, an OR_INT producing
; RES, and finally a MOV that writes RES back through AR.x.
; The SI-prefixed check expects a byte store.
25 ; FUNC-LABEL: {{^}}store_i8:
26 ; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
28 ; EG: MOVA_INT * AR.x (MASKED)
29 ; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x
31 ; IG 0: Get the byte index and truncate the value
32 ; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
33 ; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
34 ; EG-NEXT: 3(4.203895e-45)
37 ; EG: LSHL * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], literal.x, PV.W
38 ; EG-NEXT: 255(3.573311e-43)
41 ; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
42 ; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
43 ; TODO: Is the reload necessary?
44 ; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
45 ; EG: MOV * T(0 + AR.x).X+, [[RES]]
47 ; SI: buffer_store_byte
49 define amdgpu_kernel void @store_i8(ptr addrspace(5) %out, i8 %in) {
51 store i8 %in, ptr addrspace(5) %out
; Test: store an i16 kernel argument through an addrspace(5) pointer.
; The EG checks pin this sequence: an LSHR producing ADDRESS, a MOV reading the
; old value via AR.x (OLD), an AND_INT giving the byte index, an LSHL by the
; literal 3, an AND_INT on OLD, an OR_INT producing RES, and finally a MOV that
; writes RES back through AR.x. The SI-prefixed check expects a short store.
56 ; FUNC-LABEL: {{^}}store_i16:
57 ; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
59 ; EG: MOVA_INT * AR.x (MASKED)
60 ; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x
64 ; IG 0: Get the byte index and truncate the value
65 ; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
66 ; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
67 ; EG-NEXT: 3(4.203895e-45)
70 ; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
71 ; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
72 ; TODO: Is the reload necessary?
73 ; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
74 ; EG: MOV * T(0 + AR.x).X+, [[RES]]
76 ; SI: buffer_store_short
77 define amdgpu_kernel void @store_i16(ptr addrspace(5) %out, i16 %in) {
79 store i16 %in, ptr addrspace(5) %out
; Test: store an i24 kernel argument through an addrspace(5) pointer.
; The SI-prefixed checks expect a scalar right shift by 16 followed by one byte
; store and one short store in either order (DAG). The EG and CM checks expect
; two read/write MOV pairs on T(0 + AR.x).X+.
83 ; FUNC-LABEL: {{^}}store_i24:
84 ; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
85 ; SI-DAG: buffer_store_byte
86 ; SI-DAG: buffer_store_short
89 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
91 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
92 ; TODO: This load and store can be eliminated
94 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
96 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
99 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
101 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
102 ; TODO: This load and store can be eliminated
104 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
106 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
107 define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
109 store i24 %in, ptr addrspace(5) %out
; Test: store an i25 kernel argument through an addrspace(5) pointer.
; The SI-prefixed checks expect the value to be masked with 0x1ffffff (the low
; 25 bits), copied into a v register, and written with one dword store.
; The EG and CM checks expect a MOV writing T(0 + AR.x).X+.
113 ; FUNC-LABEL: {{^}}store_i25:
114 ; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
115 ; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
116 ; SI: buffer_store_dword [[VAND]]
119 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
123 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
125 define amdgpu_kernel void @store_i25(ptr addrspace(5) %out, i25 %in) {
127 store i25 %in, ptr addrspace(5) %out
; Test: truncate <2 x i32> to <2 x i8> and store it through an addrspace(5)
; pointer with no explicit alignment. The SI-prefixed check expects a short
; store; the EG and CM checks expect one read/write MOV pair.
131 ; FUNC-LABEL: {{^}}store_v2i8:
132 ; v2i8 is naturally 2B aligned, treat as i16
134 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
136 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
140 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
142 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
145 ; SI: buffer_store_short
146 define amdgpu_kernel void @store_v2i8(ptr addrspace(5) %out, <2 x i32> %in) {
148 %0 = trunc <2 x i32> %in to <2 x i8>
149 store <2 x i8> %0, ptr addrspace(5) %out
; Test: truncate <2 x i32> to <2 x i8> and store it through an addrspace(5)
; pointer with `align 1`. The SI-prefixed check expects a byte store; the EG
; and CM checks expect two read/write MOV pairs on T(0 + AR.x).X+.
153 ; FUNC-LABEL: {{^}}store_v2i8_unaligned:
155 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
157 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
158 ; TODO: This load and store cannot be eliminated,
159 ; they might be different locations
161 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
163 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
166 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
168 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
169 ; TODO: This load and store cannot be eliminated,
170 ; they might be different locations
172 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
174 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
176 ; SI: buffer_store_byte
177 define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
179 %0 = trunc <2 x i32> %in to <2 x i8>
180 store <2 x i8> %0, ptr addrspace(5) %out, align 1
; Test: truncate <2 x i32> to <2 x i16> and store it through an addrspace(5)
; pointer with no explicit alignment. The SI-prefixed check expects one dword
; store; the EG and CM checks expect a MOV writing T(0 + AR.x).X+.
185 ; FUNC-LABEL: {{^}}store_v2i16:
186 ; v2i16 is naturally 4B aligned, treat as i32
188 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
192 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
195 ; SI: buffer_store_dword
196 define amdgpu_kernel void @store_v2i16(ptr addrspace(5) %out, <2 x i32> %in) {
198 %0 = trunc <2 x i32> %in to <2 x i16>
199 store <2 x i16> %0, ptr addrspace(5) %out
; Test: truncate <2 x i32> to <2 x i16> and store it through an addrspace(5)
; pointer with `align 2`. The SI-prefixed checks expect two short stores; the
; EG and CM checks expect two read/write MOV pairs on T(0 + AR.x).X+.
203 ; FUNC-LABEL: {{^}}store_v2i16_unaligned:
205 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
207 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
208 ; TODO: This load and store cannot be eliminated,
209 ; they might be different locations
211 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
213 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
216 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
218 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
219 ; TODO: This load and store cannot be eliminated,
220 ; they might be different locations
222 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
224 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
226 ; SI: buffer_store_short
227 ; SI: buffer_store_short
228 define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
230 %0 = trunc <2 x i32> %in to <2 x i16>
231 store <2 x i16> %0, ptr addrspace(5) %out, align 2
; Test: truncate <4 x i32> to <4 x i8> and store it through an addrspace(5)
; pointer with no explicit alignment. The SI-prefixed check expects one dword
; store; the EG and CM checks expect a MOV writing T(0 + AR.x).X+.
235 ; FUNC-LABEL: {{^}}store_v4i8:
237 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
241 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
244 ; SI: buffer_store_dword
245 define amdgpu_kernel void @store_v4i8(ptr addrspace(5) %out, <4 x i32> %in) {
247 %0 = trunc <4 x i32> %in to <4 x i8>
248 store <4 x i8> %0, ptr addrspace(5) %out
; Test: truncate <4 x i32> to <4 x i8> and store it through an addrspace(5)
; pointer with `align 1`. The SI-prefixed checks expect four byte stores and no
; dword store after them; the EG and CM checks expect four read/write MOV pairs
; on T(0 + AR.x).X+.
252 ; FUNC-LABEL: {{^}}store_v4i8_unaligned:
254 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
256 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
257 ; TODO: This load and store cannot be eliminated,
258 ; they might be different locations
260 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
262 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
263 ; TODO: This load and store cannot be eliminated,
264 ; they might be different locations
266 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
268 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
269 ; TODO: This load and store cannot be eliminated,
270 ; they might be different locations
272 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
274 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
277 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
279 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
280 ; TODO: This load and store cannot be eliminated,
281 ; they might be different locations
283 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
285 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
286 ; TODO: This load and store cannot be eliminated,
287 ; they might be different locations
289 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
291 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
292 ; TODO: This load and store cannot be eliminated,
293 ; they might be different locations
295 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
297 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
299 ; SI: buffer_store_byte
300 ; SI: buffer_store_byte
301 ; SI: buffer_store_byte
302 ; SI: buffer_store_byte
303 ; SI-NOT: buffer_store_dword
304 define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
306 %0 = trunc <4 x i32> %in to <4 x i8>
307 store <4 x i8> %0, ptr addrspace(5) %out, align 1
; Test: truncate <8 x i32> to <8 x i8> and store it through an addrspace(5)
; pointer with `align 1`. The SI-prefixed checks expect eight byte stores and
; no dword store after them; the EG and CM checks expect eight read/write MOV
; pairs on T(0 + AR.x).X+.
311 ; FUNC-LABEL: {{^}}store_v8i8_unaligned:
313 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
315 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
316 ; TODO: This load and store cannot be eliminated,
317 ; they might be different locations
319 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
321 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
322 ; TODO: This load and store cannot be eliminated,
323 ; they might be different locations
325 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
327 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
328 ; TODO: This load and store cannot be eliminated,
329 ; they might be different locations
331 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
333 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
334 ; TODO: This load and store cannot be eliminated,
335 ; they might be different locations
337 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
339 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
340 ; TODO: This load and store cannot be eliminated,
341 ; they might be different locations
343 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
345 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
346 ; TODO: This load and store cannot be eliminated,
347 ; they might be different locations
349 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
351 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
352 ; TODO: This load and store cannot be eliminated,
353 ; they might be different locations
355 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
357 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
360 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
362 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
363 ; TODO: This load and store cannot be eliminated,
364 ; they might be different locations
366 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
368 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
369 ; TODO: This load and store cannot be eliminated,
370 ; they might be different locations
372 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
374 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
375 ; TODO: This load and store cannot be eliminated,
376 ; they might be different locations
378 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
380 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
381 ; TODO: This load and store cannot be eliminated,
382 ; they might be different locations
384 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
386 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
387 ; TODO: This load and store cannot be eliminated,
388 ; they might be different locations
390 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
392 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
393 ; TODO: This load and store cannot be eliminated,
394 ; they might be different locations
396 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
398 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
399 ; TODO: This load and store cannot be eliminated,
400 ; they might be different locations
402 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
404 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
406 ; SI: buffer_store_byte
407 ; SI: buffer_store_byte
408 ; SI: buffer_store_byte
409 ; SI: buffer_store_byte
410 ; SI: buffer_store_byte
411 ; SI: buffer_store_byte
412 ; SI: buffer_store_byte
413 ; SI: buffer_store_byte
414 ; SI-NOT: buffer_store_dword
415 define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32> %in) {
417 %0 = trunc <8 x i32> %in to <8 x i8>
418 store <8 x i8> %0, ptr addrspace(5) %out, align 1
; Test: truncate <4 x i32> to <4 x i8> and store it through an addrspace(5)
; pointer with `align 2`. The SI-prefixed checks expect two short stores and no
; dword store after them; the EG and CM checks expect two read/write MOV pairs
; on T(0 + AR.x).X+.
422 ; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
424 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
426 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
427 ; TODO: This load and store cannot be eliminated,
428 ; they might be different locations
430 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
432 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
435 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
437 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
438 ; TODO: This load and store cannot be eliminated,
439 ; they might be different locations
441 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
443 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
445 ; SI: buffer_store_short
446 ; SI: buffer_store_short
447 ; SI-NOT: buffer_store_dword
448 define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i32> %in) {
450 %0 = trunc <4 x i32> %in to <4 x i8>
451 store <4 x i8> %0, ptr addrspace(5) %out, align 2
; Test: store a float kernel argument through an addrspace(5) pointer.
; The SI-prefixed check expects one dword store; the EG and CM checks expect a
; MOV writing T(0 + AR.x).X+.
455 ; floating-point store
456 ; FUNC-LABEL: {{^}}store_f32:
458 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
461 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
463 ; SI: buffer_store_dword
465 define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
466 store float %in, ptr addrspace(5) %out
; Test: truncate <4 x i32> to <4 x i16> and store it through an addrspace(5)
; pointer with no explicit alignment. The SI-prefixed checks expect two dword
; stores; the EG and CM checks expect two MOVs writing T(0 + AR.x).X+.
; The XSI-prefixed dwordx2 line is inert because no run line enables that
; prefix.
470 ; FUNC-LABEL: {{^}}store_v4i16:
472 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
474 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
477 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
479 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
482 ; XSI: buffer_store_dwordx2
483 ; SI: buffer_store_dword
484 ; SI: buffer_store_dword
485 define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
487 %0 = trunc <4 x i32> %in to <4 x i16>
488 store <4 x i16> %0, ptr addrspace(5) %out
; Test: build a <2 x float> from the two float kernel arguments with
; insertelement and store it through an addrspace(5) pointer. The SI-prefixed
; checks expect two dword stores; the EG and CM checks expect two MOVs writing
; T(0 + AR.x).X+. The XSI-prefixed dwordx2 line is inert because no run line
; enables that prefix.
492 ; vec2 floating-point stores
493 ; FUNC-LABEL: {{^}}store_v2f32:
495 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
497 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
500 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
502 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
505 ; XSI: buffer_store_dwordx2
506 ; SI: buffer_store_dword
507 ; SI: buffer_store_dword
509 define amdgpu_kernel void @store_v2f32(ptr addrspace(5) %out, float %a, float %b) {
511 %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
512 %1 = insertelement <2 x float> %0, float %b, i32 1
513 store <2 x float> %1, ptr addrspace(5) %out
; Test: store a <3 x i32> kernel argument through an addrspace(5) pointer with
; `align 16`. The SI-prefixed checks expect three dword stores; the EG and CM
; checks expect three MOVs writing T(0 + AR.x).X+. The XSI-DAG dwordx2 line is
; inert because no run line enables that prefix.
517 ; FUNC-LABEL: {{^}}store_v3i32:
519 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
521 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
523 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
526 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
528 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
530 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
533 ; XSI-DAG: buffer_store_dwordx2
534 ; SI: buffer_store_dword
535 ; SI: buffer_store_dword
536 ; SI: buffer_store_dword
538 define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) nounwind {
539 store <3 x i32> %a, ptr addrspace(5) %out, align 16
; Test: store a <4 x i32> kernel argument through an addrspace(5) pointer with
; no explicit alignment. The SI-prefixed checks expect four dword stores; the
; EG and CM checks expect four MOVs writing T(0 + AR.x).X+. The XSI-prefixed
; dwordx4 line is inert because no run line enables that prefix.
543 ; FUNC-LABEL: {{^}}store_v4i32:
545 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
547 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
549 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
551 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
554 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
556 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
558 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
560 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
563 ; XSI: buffer_store_dwordx4
564 ; SI: buffer_store_dword
565 ; SI: buffer_store_dword
566 ; SI: buffer_store_dword
567 ; SI: buffer_store_dword
568 define amdgpu_kernel void @store_v4i32(ptr addrspace(5) %out, <4 x i32> %in) {
570 store <4 x i32> %in, ptr addrspace(5) %out
; Test: store a <4 x i32> kernel argument through an addrspace(5) pointer with
; `align 4`. The SI-prefixed checks expect four dword stores; the EG and CM
; checks expect four MOVs writing T(0 + AR.x).X+. The XSI-prefixed dwordx4
; line is inert because no run line enables that prefix.
574 ; FUNC-LABEL: {{^}}store_v4i32_unaligned:
576 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
578 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
580 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
582 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
585 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
587 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
589 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
591 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
594 ; XSI: buffer_store_dwordx4
595 ; SI: buffer_store_dword
596 ; SI: buffer_store_dword
597 ; SI: buffer_store_dword
598 ; SI: buffer_store_dword
599 define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
601 store <4 x i32> %in, ptr addrspace(5) %out, align 4
; Test: load a <4 x float> from the addrspace(5) pointer %in and store it
; through the addrspace(5) pointer %out. The SI-prefixed checks expect four
; dword stores; the EG and CM checks expect four MOVs writing T(0 + AR.x).X+.
; The XSI-prefixed dwordx4 line is inert because no run line enables that
; prefix.
606 ; FUNC-LABEL: {{^}}store_v4f32:
608 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
610 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
612 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
614 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
617 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
619 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
621 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
623 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
626 ; XSI: buffer_store_dwordx4
627 ; SI: buffer_store_dword
628 ; SI: buffer_store_dword
629 ; SI: buffer_store_dword
630 ; SI: buffer_store_dword
631 define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %in) {
632 %1 = load <4 x float>, ptr addrspace(5) %in
633 store <4 x float> %1, ptr addrspace(5) %out
; Test: truncate an i64 kernel argument to i8 and store it through an
; addrspace(5) pointer. The SI-prefixed check expects a byte store; the EG and
; CM checks expect one read/write MOV pair on T(0 + AR.x).X+.
637 ; FUNC-LABEL: {{^}}store_i64_i8:
639 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
641 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
644 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
646 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
648 ; SI: buffer_store_byte
649 define amdgpu_kernel void @store_i64_i8(ptr addrspace(5) %out, i64 %in) {
651 %0 = trunc i64 %in to i8
652 store i8 %0, ptr addrspace(5) %out
; Test: truncate an i64 kernel argument to i16 and store it through an
; addrspace(5) pointer. The SI-prefixed check expects a short store; the EG and
; CM checks expect one read/write MOV pair on T(0 + AR.x).X+.
656 ; FUNC-LABEL: {{^}}store_i64_i16:
658 ; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
660 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
663 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
665 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
667 ; SI: buffer_store_short
668 define amdgpu_kernel void @store_i64_i16(ptr addrspace(5) %out, i64 %in) {
670 %0 = trunc i64 %in to i16
671 store i16 %0, ptr addrspace(5) %out
; Test: load two consecutive i32 values from the addrspace(4) pointer %mem and
; store them to two consecutive i32 slots of the addrspace(5) pointer %out,
; each access with `align 4`.
675 ; The stores in this function are combined by the optimizer to create a
676 ; 64-bit store with 32-bit alignment. This is legal and the legalizer
677 ; should not try to split the 64-bit store back into 2 32-bit stores.
; NOTE(review): the active SI-prefixed checks below expect two separate dword
; stores, and the dwordx2 line uses the XSI prefix, which no run line enables.
; The "should not split" statement above is therefore not what is currently
; verified -- confirm whether that comment is stale for addrspace(5).
679 ; FUNC-LABEL: {{^}}vecload2:
681 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
683 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
686 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
688 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
691 ; XSI: buffer_store_dwordx2
692 ; SI: buffer_store_dword
693 ; SI: buffer_store_dword
694 define amdgpu_kernel void @vecload2(ptr addrspace(5) nocapture %out, ptr addrspace(4) nocapture %mem) #0 {
696 %0 = load i32, ptr addrspace(4) %mem, align 4
697 %arrayidx1.i = getelementptr inbounds i32, ptr addrspace(4) %mem, i64 1
698 %1 = load i32, ptr addrspace(4) %arrayidx1.i, align 4
699 store i32 %0, ptr addrspace(5) %out, align 4
700 %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 1
701 store i32 %1, ptr addrspace(5) %arrayidx1, align 4
; Test: store the i32 constants 1, 1, 2, 2 to four consecutive i32 slots of an
; addrspace(5) pointer, each with `align 4`. The SI-prefixed checks expect four
; dword stores; the EG and CM checks expect four MOVs writing T(0 + AR.x).X+.
; The XSI-prefixed dwordx4 line is inert because no run line enables that
; prefix. The label check matches the function name in double quotes.
705 ; When i128 was a legal type this program generated cannot select errors:
707 ; FUNC-LABEL: {{^}}"i128-const-store":
709 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
711 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
713 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
715 ; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
718 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
720 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
722 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
724 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
727 ; XSI: buffer_store_dwordx4
728 ; SI: buffer_store_dword
729 ; SI: buffer_store_dword
730 ; SI: buffer_store_dword
731 ; SI: buffer_store_dword
732 define amdgpu_kernel void @i128-const-store(ptr addrspace(5) %out) {
734 store i32 1, ptr addrspace(5) %out, align 4
735 %arrayidx2 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 1
736 store i32 1, ptr addrspace(5) %arrayidx2, align 4
737 %arrayidx4 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 2
738 store i32 2, ptr addrspace(5) %arrayidx4, align 4
739 %arrayidx6 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 3
740 store i32 2, ptr addrspace(5) %arrayidx6, align 4
745 attributes #0 = { nounwind }