1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
5 ; SI-LABEL: {{^}}local_unaligned_load_store_i16:
11 define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
12 %v = load i16, i16 addrspace(3)* %p, align 1
13 store i16 %v, i16 addrspace(3)* %r, align 1
17 ; SI-LABEL: {{^}}global_unaligned_load_store_i16:
18 ; ALIGNED: buffer_load_ubyte
19 ; ALIGNED: buffer_load_ubyte
20 ; ALIGNED: buffer_store_byte
21 ; ALIGNED: buffer_store_byte
23 ; UNALIGNED: buffer_load_ushort
24 ; UNALIGNED: buffer_store_short
26 define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
27 %v = load i16, i16 addrspace(1)* %p, align 1
28 store i16 %v, i16 addrspace(1)* %r, align 1
32 ; SI-LABEL: {{^}}local_unaligned_load_store_i32:
45 define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
46 %v = load i32, i32 addrspace(3)* %p, align 1
47 store i32 %v, i32 addrspace(3)* %r, align 1
51 ; SI-LABEL: {{^}}global_unaligned_load_store_i32:
52 ; ALIGNED: buffer_load_ubyte
53 ; ALIGNED: buffer_load_ubyte
54 ; ALIGNED: buffer_load_ubyte
55 ; ALIGNED: buffer_load_ubyte
56 ; ALIGNED: buffer_store_byte
57 ; ALIGNED: buffer_store_byte
58 ; ALIGNED: buffer_store_byte
59 ; ALIGNED: buffer_store_byte
61 ; UNALIGNED: buffer_load_dword
62 ; UNALIGNED: buffer_store_dword
63 define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
64 %v = load i32, i32 addrspace(1)* %p, align 1
65 store i32 %v, i32 addrspace(1)* %r, align 1
69 ; SI-LABEL: {{^}}global_align2_load_store_i32:
70 ; ALIGNED: buffer_load_ushort
71 ; ALIGNED: buffer_load_ushort
72 ; ALIGNED: buffer_store_short
73 ; ALIGNED: buffer_store_short
75 ; UNALIGNED: buffer_load_dword
76 ; UNALIGNED: buffer_store_dword
77 define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
78 %v = load i32, i32 addrspace(1)* %p, align 2
79 store i32 %v, i32 addrspace(1)* %r, align 2
83 ; SI-LABEL: {{^}}local_align2_load_store_i32:
88 define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
89 %v = load i32, i32 addrspace(3)* %p, align 2
90 store i32 %v, i32 addrspace(3)* %r, align 2
94 ; SI-LABEL: {{^}}local_unaligned_load_store_i64:
135 define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
136 %v = load i64, i64 addrspace(3)* %p, align 1
137 store i64 %v, i64 addrspace(3)* %r, align 1
141 ; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
182 define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
183 %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
184 store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
188 ; SI-LABEL: {{^}}global_align2_load_store_i64:
189 ; ALIGNED: buffer_load_ushort
190 ; ALIGNED: buffer_load_ushort
193 ; ALIGNED-NOT: v_lshl
195 ; ALIGNED: buffer_load_ushort
198 ; ALIGNED-NOT: v_lshl
200 ; ALIGNED: buffer_load_ushort
203 ; ALIGNED-NOT: v_lshl
205 ; ALIGNED: buffer_store_short
206 ; ALIGNED: buffer_store_short
207 ; ALIGNED: buffer_store_short
208 ; ALIGNED: buffer_store_short
210 ; UNALIGNED: buffer_load_dwordx2
211 ; UNALIGNED: buffer_store_dwordx2
212 define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
213 %v = load i64, i64 addrspace(1)* %p, align 2
214 store i64 %v, i64 addrspace(1)* %r, align 2
218 ; SI-LABEL: {{^}}unaligned_load_store_i64_global:
219 ; ALIGNED: buffer_load_ubyte
220 ; ALIGNED: buffer_load_ubyte
221 ; ALIGNED: buffer_load_ubyte
222 ; ALIGNED: buffer_load_ubyte
223 ; ALIGNED: buffer_load_ubyte
224 ; ALIGNED: buffer_load_ubyte
225 ; ALIGNED: buffer_load_ubyte
226 ; ALIGNED: buffer_load_ubyte
229 ; ALIGNED-NOT: v_lshl
231 ; ALIGNED: buffer_store_byte
232 ; ALIGNED: buffer_store_byte
233 ; ALIGNED: buffer_store_byte
234 ; ALIGNED: buffer_store_byte
235 ; ALIGNED: buffer_store_byte
236 ; ALIGNED: buffer_store_byte
237 ; ALIGNED: buffer_store_byte
238 ; ALIGNED: buffer_store_byte
240 ; UNALIGNED: buffer_load_dwordx2
241 ; UNALIGNED: buffer_store_dwordx2
242 define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
243 %v = load i64, i64 addrspace(1)* %p, align 1
244 store i64 %v, i64 addrspace(1)* %r, align 1
248 ; SI-LABEL: {{^}}local_unaligned_load_store_v4i32:
289 define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
290 %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
291 store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
295 ; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
296 ; ALIGNED: buffer_load_ubyte
297 ; ALIGNED: buffer_load_ubyte
298 ; ALIGNED: buffer_load_ubyte
299 ; ALIGNED: buffer_load_ubyte
300 ; ALIGNED: buffer_load_ubyte
301 ; ALIGNED: buffer_load_ubyte
302 ; ALIGNED: buffer_load_ubyte
303 ; ALIGNED: buffer_load_ubyte
304 ; ALIGNED: buffer_load_ubyte
305 ; ALIGNED: buffer_load_ubyte
306 ; ALIGNED: buffer_load_ubyte
307 ; ALIGNED: buffer_load_ubyte
308 ; ALIGNED: buffer_load_ubyte
309 ; ALIGNED: buffer_load_ubyte
310 ; ALIGNED: buffer_load_ubyte
311 ; ALIGNED: buffer_load_ubyte
313 ; ALIGNED: buffer_store_byte
314 ; ALIGNED: buffer_store_byte
315 ; ALIGNED: buffer_store_byte
316 ; ALIGNED: buffer_store_byte
317 ; ALIGNED: buffer_store_byte
318 ; ALIGNED: buffer_store_byte
319 ; ALIGNED: buffer_store_byte
320 ; ALIGNED: buffer_store_byte
321 ; ALIGNED: buffer_store_byte
322 ; ALIGNED: buffer_store_byte
323 ; ALIGNED: buffer_store_byte
324 ; ALIGNED: buffer_store_byte
325 ; ALIGNED: buffer_store_byte
326 ; ALIGNED: buffer_store_byte
327 ; ALIGNED: buffer_store_byte
328 ; ALIGNED: buffer_store_byte
330 ; UNALIGNED: buffer_load_dwordx4
331 ; UNALIGNED: buffer_store_dwordx4
332 define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
333 %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
334 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
338 ; SI-LABEL: {{^}}local_load_i64_align_4:
340 define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
341 %val = load i64, i64 addrspace(3)* %in, align 4
342 store i64 %val, i64 addrspace(1)* %out, align 8
346 ; SI-LABEL: {{^}}local_load_i64_align_4_with_offset:
347 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
348 define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
349 %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
350 %val = load i64, i64 addrspace(3)* %ptr, align 4
351 store i64 %val, i64 addrspace(1)* %out, align 8
355 ; SI-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
356 ; Tests the case where the low offset fits in 8 bits but the high offset needs 9 bits.
357 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
359 define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
360 %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
361 %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
362 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
363 %val = load i64, i64 addrspace(3)* %ptri64, align 4
364 store i64 %val, i64 addrspace(1)* %out, align 8
368 ; FUNC-LABEL: {{^}}local_load_i64_align_1:
378 define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
379 %val = load i64, i64 addrspace(3)* %in, align 1
380 store i64 %val, i64 addrspace(1)* %out, align 8
384 ; SI-LABEL: {{^}}local_store_i64_align_4:
386 define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
387 store i64 %val, i64 addrspace(3)* %out, align 4
391 ; SI-LABEL: {{^}}local_store_i64_align_4_with_offset:
392 ; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
394 define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
395 %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
396 store i64 0, i64 addrspace(3)* %ptr, align 4
400 ; SI-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
401 ; Tests the case where the low offset fits in 8 bits but the high offset needs 9 bits.
402 ; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
404 define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
405 %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
406 %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
407 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
408 store i64 0, i64 addrspace(3)* %out, align 4
412 ; SI-LABEL: {{^}}constant_unaligned_load_i32:
413 ; ALIGNED: buffer_load_ubyte
414 ; ALIGNED: buffer_load_ubyte
415 ; ALIGNED: buffer_load_ubyte
416 ; ALIGNED: buffer_load_ubyte
418 ; UNALIGNED: s_load_dword
420 ; SI: buffer_store_dword
421 define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
422 %v = load i32, i32 addrspace(4)* %p, align 1
423 store i32 %v, i32 addrspace(1)* %r, align 4
427 ; SI-LABEL: {{^}}constant_align2_load_i32:
428 ; ALIGNED: buffer_load_ushort
429 ; ALIGNED: buffer_load_ushort
431 ; UNALIGNED: s_load_dword
432 ; UNALIGNED: buffer_store_dword
433 define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
434 %v = load i32, i32 addrspace(4)* %p, align 2
435 store i32 %v, i32 addrspace(1)* %r, align 4
439 ; SI-LABEL: {{^}}constant_align2_load_i64:
440 ; ALIGNED: buffer_load_ushort
441 ; ALIGNED: buffer_load_ushort
442 ; ALIGNED: buffer_load_ushort
443 ; ALIGNED: buffer_load_ushort
445 ; UNALIGNED: s_load_dwordx4
446 ; UNALIGNED: buffer_store_dwordx2
447 define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
448 %v = load i64, i64 addrspace(4)* %p, align 2
449 store i64 %v, i64 addrspace(1)* %r, align 4
453 ; SI-LABEL: {{^}}constant_align4_load_i64:
455 ; SI: buffer_store_dwordx2
456 define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
457 %v = load i64, i64 addrspace(4)* %p, align 4
458 store i64 %v, i64 addrspace(1)* %r, align 4
462 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
464 ; SI: buffer_store_dwordx4
465 define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
466 %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4
467 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
471 ; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
472 ; ALIGNED: buffer_load_ubyte
473 ; ALIGNED: buffer_load_ubyte
474 ; ALIGNED: buffer_load_ubyte
475 ; ALIGNED: buffer_load_ubyte
477 ; ALIGNED: buffer_load_ubyte
478 ; ALIGNED: buffer_load_ubyte
479 ; ALIGNED: buffer_load_ubyte
480 ; ALIGNED: buffer_load_ubyte
482 ; UNALIGNED: buffer_load_dwordx2
484 ; SI: buffer_store_dwordx2
485 define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 {
486 %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1
487 store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
491 ; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
492 ; ALIGNED: buffer_load_ubyte
493 ; ALIGNED: buffer_load_ubyte
494 ; ALIGNED: buffer_load_ubyte
495 ; ALIGNED: buffer_load_ubyte
497 ; ALIGNED: buffer_load_ubyte
498 ; ALIGNED: buffer_load_ubyte
499 ; ALIGNED: buffer_load_ubyte
500 ; ALIGNED: buffer_load_ubyte
502 ; ALIGNED: buffer_load_ubyte
503 ; ALIGNED: buffer_load_ubyte
504 ; ALIGNED: buffer_load_ubyte
505 ; ALIGNED: buffer_load_ubyte
507 ; ALIGNED: buffer_load_ubyte
508 ; ALIGNED: buffer_load_ubyte
509 ; ALIGNED: buffer_load_ubyte
510 ; ALIGNED: buffer_load_ubyte
512 ; UNALIGNED: buffer_load_dwordx4
514 ; SI: buffer_store_dwordx4
515 define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
516 %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1
517 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
521 ; SI-LABEL: {{^}}constant_align4_load_i8:
523 ; SI: buffer_store_byte
524 define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
525 %v = load i8, i8 addrspace(4)* %p, align 4
526 store i8 %v, i8 addrspace(1)* %r, align 4
530 ; SI-LABEL: {{^}}constant_align2_load_i8:
531 ; SI: buffer_load_ubyte
532 ; SI: buffer_store_byte
533 define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
534 %v = load i8, i8 addrspace(4)* %p, align 2
535 store i8 %v, i8 addrspace(1)* %r, align 2
539 ; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
540 ; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
541 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
542 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
543 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
544 define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
545 %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1
546 %v0 = load i32, i32 addrspace(4)* %p, align 4
547 %v1 = load i32, i32 addrspace(4)* %gep0, align 4
549 %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
550 store i32 %v0, i32 addrspace(1)* %r, align 4
551 store i32 %v1, i32 addrspace(1)* %gep1, align 4
555 ; SI-LABEL: {{^}}local_load_align1_v16i8:
573 ; SI: ScratchSize: 0{{$}}
574 define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
575 %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
576 store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
580 ; SI-LABEL: {{^}}local_store_align1_v16i8:
598 ; SI: ScratchSize: 0{{$}}
599 define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
600 store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
604 ; SI-LABEL: {{^}}private_load_align1_f64:
605 ; SI: buffer_load_ubyte
606 ; SI: buffer_load_ubyte
607 ; SI: buffer_load_ubyte
608 ; SI: buffer_load_ubyte
609 ; SI: buffer_load_ubyte
610 ; SI: buffer_load_ubyte
611 ; SI: buffer_load_ubyte
612 ; SI: buffer_load_ubyte
613 define double @private_load_align1_f64(double addrspace(5)* %in) {
614 %x = load double, double addrspace(5)* %in, align 1
618 ; SI-LABEL: {{^}}private_store_align1_f64:
619 ; SI: buffer_store_byte
620 ; SI: buffer_store_byte
621 ; SI: buffer_store_byte
622 ; SI: buffer_store_byte
623 ; SI: buffer_store_byte
624 ; SI: buffer_store_byte
625 ; SI: buffer_store_byte
626 ; SI: buffer_store_byte
627 define void @private_store_align1_f64(double addrspace(5)* %out, double %x) #0 {
628 store double %x, double addrspace(5)* %out, align 1
632 ; SI-LABEL: {{^}}private_load_align4_f64:
633 ; SI: buffer_load_dword
634 ; SI: buffer_load_dword
635 define double @private_load_align4_f64(double addrspace(5)* %in) {
636 %x = load double, double addrspace(5)* %in, align 4
640 ; SI-LABEL: {{^}}private_store_align4_f64:
641 ; SI: buffer_store_dword
642 ; SI: buffer_store_dword
643 define void @private_store_align4_f64(double addrspace(5)* %out, double %x) #0 {
644 store double %x, double addrspace(5)* %out, align 4
648 ; SI-LABEL: {{^}}private_load_align2_f64:
649 ; SI: buffer_load_ushort
650 ; SI: buffer_load_ushort
651 ; SI: buffer_load_ushort
652 ; SI: buffer_load_ushort
653 define double @private_load_align2_f64(double addrspace(5)* %in) {
654 %x = load double, double addrspace(5)* %in, align 2
658 ; SI-LABEL: {{^}}private_store_align2_f64:
659 ; SI: buffer_store_short
660 ; SI: buffer_store_short
661 ; SI: buffer_store_short
662 ; SI: buffer_store_short
663 define void @private_store_align2_f64(double addrspace(5)* %out, double %x) #0 {
664 store double %x, double addrspace(5)* %out, align 2
668 attributes #0 = { nounwind }