1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,MUBUF,ALIGNED %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,MUBUF,UNALIGNED %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,MUBUF,ALIGNED %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FLATSCR,ALIGNED %s
6 ; SI-LABEL: {{^}}local_unaligned_load_store_i16:
12 define amdgpu_kernel void @local_unaligned_load_store_i16(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
13 %v = load i16, ptr addrspace(3) %p, align 1
14 store i16 %v, ptr addrspace(3) %r, align 1
18 ; SI-LABEL: {{^}}global_unaligned_load_store_i16:
19 ; ALIGNED: buffer_load_ubyte
20 ; ALIGNED: buffer_load_ubyte
21 ; ALIGNED: buffer_store_byte
22 ; ALIGNED: buffer_store_byte
24 ; UNALIGNED: buffer_load_ushort
25 ; UNALIGNED: buffer_store_short
27 define amdgpu_kernel void @global_unaligned_load_store_i16(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
28 %v = load i16, ptr addrspace(1) %p, align 1
29 store i16 %v, ptr addrspace(1) %r, align 1
33 ; SI-LABEL: {{^}}local_unaligned_load_store_i32:
46 define amdgpu_kernel void @local_unaligned_load_store_i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
47 %v = load i32, ptr addrspace(3) %p, align 1
48 store i32 %v, ptr addrspace(3) %r, align 1
52 ; SI-LABEL: {{^}}global_unaligned_load_store_i32:
53 ; ALIGNED: buffer_load_ubyte
54 ; ALIGNED: buffer_load_ubyte
55 ; ALIGNED: buffer_load_ubyte
56 ; ALIGNED: buffer_load_ubyte
57 ; ALIGNED: buffer_store_byte
58 ; ALIGNED: buffer_store_byte
59 ; ALIGNED: buffer_store_byte
60 ; ALIGNED: buffer_store_byte
62 ; UNALIGNED: buffer_load_dword
63 ; UNALIGNED: buffer_store_dword
64 define amdgpu_kernel void @global_unaligned_load_store_i32(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
65 %v = load i32, ptr addrspace(1) %p, align 1
66 store i32 %v, ptr addrspace(1) %r, align 1
70 ; SI-LABEL: {{^}}global_align2_load_store_i32:
71 ; ALIGNED: buffer_load_ushort
72 ; ALIGNED: buffer_load_ushort
73 ; ALIGNED: buffer_store_short
74 ; ALIGNED: buffer_store_short
76 ; UNALIGNED: buffer_load_dword
77 ; UNALIGNED: buffer_store_dword
78 define amdgpu_kernel void @global_align2_load_store_i32(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
79 %v = load i32, ptr addrspace(1) %p, align 2
80 store i32 %v, ptr addrspace(1) %r, align 2
84 ; GCN-LABEL: {{^}}local_align2_load_store_i32:
89 define amdgpu_kernel void @local_align2_load_store_i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
90 %v = load i32, ptr addrspace(3) %p, align 2
91 store i32 %v, ptr addrspace(3) %r, align 2
95 ; SI-LABEL: {{^}}local_unaligned_load_store_i64:
136 define amdgpu_kernel void @local_unaligned_load_store_i64(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
137 %v = load i64, ptr addrspace(3) %p, align 1
138 store i64 %v, ptr addrspace(3) %r, align 1
142 ; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
183 define amdgpu_kernel void @local_unaligned_load_store_v2i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
184 %v = load <2 x i32>, ptr addrspace(3) %p, align 1
185 store <2 x i32> %v, ptr addrspace(3) %r, align 1
189 ; SI-LABEL: {{^}}global_align2_load_store_i64:
190 ; ALIGNED: buffer_load_ushort
191 ; ALIGNED: buffer_load_ushort
194 ; ALIGNED-NOT: v_lshl
196 ; ALIGNED: buffer_load_ushort
199 ; ALIGNED-NOT: v_lshl
201 ; ALIGNED: buffer_load_ushort
204 ; ALIGNED-NOT: v_lshl
206 ; ALIGNED: buffer_store_short
207 ; ALIGNED: buffer_store_short
208 ; ALIGNED: buffer_store_short
209 ; ALIGNED: buffer_store_short
211 ; UNALIGNED: buffer_load_dwordx2
212 ; UNALIGNED: buffer_store_dwordx2
213 define amdgpu_kernel void @global_align2_load_store_i64(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
214 %v = load i64, ptr addrspace(1) %p, align 2
215 store i64 %v, ptr addrspace(1) %r, align 2
219 ; SI-LABEL: {{^}}unaligned_load_store_i64_global:
220 ; ALIGNED: buffer_load_ubyte
221 ; ALIGNED: buffer_load_ubyte
222 ; ALIGNED: buffer_load_ubyte
223 ; ALIGNED: buffer_load_ubyte
224 ; ALIGNED: buffer_load_ubyte
225 ; ALIGNED: buffer_load_ubyte
226 ; ALIGNED: buffer_load_ubyte
227 ; ALIGNED: buffer_load_ubyte
230 ; ALIGNED-NOT: v_lshl
232 ; ALIGNED: buffer_store_byte
233 ; ALIGNED: buffer_store_byte
234 ; ALIGNED: buffer_store_byte
235 ; ALIGNED: buffer_store_byte
236 ; ALIGNED: buffer_store_byte
237 ; ALIGNED: buffer_store_byte
238 ; ALIGNED: buffer_store_byte
239 ; ALIGNED: buffer_store_byte
241 ; UNALIGNED: buffer_load_dwordx2
242 ; UNALIGNED: buffer_store_dwordx2
243 define amdgpu_kernel void @unaligned_load_store_i64_global(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
244 %v = load i64, ptr addrspace(1) %p, align 1
245 store i64 %v, ptr addrspace(1) %r, align 1
249 ; GCN-LABEL: {{^}}local_unaligned_load_store_v4i32:
290 define amdgpu_kernel void @local_unaligned_load_store_v4i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
291 %v = load <4 x i32>, ptr addrspace(3) %p, align 1
292 store <4 x i32> %v, ptr addrspace(3) %r, align 1
296 ; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
297 ; ALIGNED: buffer_load_ubyte
298 ; ALIGNED: buffer_load_ubyte
299 ; ALIGNED: buffer_load_ubyte
300 ; ALIGNED: buffer_load_ubyte
301 ; ALIGNED: buffer_load_ubyte
302 ; ALIGNED: buffer_load_ubyte
303 ; ALIGNED: buffer_load_ubyte
304 ; ALIGNED: buffer_load_ubyte
305 ; ALIGNED: buffer_load_ubyte
306 ; ALIGNED: buffer_load_ubyte
307 ; ALIGNED: buffer_load_ubyte
308 ; ALIGNED: buffer_load_ubyte
309 ; ALIGNED: buffer_load_ubyte
310 ; ALIGNED: buffer_load_ubyte
311 ; ALIGNED: buffer_load_ubyte
312 ; ALIGNED: buffer_load_ubyte
314 ; ALIGNED: buffer_store_byte
315 ; ALIGNED: buffer_store_byte
316 ; ALIGNED: buffer_store_byte
317 ; ALIGNED: buffer_store_byte
318 ; ALIGNED: buffer_store_byte
319 ; ALIGNED: buffer_store_byte
320 ; ALIGNED: buffer_store_byte
321 ; ALIGNED: buffer_store_byte
322 ; ALIGNED: buffer_store_byte
323 ; ALIGNED: buffer_store_byte
324 ; ALIGNED: buffer_store_byte
325 ; ALIGNED: buffer_store_byte
326 ; ALIGNED: buffer_store_byte
327 ; ALIGNED: buffer_store_byte
328 ; ALIGNED: buffer_store_byte
329 ; ALIGNED: buffer_store_byte
331 ; UNALIGNED: buffer_load_dwordx4
332 ; UNALIGNED: buffer_store_dwordx4
333 define amdgpu_kernel void @global_unaligned_load_store_v4i32(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
334 %v = load <4 x i32>, ptr addrspace(1) %p, align 1
335 store <4 x i32> %v, ptr addrspace(1) %r, align 1
339 ; GCN-LABEL: {{^}}local_load_i64_align_4:
341 define amdgpu_kernel void @local_load_i64_align_4(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
342 %val = load i64, ptr addrspace(3) %in, align 4
343 store i64 %val, ptr addrspace(1) %out, align 8
347 ; GCN-LABEL: {{^}}local_load_i64_align_4_with_offset:
348 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
349 define amdgpu_kernel void @local_load_i64_align_4_with_offset(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
350 %ptr = getelementptr i64, ptr addrspace(3) %in, i32 4
351 %val = load i64, ptr addrspace(3) %ptr, align 4
352 store i64 %val, ptr addrspace(1) %out, align 8
356 ; GCN-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
357 ; Tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
358 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
360 define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
361 %ptr255 = getelementptr i32, ptr addrspace(3) %in, i32 255
362 %val = load i64, ptr addrspace(3) %ptr255, align 4
363 store i64 %val, ptr addrspace(1) %out, align 8
367 ; GCN-LABEL: {{^}}local_load_i64_align_1:
377 define amdgpu_kernel void @local_load_i64_align_1(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
378 %val = load i64, ptr addrspace(3) %in, align 1
379 store i64 %val, ptr addrspace(1) %out, align 8
383 ; GCN-LABEL: {{^}}local_store_i64_align_4:
385 define amdgpu_kernel void @local_store_i64_align_4(ptr addrspace(3) %out, i64 %val) #0 {
386 store i64 %val, ptr addrspace(3) %out, align 4
390 ; GCN-LABEL: {{^}}local_store_i64_align_4_with_offset:
391 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
393 define amdgpu_kernel void @local_store_i64_align_4_with_offset(ptr addrspace(3) %out) #0 {
394 %ptr = getelementptr i64, ptr addrspace(3) %out, i32 4
395 store i64 0, ptr addrspace(3) %ptr, align 4
399 ; GCN-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
400 ; Tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
401 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
403 define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(ptr addrspace(3) %out) #0 {
404 %ptr255 = getelementptr i32, ptr addrspace(3) %out, i32 255
405 store i64 0, ptr addrspace(3) %out, align 4
409 ; SI-LABEL: {{^}}constant_unaligned_load_i32:
410 ; ALIGNED: buffer_load_ubyte
411 ; ALIGNED: buffer_load_ubyte
412 ; ALIGNED: buffer_load_ubyte
413 ; ALIGNED: buffer_load_ubyte
415 ; UNALIGNED: s_load_dword
417 ; SI: buffer_store_dword
418 define amdgpu_kernel void @constant_unaligned_load_i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
419 %v = load i32, ptr addrspace(4) %p, align 1
420 store i32 %v, ptr addrspace(1) %r, align 4
424 ; SI-LABEL: {{^}}constant_align2_load_i32:
425 ; ALIGNED: buffer_load_ushort
426 ; ALIGNED: buffer_load_ushort
428 ; UNALIGNED: s_load_dword
429 ; UNALIGNED: buffer_store_dword
430 define amdgpu_kernel void @constant_align2_load_i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
431 %v = load i32, ptr addrspace(4) %p, align 2
432 store i32 %v, ptr addrspace(1) %r, align 4
436 ; SI-LABEL: {{^}}constant_align2_load_i64:
437 ; ALIGNED: buffer_load_ushort
438 ; ALIGNED: buffer_load_ushort
439 ; ALIGNED: buffer_load_ushort
440 ; ALIGNED: buffer_load_ushort
442 ; UNALIGNED: s_load_dwordx4
443 ; UNALIGNED: buffer_store_dwordx2
444 define amdgpu_kernel void @constant_align2_load_i64(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
445 %v = load i64, ptr addrspace(4) %p, align 2
446 store i64 %v, ptr addrspace(1) %r, align 4
450 ; SI-LABEL: {{^}}constant_align4_load_i64:
452 ; SI: buffer_store_dwordx2
453 define amdgpu_kernel void @constant_align4_load_i64(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
454 %v = load i64, ptr addrspace(4) %p, align 4
455 store i64 %v, ptr addrspace(1) %r, align 4
459 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
461 ; SI: buffer_store_dwordx4
462 define amdgpu_kernel void @constant_align4_load_v4i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
463 %v = load <4 x i32>, ptr addrspace(4) %p, align 4
464 store <4 x i32> %v, ptr addrspace(1) %r, align 4
468 ; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
469 ; ALIGNED: buffer_load_ubyte
470 ; ALIGNED: buffer_load_ubyte
471 ; ALIGNED: buffer_load_ubyte
472 ; ALIGNED: buffer_load_ubyte
474 ; ALIGNED: buffer_load_ubyte
475 ; ALIGNED: buffer_load_ubyte
476 ; ALIGNED: buffer_load_ubyte
477 ; ALIGNED: buffer_load_ubyte
479 ; UNALIGNED: buffer_load_dwordx2
481 ; SI: buffer_store_dwordx2
482 define amdgpu_kernel void @constant_unaligned_load_v2i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
483 %v = load <2 x i32>, ptr addrspace(4) %p, align 1
484 store <2 x i32> %v, ptr addrspace(1) %r, align 4
488 ; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
489 ; ALIGNED: buffer_load_ubyte
490 ; ALIGNED: buffer_load_ubyte
491 ; ALIGNED: buffer_load_ubyte
492 ; ALIGNED: buffer_load_ubyte
494 ; ALIGNED: buffer_load_ubyte
495 ; ALIGNED: buffer_load_ubyte
496 ; ALIGNED: buffer_load_ubyte
497 ; ALIGNED: buffer_load_ubyte
499 ; ALIGNED: buffer_load_ubyte
500 ; ALIGNED: buffer_load_ubyte
501 ; ALIGNED: buffer_load_ubyte
502 ; ALIGNED: buffer_load_ubyte
504 ; ALIGNED: buffer_load_ubyte
505 ; ALIGNED: buffer_load_ubyte
506 ; ALIGNED: buffer_load_ubyte
507 ; ALIGNED: buffer_load_ubyte
509 ; UNALIGNED: buffer_load_dwordx4
511 ; SI: buffer_store_dwordx4
512 define amdgpu_kernel void @constant_unaligned_load_v4i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
513 %v = load <4 x i32>, ptr addrspace(4) %p, align 1
514 store <4 x i32> %v, ptr addrspace(1) %r, align 4
518 ; SI-LABEL: {{^}}constant_align4_load_i8:
520 ; SI: buffer_store_byte
521 define amdgpu_kernel void @constant_align4_load_i8(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
522 %v = load i8, ptr addrspace(4) %p, align 4
523 store i8 %v, ptr addrspace(1) %r, align 4
527 ; SI-LABEL: {{^}}constant_align2_load_i8:
528 ; SI: buffer_load_ubyte
529 ; SI: buffer_store_byte
530 define amdgpu_kernel void @constant_align2_load_i8(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
531 %v = load i8, ptr addrspace(4) %p, align 2
532 store i8 %v, ptr addrspace(1) %r, align 2
536 ; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
537 ; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
538 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
539 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
540 ; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
541 define amdgpu_kernel void @constant_align4_merge_load_2_i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
542 %gep0 = getelementptr i32, ptr addrspace(4) %p, i64 1
543 %v0 = load i32, ptr addrspace(4) %p, align 4
544 %v1 = load i32, ptr addrspace(4) %gep0, align 4
546 %gep1 = getelementptr i32, ptr addrspace(1) %r, i64 1
547 store i32 %v0, ptr addrspace(1) %r, align 4
548 store i32 %v1, ptr addrspace(1) %gep1, align 4
552 ; SI-LABEL: {{^}}local_load_align1_v16i8:
570 ; SI: ScratchSize: 0{{$}}
571 define amdgpu_kernel void @local_load_align1_v16i8(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
572 %ld = load <16 x i8>, ptr addrspace(3) %in, align 1
573 store <16 x i8> %ld, ptr addrspace(1) %out
577 ; SI-LABEL: {{^}}local_store_align1_v16i8:
595 ; SI: ScratchSize: 0{{$}}
596 define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 {
597 store <16 x i8> zeroinitializer, ptr addrspace(3) %out, align 1
601 ; SI-LABEL: {{^}}private_load_align1_f64:
602 ; MUBUF: buffer_load_ubyte
603 ; MUBUF: buffer_load_ubyte
604 ; MUBUF: buffer_load_ubyte
605 ; MUBUF: buffer_load_ubyte
606 ; MUBUF: buffer_load_ubyte
607 ; MUBUF: buffer_load_ubyte
608 ; MUBUF: buffer_load_ubyte
609 ; MUBUF: buffer_load_ubyte
610 ; FLATSCR: scratch_load_dwordx2
611 define double @private_load_align1_f64(ptr addrspace(5) %in) {
612 %x = load double, ptr addrspace(5) %in, align 1
616 ; SI-LABEL: {{^}}private_store_align1_f64:
617 ; MUBUF: buffer_store_byte
618 ; MUBUF: buffer_store_byte
619 ; MUBUF: buffer_store_byte
620 ; MUBUF: buffer_store_byte
621 ; MUBUF: buffer_store_byte
622 ; MUBUF: buffer_store_byte
623 ; MUBUF: buffer_store_byte
624 ; MUBUF: buffer_store_byte
625 ; FLATSCR: scratch_store_dwordx2
626 define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 {
627 store double %x, ptr addrspace(5) %out, align 1
631 ; SI-LABEL: {{^}}private_load_align4_f64:
632 ; MUBUF: buffer_load_dword
633 ; MUBUF: buffer_load_dword
634 ; FLATSCR: scratch_load_dwordx2
635 define double @private_load_align4_f64(ptr addrspace(5) %in) {
636 %x = load double, ptr addrspace(5) %in, align 4
640 ; SI-LABEL: {{^}}private_store_align4_f64:
641 ; MUBUF: buffer_store_dword
642 ; MUBUF: buffer_store_dword
643 ; FLATSCR: scratch_store_dwordx2
644 define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 {
645 store double %x, ptr addrspace(5) %out, align 4
649 ; SI-LABEL: {{^}}private_load_align2_f64:
650 ; MUBUF: buffer_load_ushort
651 ; MUBUF: buffer_load_ushort
652 ; MUBUF: buffer_load_ushort
653 ; MUBUF: buffer_load_ushort
654 ; FLATSCR: scratch_load_dwordx2
655 define double @private_load_align2_f64(ptr addrspace(5) %in) {
656 %x = load double, ptr addrspace(5) %in, align 2
660 ; SI-LABEL: {{^}}private_store_align2_f64:
661 ; MUBUF: buffer_store_short
662 ; MUBUF: buffer_store_short
663 ; MUBUF: buffer_store_short
664 ; MUBUF: buffer_store_short
665 ; FLATSCR: scratch_store_dwordx2
666 define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 {
667 store double %x, ptr addrspace(5) %out, align 2
671 ; Should not merge this to a dword store
672 define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
673 %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
674 %v = load i16, ptr addrspace(1) %p, align 2
675 store i16 1, ptr addrspace(1) %r, align 2
676 store i16 2, ptr addrspace(1) %gep.r, align 2
680 ; Should not merge this to a word load
681 define i32 @load_2xi16_align2(ptr addrspace(1) %p) #0 {
682 %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
683 %p.0 = load i16, ptr addrspace(1) %p, align 2
684 %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
685 %zext.0 = zext i16 %p.0 to i32
686 %zext.1 = zext i16 %p.1 to i32
687 %shl.1 = shl i32 %zext.1, 16
688 %or = or i32 %zext.0, %shl.1
692 attributes #0 = { nounwind }