; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}vector_clause:
; GCN: global_load_dwordx4
; GCN-NEXT: global_load_dwordx4
; GCN-NEXT: global_load_dwordx4
; GCN-NEXT: global_load_dwordx4
; Each workitem copies four consecutive <4 x i32> elements (indices tid..tid+3)
; from %arg to %arg1; the four VMEM loads should be clustered into one clause.
; Restored here: entry-block label, `ret void` terminator and closing brace,
; which were dropped from this chunk (function was syntactically incomplete).
define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = zext i32 %tmp to i64
  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
  %tmp6 = add nuw nsw i64 %tmp2, 1
  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
  %tmp10 = add nuw nsw i64 %tmp2, 2
  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
  %tmp14 = add nuw nsw i64 %tmp2, 3
  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
  ret void
}
; GCN-LABEL: {{^}}scalar_clause:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4
; GCN-NEXT: s_load_dwordx4
; GCN-NEXT: s_load_dwordx4
; GCN-NEXT: s_load_dwordx4
; Uniform (scalar) version: all addresses are SGPR-computable, so the loads
; should be selected as s_load_dwordx4 and clustered into one SMEM clause.
; Restored here: entry-block label, `ret void` terminator and closing brace,
; which were dropped from this chunk (function was syntactically incomplete).
define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
bb:
  %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
  %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1
  %tmp3 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp2, align 16
  %tmp4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 1
  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 2
  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 16
  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 2
  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 3
  %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
  %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
  store <4 x i32> %tmp, <4 x i32> addrspace(1)* %arg1, align 16
  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %tmp4, align 16
  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp7, align 16
  store <4 x i32> %tmp9, <4 x i32> addrspace(1)* %tmp10, align 16
  ret void
}
; GCN-LABEL: {{^}}mubuf_clause:
; GCN: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; GCN-NEXT: buffer_load_dword
; Private-memory (addrspace(5)) version: the <4 x i32> accesses are lowered to
; per-dword MUBUF buffer_load/buffer_store instructions, which should cluster.
; Restored here: entry-block label, `ret void` terminator and closing brace,
; which were dropped from this chunk (function was syntactically incomplete).
define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %arg, <4 x i32> addrspace(5)* noalias nocapture %arg1) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp
  %tmp3 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp2, align 16
  %tmp4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp
  %tmp5 = add nuw nsw i32 %tmp, 1
  %tmp6 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp5
  %tmp7 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp6, align 16
  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp5
  %tmp9 = add nuw nsw i32 %tmp, 2
  %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp9
  %tmp11 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp10, align 16
  %tmp12 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp9
  %tmp13 = add nuw nsw i32 %tmp, 3
  %tmp14 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp13
  %tmp15 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp14, align 16
  %tmp16 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp13
  store <4 x i32> %tmp3, <4 x i32> addrspace(5)* %tmp4, align 16
  store <4 x i32> %tmp7, <4 x i32> addrspace(5)* %tmp8, align 16
  store <4 x i32> %tmp11, <4 x i32> addrspace(5)* %tmp12, align 16
  store <4 x i32> %tmp15, <4 x i32> addrspace(5)* %tmp16, align 16
  ret void
}
; GCN-LABEL: {{^}}vector_clause_indirect:
; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off
; GCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off offset:16
; Loads a pointer from %arg, then two <4 x i32> values through that pointer;
; the two dependent global loads should still form a clause after the pointer
; load completes (hence the s_waitcnt vmcnt(0) between them in the checks).
; Restored here: entry-block label, `ret void` terminator and closing brace,
; which were dropped from this chunk (function was syntactically incomplete).
define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture readnone %arg1, <4 x i32> addrspace(1)* noalias nocapture %arg2) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = zext i32 %tmp to i64
  %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp3
  %tmp5 = bitcast i64 addrspace(1)* %tmp4 to <4 x i32> addrspace(1)* addrspace(1)*
  %tmp6 = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(1)* %tmp5, align 8
  %tmp7 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp6, align 16
  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %tmp6, i64 1
  %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
  store <4 x i32> %tmp7, <4 x i32> addrspace(1)* %arg2, align 16
  %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg2, i64 1
  store <4 x i32> %tmp9, <4 x i32> addrspace(1)* %tmp10, align 16
  ret void
}
; GCN-LABEL: {{^}}load_global_d16_hi:
; GCN: global_load_short_d16_hi v
; GCN-NEXT: global_load_short_d16_hi v
; Two i16 loads inserted into the high halves of <2 x i16> vectors: both
; should select to global_load_short_d16_hi and cluster together.
; Restored here: entry-block label (name not visible in this chunk; any label
; is semantically equivalent), `ret void` terminator and closing brace.
define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrspace(1)* %out) {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32
  %load1 = load i16, i16 addrspace(1)* %in
  %load2 = load i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* %out
  %build2 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build3 = insertelement <2 x i16> %build2, i16 %load2, i32 1
  %gep2 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 32
  store <2 x i16> %build3, <2 x i16> addrspace(1)* %gep2
  ret void
}
; GCN-LABEL: {{^}}load_global_d16_lo:
; GCN: global_load_short_d16 v
; GCN-NEXT: global_load_short_d16 v
; Two i16 loads inserted into the low halves of <2 x i16> vectors: both
; should select to global_load_short_d16 and cluster together.
; Restored here: entry-block label (name not visible in this chunk; any label
; is semantically equivalent), `ret void` terminator and closing brace.
define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrspace(1)* %out) {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32
  %reg.bc1 = bitcast i32 %reg to <2 x i16>
  %reg.bc2 = bitcast i32 %reg to <2 x i16>
  %load1 = load i16, i16 addrspace(1)* %in
  %load2 = load i16, i16 addrspace(1)* %gep
  %build1 = insertelement <2 x i16> %reg.bc1, i16 %load1, i32 0
  %build2 = insertelement <2 x i16> %reg.bc2, i16 %load2, i32 0
  %gep2 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 32
  store <2 x i16> %build1, <2 x i16> addrspace(1)* %out
  store <2 x i16> %build2, <2 x i16> addrspace(1)* %gep2
  ret void
}
; AMDGPU intrinsic: returns the workitem (lane/thread) id in the x dimension.
declare i32 @llvm.amdgcn.workitem.id.x()