; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

declare i64 @_Z13get_global_idj(i32)
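
; Check that constant offsets between loads from a common base pointer are
; promoted into the immediate offset field of GFX9 global loads. GFX8 flat
; instructions have no immediate offset field, so each load keeps a full
; 64-bit address.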
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}
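
; Same pattern as above, but inside a manually unrolled loop nest; the
; constant strides should still fold into the GFX9 immediate offsets.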
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}

; Using a 32-bit address.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}
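
; GEP offsets here are close to 4 GiB; only a residue that fits the 13-bit
; signed immediate of GFX9 global instructions is expected to fold, with the
; remainder left in the register address computation.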
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}

; TODO: Support load4 as anchor instruction.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}
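
; Two independent base pointers; offsets can fold only against a load with
; the same base, so each buffer is expected to get its own zero-offset anchor.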
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
                                    i8 addrspace(1)* %buffer2) {
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}
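
; The loads appear in decreasing-offset order; they should still be promoted
; to immediate offsets relative to a common anchor.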
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}
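
; Large negative 64-bit offsets: only the small positive residue (2048 bytes)
; is expected to fold into the immediate.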
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}