1 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX90A %s
6 declare i64 @_Z13get_global_idj(i32)
; Straight-line sum of eight i64 loads spaced 256 x i64 (2048 bytes) apart
; from a per-wave base address. The CHECK lines verify that on GFX9/GFX10 the
; constant byte offsets are folded into the immediate offset field of the
; global loads instead of being materialized with extra address arithmetic.
; (Restored the elided `entry:` label, `ret void` and closing brace.)
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048

; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  ; Base address: buffer + (global id dependent, 32 KiB aligned) byte offset,
  ; then indexed by the low 8 bits of the global id.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; Eight loads at i64 indices 0, 256, 512, ... 1792 (2048-byte stride).
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}
; Nested summation loop: the inner for.body performs eleven i64 loads per
; iteration at a 2048-byte stride from a loop-varying base; the outer loop
; repeats it 128 times. The CHECK lines verify that the constant parts of the
; large offsets are promoted into the immediate offset field of the loads.
; (Restored elided lines required for valid IR: the `entry:` label referenced
; by the phis, the `br label %for.body` terminator of for.cond.preheader, and
; the trailing `ret void` / closing brace.)
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  ; Per-wave base pointer derived from the global id (128 KiB granularity).
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  ; Outer loop: count %dec31 down from 127 to 0.
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  ; Inner loop: %block.029 steps by 8192 up to 4194304; each iteration sums
  ; eleven loads at indices block+0, +256, +512, ... +2560 (2 KiB stride).
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}
; Same pattern as above, but with i32 accesses whose offsets fit in a 32-bit address.
; Sum of ten i32 loads spaced 256 x i32 (1024 bytes) apart. Checks that the
; 32-bit-sized constant offsets fold into the immediate field of the dword
; loads on GFX9/GFX10. (Restored the elided `entry:` label, `ret void` and
; closing brace.)
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024

; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024

entry:
  ; Base address derived from the global id, viewed as an i32 array.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  ; Ten loads at i32 indices 0, 256, 512, ... 2304 (1024-byte stride).
  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}
; Four i64 loads where three offsets are huge (~4 GiB apart); only the low
; bits of the difference between offsets can be folded into the load's
; immediate field, as the CHECK lines reflect. (Restored the elided `entry:`
; label, `ret void` and closing brace.)
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048

; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; The next three loads are ~4 GiB away (i64 indices near 2^29); only the
  ; 2 KiB differences between them can live in the immediate offset.
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}
354 ; TODO: Support load4 as anchor instruction.
; i32 variant of Offset64: four dword loads at ~2 GiB apart indices.
; (Restored the elided `entry:` label, `ret void` and closing brace.)
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}

; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024

entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  ; Distant loads: i32 indices near 2^29 (~2 GiB byte offsets); only the 1 KiB
  ; index deltas can be folded into the immediate offset.
  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}
; Two independent chains of three loads, each relative to a different buffer,
; so offset folding must be applied per base pointer. (Restored the elided
; `entry:` label, `ret void` and closing brace.)
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048

; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
                                         i8 addrspace(1)* %buffer2) {
entry:
  ; Same workitem-dependent byte offset applied to both buffers.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  ; Chain 1: loads at indices 512/768/1024 from %saddr.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  ; Chain 2: loads at indices 1280/1536/1792 from %saddr2.
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}
; Same eight-load 2 KiB-stride pattern as clmem_read_simplified, but the
; loads appear in descending offset order (1792 down to 256), exercising
; offset folding when the anchor access is not the lowest address.
; (Restored the elided `entry:` label, `ret void` and closing brace.)
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048

; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  ; Loads issued from highest index (1792) down to lowest (256).
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}
; Two i64 loads at large negative indices (-536870656 and -536870912, i.e.
; roughly -4 GiB in bytes) from the wave base; checks how negative constant
; offsets are handled (GFX9 folds one into offset:-2048, GFX10 folds none).
; NOTE(review): the function's closing `ret void`/`}` lies beyond the visible
; chunk — confirm against the full file.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]

; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
  ; Base address derived from the global id, as in the kernels above.
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  ; The two loads are 2048 bytes apart; both bases are far below %buffer_wave.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8