1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
5 ; Test splitting flat instruction offsets into the low and high bits
6 ; when the offset doesn't fit in the offset field.
; Constant byte offset 1 from the pointer in v[0:1]. The gfx900 checks expect
; the offset folded into the load (offset:1); the gfx1010 checks expect a
; 64-bit VALU add of 1 followed by a load with no immediate offset.
8 define i8 @flat_inst_valu_offset_1(i8* %p) {
9 ; GFX9-LABEL: flat_inst_valu_offset_1:
11 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1
13 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14 ; GFX9-NEXT: s_setpc_b64 s[30:31]
16 ; GFX10-LABEL: flat_inst_valu_offset_1:
18 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
20 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 1
21 ; GFX10-NEXT: ; implicit-def: $vcc_hi
22 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
23 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
24 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25 ; GFX10-NEXT: s_setpc_b64 s[30:31]
26 %gep = getelementptr i8, i8* %p, i64 1
27 %load = load i8, i8* %gep, align 4
; Offset 2047 (0x7ff, all of the low 11 bits set). The gfx900 checks expect
; offset:2047 on the load; the gfx1010 checks expect 0x7ff added to the
; pointer with a 64-bit VALU add and no immediate offset.
31 define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
32 ; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
34 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
36 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
37 ; GFX9-NEXT: s_setpc_b64 s[30:31]
39 ; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
41 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
43 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
44 ; GFX10-NEXT: ; implicit-def: $vcc_hi
45 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
46 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
47 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
48 ; GFX10-NEXT: s_setpc_b64 s[30:31]
49 %gep = getelementptr i8, i8* %p, i64 2047
50 %load = load i8, i8* %gep, align 4
; Offset 4095 (0xfff, all of the low 12 bits set). The gfx900 checks expect
; offset:4095 on the load; the gfx1010 checks expect 0xfff added to the
; pointer with a 64-bit VALU add and no immediate offset.
54 define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
55 ; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
57 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
59 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
60 ; GFX9-NEXT: s_setpc_b64 s[30:31]
62 ; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
64 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
66 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
67 ; GFX10-NEXT: ; implicit-def: $vcc_hi
68 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
69 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
70 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
71 ; GFX10-NEXT: s_setpc_b64 s[30:31]
72 %gep = getelementptr i8, i8* %p, i64 4095
73 %load = load i8, i8* %gep, align 4
; Offset 8191 (0x1fff). The gfx900 checks expect the offset split into a
; VALU add of 0x1000 plus offset:4095 on the load (0x1000 + 4095 = 8191);
; the gfx1010 checks expect the whole 0x1fff added with a VALU add.
77 define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
78 ; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
80 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
82 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
83 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
84 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
85 ; GFX9-NEXT: s_setpc_b64 s[30:31]
87 ; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
89 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
91 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
92 ; GFX10-NEXT: ; implicit-def: $vcc_hi
93 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
94 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
95 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
96 ; GFX10-NEXT: s_setpc_b64 s[30:31]
97 %gep = getelementptr i8, i8* %p, i64 8191
98 %load = load i8, i8* %gep, align 4
; Offset -2048. The gfx900 checks expect a 64-bit VALU add of -4096
; (0xfffff000 low, -1 high) plus offset:2048 on the load (-4096 + 2048 =
; -2048); the gfx1010 checks expect a single add of -2048 (0xfffff800, -1).
102 define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
103 ; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
105 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
107 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
108 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
109 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
110 ; GFX9-NEXT: s_setpc_b64 s[30:31]
112 ; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
114 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
116 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0
117 ; GFX10-NEXT: ; implicit-def: $vcc_hi
118 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
119 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
120 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
121 ; GFX10-NEXT: s_setpc_b64 s[30:31]
122 %gep = getelementptr i8, i8* %p, i64 -2048
123 %load = load i8, i8* %gep, align 4
; Offset -4096. Both targets are expected to add -4096 (0xfffff000 low, -1
; high) to the pointer with a 64-bit VALU add and to issue the load with no
; immediate offset.
127 define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
128 ; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
130 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
132 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
133 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
134 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
135 ; GFX9-NEXT: s_setpc_b64 s[30:31]
137 ; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
139 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
141 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
142 ; GFX10-NEXT: ; implicit-def: $vcc_hi
143 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
144 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
145 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
146 ; GFX10-NEXT: s_setpc_b64 s[30:31]
147 %gep = getelementptr i8, i8* %p, i64 -4096
148 %load = load i8, i8* %gep, align 4
; Offset -8192. Both targets are expected to add -8192 (0xffffe000 low, -1
; high) to the pointer with a 64-bit VALU add and to issue the load with no
; immediate offset.
152 define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
153 ; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
155 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
157 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
158 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
159 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
160 ; GFX9-NEXT: s_setpc_b64 s[30:31]
162 ; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
164 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
166 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
167 ; GFX10-NEXT: ; implicit-def: $vcc_hi
168 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
169 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
170 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171 ; GFX10-NEXT: s_setpc_b64 s[30:31]
172 %gep = getelementptr i8, i8* %p, i64 -8192
173 %load = load i8, i8* %gep, align 4
; Offset 4095 (2 * 2047 + 1), the same constant used by
; flat_inst_valu_offset_12bit_max. The gfx900 checks expect offset:4095 on
; the load; the gfx1010 checks expect a 64-bit VALU add of 0xfff.
177 define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
178 ; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
180 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
182 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
183 ; GFX9-NEXT: s_setpc_b64 s[30:31]
185 ; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
187 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
189 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
190 ; GFX10-NEXT: ; implicit-def: $vcc_hi
191 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
192 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
193 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
194 ; GFX10-NEXT: s_setpc_b64 s[30:31]
195 %gep = getelementptr i8, i8* %p, i64 4095
196 %load = load i8, i8* %gep, align 4
; Offset 8191 (2 * 4095 + 1). The gfx900 checks expect a VALU add of 0x1000
; plus offset:4095 on the load; the gfx1010 checks expect the whole 0x1fff
; added with a VALU add and no immediate offset.
200 define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
201 ; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
203 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
205 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
206 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
207 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
208 ; GFX9-NEXT: s_setpc_b64 s[30:31]
210 ; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
212 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
214 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
215 ; GFX10-NEXT: ; implicit-def: $vcc_hi
216 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
217 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
218 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
219 ; GFX10-NEXT: s_setpc_b64 s[30:31]
220 %gep = getelementptr i8, i8* %p, i64 8191
221 %load = load i8, i8* %gep, align 4
; Offset 16383 (2 * 8191 + 1). The gfx900 checks expect a VALU add of 0x3000
; plus offset:4095 on the load (0x3000 + 4095 = 16383); the gfx1010 checks
; expect the whole 0x3fff added with a VALU add and no immediate offset.
225 define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
226 ; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
228 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
230 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
231 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
232 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
233 ; GFX9-NEXT: s_setpc_b64 s[30:31]
235 ; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
237 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
239 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0
240 ; GFX10-NEXT: ; implicit-def: $vcc_hi
241 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
242 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
243 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
244 ; GFX10-NEXT: s_setpc_b64 s[30:31]
245 %gep = getelementptr i8, i8* %p, i64 16383
246 %load = load i8, i8* %gep, align 4
; Offset -4096 (2 * -2048). Both targets are expected to add -4096
; (0xfffff000 low, -1 high) with a 64-bit VALU add and to issue the load with
; no immediate offset.
250 define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
251 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
253 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
255 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
256 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
257 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
258 ; GFX9-NEXT: s_setpc_b64 s[30:31]
260 ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
262 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
264 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
265 ; GFX10-NEXT: ; implicit-def: $vcc_hi
266 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
267 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
268 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
269 ; GFX10-NEXT: s_setpc_b64 s[30:31]
270 %gep = getelementptr i8, i8* %p, i64 -4096
271 %load = load i8, i8* %gep, align 4
; Offset -8192 (2 * -4096). Both targets are expected to add -8192
; (0xffffe000 low, -1 high) with a 64-bit VALU add and to issue the load with
; no immediate offset.
275 define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
276 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
278 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
280 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
281 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
282 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
283 ; GFX9-NEXT: s_setpc_b64 s[30:31]
285 ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
287 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
289 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
290 ; GFX10-NEXT: ; implicit-def: $vcc_hi
291 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
292 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
293 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
294 ; GFX10-NEXT: s_setpc_b64 s[30:31]
295 %gep = getelementptr i8, i8* %p, i64 -8192
296 %load = load i8, i8* %gep, align 4
; Offset -16384 (2 * -8192). Both targets are expected to add -16384
; (0xffffc000 low, -1 high) with a 64-bit VALU add and to issue the load with
; no immediate offset.
300 define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
301 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
303 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
305 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
306 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
307 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
308 ; GFX9-NEXT: s_setpc_b64 s[30:31]
310 ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
312 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
313 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
314 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
315 ; GFX10-NEXT: ; implicit-def: $vcc_hi
316 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
317 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
318 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
319 ; GFX10-NEXT: s_setpc_b64 s[30:31]
320 %gep = getelementptr i8, i8* %p, i64 -16384
321 %load = load i8, i8* %gep, align 4
325 ; Fill 11-bit low-bits (1ull << 33) | 2047
; The offset 8589936639 needs more than 32 bits. The gfx900 checks expect the
; high dword (2) added with the carry chain and the low 2047 left in the
; load's offset field; the gfx1010 checks expect 0x7ff added in the VALU add.
326 define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
327 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
329 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
331 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
332 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
333 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
334 ; GFX9-NEXT: s_setpc_b64 s[30:31]
336 ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
338 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
340 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
341 ; GFX10-NEXT: ; implicit-def: $vcc_hi
342 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
343 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
344 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
345 ; GFX10-NEXT: s_setpc_b64 s[30:31]
346 %gep = getelementptr i8, i8* %p, i64 8589936639
347 %load = load i8, i8* %gep, align 4
351 ; Fill 11-bit low-bits (1ull << 33) | 2048
; The offset 8589936640 has a low part one past the 11-bit maximum. The
; gfx900 checks expect high dword 2 in the add and offset:2048 on the load;
; the gfx1010 checks expect 0x800 added in the VALU add instead.
352 define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
353 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
355 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
356 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
357 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
358 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
359 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
360 ; GFX9-NEXT: s_setpc_b64 s[30:31]
362 ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
364 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
365 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
366 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
367 ; GFX10-NEXT: ; implicit-def: $vcc_hi
368 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
369 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
370 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
371 ; GFX10-NEXT: s_setpc_b64 s[30:31]
372 %gep = getelementptr i8, i8* %p, i64 8589936640
373 %load = load i8, i8* %gep, align 4
377 ; Fill 12-bit low-bits (1ull << 33) | 4095
; The offset is 8589938687. The gfx900 checks expect high dword 2 in the add
; and the low 4095 in the load's offset field; the gfx1010 checks expect
; 0xfff added in the VALU add and no immediate offset.
378 define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
379 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
381 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
383 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
384 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
385 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
386 ; GFX9-NEXT: s_setpc_b64 s[30:31]
388 ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
390 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
392 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
393 ; GFX10-NEXT: ; implicit-def: $vcc_hi
394 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
395 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
396 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
397 ; GFX10-NEXT: s_setpc_b64 s[30:31]
398 %gep = getelementptr i8, i8* %p, i64 8589938687
399 %load = load i8, i8* %gep, align 4
403 ; Fill 12-bit low-bits (1ull << 33) | 4096
; The offset 8589938688 has a low part one past the 12-bit maximum. Both
; targets are expected to add the full constant (0x1000 low, 2 high) with a
; VALU add and to issue the load with no immediate offset.
404 define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
405 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
407 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
409 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
410 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
411 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
412 ; GFX9-NEXT: s_setpc_b64 s[30:31]
414 ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
416 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
418 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
419 ; GFX10-NEXT: ; implicit-def: $vcc_hi
420 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
421 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
422 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
423 ; GFX10-NEXT: s_setpc_b64 s[30:31]
424 %gep = getelementptr i8, i8* %p, i64 8589938688
425 %load = load i8, i8* %gep, align 4
429 ; Fill 13-bit low-bits (1ull << 33) | 8191
; The offset is 8589942783. The gfx900 checks expect 0x1000 low / 2 high in
; the add plus offset:4095 on the load; the gfx1010 checks expect 0x1fff
; low / 2 high in the add and no immediate offset.
430 define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
431 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
433 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
435 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
436 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
437 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
438 ; GFX9-NEXT: s_setpc_b64 s[30:31]
440 ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
442 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
444 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
445 ; GFX10-NEXT: ; implicit-def: $vcc_hi
446 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
447 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
448 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
449 ; GFX10-NEXT: s_setpc_b64 s[30:31]
450 %gep = getelementptr i8, i8* %p, i64 8589942783
451 %load = load i8, i8* %gep, align 4
455 ; Fill 13-bit low-bits (1ull << 33) | 8192
; The offset is 8589942784. Both targets are expected to add the full
; constant (0x2000 low, 2 high) with a VALU add and to issue the load with no
; immediate offset.
456 define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
457 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
459 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
460 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
461 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
462 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
463 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
464 ; GFX9-NEXT: s_setpc_b64 s[30:31]
466 ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
468 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
469 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
470 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
471 ; GFX10-NEXT: ; implicit-def: $vcc_hi
472 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
473 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
474 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
475 ; GFX10-NEXT: s_setpc_b64 s[30:31]
476 %gep = getelementptr i8, i8* %p, i64 8589942784
477 %load = load i8, i8* %gep, align 4
481 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; As a signed i64 the offset is -9223372036854773761. The gfx900 checks
; expect the high addend built in v2 by v_bfrev_b32 and the low 2047 in the
; load's offset field; gfx1010 adds 0x7ff low and the literal 0x80000000 high.
482 define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
483 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
485 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
487 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
488 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
489 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
490 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
491 ; GFX9-NEXT: s_setpc_b64 s[30:31]
493 ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
495 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
496 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
497 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
498 ; GFX10-NEXT: ; implicit-def: $vcc_hi
499 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
500 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
501 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
502 ; GFX10-NEXT: s_setpc_b64 s[30:31]
503 %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
504 %load = load i8, i8* %gep, align 4
508 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; As a signed i64 the offset is -9223372036854773760. The gfx900 checks
; expect the high addend built in v2 by v_bfrev_b32 and offset:2048 on the
; load; gfx1010 adds 0x800 low and the literal 0x80000000 high.
509 define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
510 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
512 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
513 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
514 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
515 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
516 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
517 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
518 ; GFX9-NEXT: s_setpc_b64 s[30:31]
520 ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
522 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
524 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
525 ; GFX10-NEXT: ; implicit-def: $vcc_hi
526 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
527 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
528 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
529 ; GFX10-NEXT: s_setpc_b64 s[30:31]
530 %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
531 %load = load i8, i8* %gep, align 4
535 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; As a signed i64 the offset is -9223372036854771713. The gfx900 checks
; expect the high addend built in v2 by v_bfrev_b32 and the low 4095 in the
; load's offset field; gfx1010 adds 0xfff low and the literal 0x80000000 high.
536 define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
537 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
539 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
540 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
541 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
542 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
543 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
544 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
545 ; GFX9-NEXT: s_setpc_b64 s[30:31]
547 ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
549 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
551 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
552 ; GFX10-NEXT: ; implicit-def: $vcc_hi
553 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
554 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
555 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
556 ; GFX10-NEXT: s_setpc_b64 s[30:31]
557 %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
558 %load = load i8, i8* %gep, align 4
562 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; As a signed i64 the offset is -9223372036854771712. Both targets are
; expected to add 0x1000 to the low dword and 0x80000000 to the high dword
; (via v2 on gfx900, as a literal on gfx1010) with no immediate offset.
563 define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
564 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
566 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
568 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
569 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
570 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
571 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
572 ; GFX9-NEXT: s_setpc_b64 s[30:31]
574 ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
576 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
578 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
579 ; GFX10-NEXT: ; implicit-def: $vcc_hi
580 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
581 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
582 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
583 ; GFX10-NEXT: s_setpc_b64 s[30:31]
584 %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
585 %load = load i8, i8* %gep, align 4
589 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; As a signed i64 the offset is -9223372036854767617. The gfx900 checks
; expect 0x1000 in the low add, the high addend in v2, and offset:4095 on
; the load; gfx1010 adds 0x1fff low and the literal 0x80000000 high.
590 define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
591 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
593 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
595 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
596 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
597 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
598 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
599 ; GFX9-NEXT: s_setpc_b64 s[30:31]
601 ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
603 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
605 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
606 ; GFX10-NEXT: ; implicit-def: $vcc_hi
607 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
608 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
609 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
610 ; GFX10-NEXT: s_setpc_b64 s[30:31]
611 %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
612 %load = load i8, i8* %gep, align 4
616 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; As a signed i64 the offset is -9223372036854767616. Both targets are
; expected to add 0x2000 to the low dword and 0x80000000 to the high dword
; (via v2 on gfx900, as a literal on gfx1010) with no immediate offset.
617 define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
618 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
620 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
621 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
622 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
623 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
624 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
625 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
626 ; GFX9-NEXT: s_setpc_b64 s[30:31]
628 ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
630 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
631 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
632 ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
633 ; GFX10-NEXT: ; implicit-def: $vcc_hi
634 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
635 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
636 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
637 ; GFX10-NEXT: s_setpc_b64 s[30:31]
638 %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
639 %load = load i8, i8* %gep, align 4
; Kernel variant: the pointer is loaded into SGPRs with s_load_dwordx2. For
; offset 1 the gfx900 checks expect a plain SGPR-to-VGPR copy and offset:1 on
; the load; the gfx1010 checks expect a scalar s_add_u32/s_addc_u32 of 1
; before the copy and no immediate offset.
643 define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
644 ; GFX9-LABEL: flat_inst_salu_offset_1:
646 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
647 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
648 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
649 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
650 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1
651 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
652 ; GFX9-NEXT: flat_store_byte v[0:1], v0
653 ; GFX9-NEXT: s_endpgm
655 ; GFX10-LABEL: flat_inst_salu_offset_1:
657 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
658 ; GFX10-NEXT: ; implicit-def: $vcc_hi
659 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
660 ; GFX10-NEXT: s_add_u32 s0, s0, 1
661 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
662 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
663 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
664 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
665 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
666 ; GFX10-NEXT: flat_store_byte v[0:1], v0
667 ; GFX10-NEXT: s_endpgm
668 %gep = getelementptr i8, i8* %p, i64 1
669 %load = load volatile i8, i8* %gep, align 1
670 store i8 %load, i8* undef
; Kernel variant with offset 2047 on an SGPR pointer. The gfx900 checks
; expect offset:2047 on the load; the gfx1010 checks expect a scalar add of
; 0x7ff before the SGPR-to-VGPR copy and no immediate offset.
674 define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
675 ; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
677 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
678 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
679 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
680 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
681 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
682 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
683 ; GFX9-NEXT: flat_store_byte v[0:1], v0
684 ; GFX9-NEXT: s_endpgm
686 ; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
688 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
689 ; GFX10-NEXT: ; implicit-def: $vcc_hi
690 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
691 ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
692 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
693 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
694 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
695 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
696 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
697 ; GFX10-NEXT: flat_store_byte v[0:1], v0
698 ; GFX10-NEXT: s_endpgm
699 %gep = getelementptr i8, i8* %p, i64 2047
700 %load = load volatile i8, i8* %gep, align 1
701 store i8 %load, i8* undef
; Kernel variant with offset 4095 on an SGPR pointer. The gfx900 checks
; expect offset:4095 on the load; the gfx1010 checks expect a scalar add of
; 0xfff before the SGPR-to-VGPR copy and no immediate offset.
705 define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
706 ; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
708 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
709 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
710 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
711 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
712 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
713 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
714 ; GFX9-NEXT: flat_store_byte v[0:1], v0
715 ; GFX9-NEXT: s_endpgm
717 ; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
719 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
720 ; GFX10-NEXT: ; implicit-def: $vcc_hi
721 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
722 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
723 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
724 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
725 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
726 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
727 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
728 ; GFX10-NEXT: flat_store_byte v[0:1], v0
729 ; GFX10-NEXT: s_endpgm
730 %gep = getelementptr i8, i8* %p, i64 4095
731 %load = load volatile i8, i8* %gep, align 1
732 store i8 %load, i8* undef
; Kernel variant with offset 8191 on an SGPR pointer. The gfx900 checks
; expect the pointer copied to VGPRs, a VALU add of 0x1000, and offset:4095
; on the load; the gfx1010 checks expect a scalar add of the whole 0x1fff
; before the copy and no immediate offset.
736 define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
737 ; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
739 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
740 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
742 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
743 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
744 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
745 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
746 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
747 ; GFX9-NEXT: flat_store_byte v[0:1], v0
748 ; GFX9-NEXT: s_endpgm
750 ; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
752 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
753 ; GFX10-NEXT: ; implicit-def: $vcc_hi
754 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
755 ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
756 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
757 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
758 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
759 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
760 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
761 ; GFX10-NEXT: flat_store_byte v[0:1], v0
762 ; GFX10-NEXT: s_endpgm
763 %gep = getelementptr i8, i8* %p, i64 8191
764 %load = load volatile i8, i8* %gep, align 1
765 store i8 %load, i8* undef
; Kernel variant with offset -2048 on an SGPR pointer. The gfx900 checks
; expect the pointer copied to VGPRs, a VALU add of -4096 (0xfffff000, -1),
; and offset:2048 on the load; the gfx1010 checks expect a scalar add of
; -2048 (0xfffff800, -1) before the copy and no immediate offset.
769 define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
770 ; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
772 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
773 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
774 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
775 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
776 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
777 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
778 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
779 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
780 ; GFX9-NEXT: flat_store_byte v[0:1], v0
781 ; GFX9-NEXT: s_endpgm
783 ; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
785 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
786 ; GFX10-NEXT: ; implicit-def: $vcc_hi
787 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800
789 ; GFX10-NEXT: s_addc_u32 s1, s1, -1
790 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
791 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
792 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
793 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
794 ; GFX10-NEXT: flat_store_byte v[0:1], v0
795 ; GFX10-NEXT: s_endpgm
796 %gep = getelementptr i8, i8* %p, i64 -2048
797 %load = load volatile i8, i8* %gep, align 1
798 store i8 %load, i8* undef
; Kernel variant with offset -4096 on an SGPR pointer. The gfx900 checks
; expect the add of (0xfffff000, -1) done on the VALU after the copy to
; VGPRs; the gfx1010 checks expect the same constant added with
; s_add_u32/s_addc_u32 before the copy. Neither load has an immediate offset.
802 define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
803 ; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
805 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
806 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
807 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
808 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
809 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
810 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
811 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
812 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
813 ; GFX9-NEXT: flat_store_byte v[0:1], v0
814 ; GFX9-NEXT: s_endpgm
816 ; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
818 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
819 ; GFX10-NEXT: ; implicit-def: $vcc_hi
820 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
821 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
822 ; GFX10-NEXT: s_addc_u32 s1, s1, -1
823 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
824 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
825 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
826 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
827 ; GFX10-NEXT: flat_store_byte v[0:1], v0
828 ; GFX10-NEXT: s_endpgm
829 %gep = getelementptr i8, i8* %p, i64 -4096
830 %load = load volatile i8, i8* %gep, align 1
831 store i8 %load, i8* undef
; Kernel case with a byte offset of -8192 from the pointer argument: a
; volatile i8 is loaded from %p - 8192 and stored to an undef address.
; The gfx900 checks expect the whole constant (0xffffe000 low, -1 high) to be
; added with VALU adds and the load to carry no immediate offset. The gfx1010
; checks expect the same constant to be added with scalar adds.
835 define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
836 ; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
838 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
839 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
840 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
841 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
842 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
843 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
844 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
845 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
846 ; GFX9-NEXT: flat_store_byte v[0:1], v0
847 ; GFX9-NEXT: s_endpgm
849 ; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
851 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
852 ; GFX10-NEXT: ; implicit-def: $vcc_hi
853 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
854 ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
855 ; GFX10-NEXT: s_addc_u32 s1, s1, -1
856 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
857 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
858 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
859 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
860 ; GFX10-NEXT: flat_store_byte v[0:1], v0
861 ; GFX10-NEXT: s_endpgm
862 %gep = getelementptr i8, i8* %p, i64 -8192
863 %load = load volatile i8, i8* %gep, align 1
864 store i8 %load, i8* undef
; Kernel case with a byte offset of 4095 from the pointer argument: a
; volatile i8 is loaded from %p + 4095 and stored to an undef address.
; The gfx900 checks expect no address arithmetic at all, with the entire
; constant placed in the load's immediate offset field (offset 4095). The
; gfx1010 checks expect 0xfff to be added with scalar adds and the load to
; carry no immediate offset.
868 define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
869 ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
871 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
872 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
874 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
875 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
876 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
877 ; GFX9-NEXT: flat_store_byte v[0:1], v0
878 ; GFX9-NEXT: s_endpgm
880 ; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
882 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
883 ; GFX10-NEXT: ; implicit-def: $vcc_hi
884 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
885 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
886 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
887 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
888 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
889 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
890 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
891 ; GFX10-NEXT: flat_store_byte v[0:1], v0
892 ; GFX10-NEXT: s_endpgm
893 %gep = getelementptr i8, i8* %p, i64 4095
894 %load = load volatile i8, i8* %gep, align 1
895 store i8 %load, i8* undef
; Kernel case with a byte offset of 8191 from the pointer argument: a
; volatile i8 is loaded from %p + 8191 and stored to an undef address.
; The gfx900 checks expect the constant to be split: 0x1000 is added to the
; address with VALU adds and the remaining 4095 goes in the load's immediate
; offset field. The gfx1010 checks expect the whole 0x1fff to be added with
; scalar adds and the load to carry no immediate offset.
899 define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
900 ; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
902 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
903 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
904 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
905 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
906 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
907 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
908 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
909 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
910 ; GFX9-NEXT: flat_store_byte v[0:1], v0
911 ; GFX9-NEXT: s_endpgm
913 ; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
915 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
916 ; GFX10-NEXT: ; implicit-def: $vcc_hi
917 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
918 ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
919 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
920 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
921 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
922 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
923 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
924 ; GFX10-NEXT: flat_store_byte v[0:1], v0
925 ; GFX10-NEXT: s_endpgm
926 %gep = getelementptr i8, i8* %p, i64 8191
927 %load = load volatile i8, i8* %gep, align 1
928 store i8 %load, i8* undef
; Kernel case with a byte offset of 16383 from the pointer argument: a
; volatile i8 is loaded from %p + 16383 and stored to an undef address.
; The gfx900 checks expect the constant to be split: 0x3000 is added to the
; address with VALU adds and the remaining 4095 goes in the load's immediate
; offset field. The gfx1010 checks expect the whole 0x3fff to be added with
; scalar adds and the load to carry no immediate offset.
932 define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
933 ; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
935 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
936 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
937 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
938 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
939 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
940 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
941 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
942 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
943 ; GFX9-NEXT: flat_store_byte v[0:1], v0
944 ; GFX9-NEXT: s_endpgm
946 ; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
948 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
949 ; GFX10-NEXT: ; implicit-def: $vcc_hi
950 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
951 ; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff
952 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
953 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
954 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
955 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
956 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
957 ; GFX10-NEXT: flat_store_byte v[0:1], v0
958 ; GFX10-NEXT: s_endpgm
959 %gep = getelementptr i8, i8* %p, i64 16383
960 %load = load volatile i8, i8* %gep, align 1
961 store i8 %load, i8* undef
; Kernel case with a byte offset of -4096 from the pointer argument (the same
; constant as @flat_inst_salu_offset_neg_12bit_max): a volatile i8 is loaded
; from %p - 4096 and stored to an undef address.
; The gfx900 checks expect the whole constant (0xfffff000 low, -1 high) to be
; added with VALU adds and the load to carry no immediate offset. The gfx1010
; checks expect the same constant to be added with scalar adds.
965 define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
966 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
968 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
969 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
970 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
971 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
972 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
973 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
974 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
975 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
976 ; GFX9-NEXT: flat_store_byte v[0:1], v0
977 ; GFX9-NEXT: s_endpgm
979 ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
981 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
982 ; GFX10-NEXT: ; implicit-def: $vcc_hi
983 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
985 ; GFX10-NEXT: s_addc_u32 s1, s1, -1
986 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
987 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
988 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
989 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
990 ; GFX10-NEXT: flat_store_byte v[0:1], v0
991 ; GFX10-NEXT: s_endpgm
992 %gep = getelementptr i8, i8* %p, i64 -4096
993 %load = load volatile i8, i8* %gep, align 1
994 store i8 %load, i8* undef
; Kernel case with a byte offset of -8192 from the pointer argument (the same
; constant as @flat_inst_salu_offset_neg_13bit_max): a volatile i8 is loaded
; from %p - 8192 and stored to an undef address.
; The gfx900 checks expect the whole constant (0xffffe000 low, -1 high) to be
; added with VALU adds and the load to carry no immediate offset. The gfx1010
; checks expect the same constant to be added with scalar adds.
998 define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) {
999 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
1001 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1002 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1003 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1004 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1005 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
1006 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1007 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
1008 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1009 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1010 ; GFX9-NEXT: s_endpgm
1012 ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
1014 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1015 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1016 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1017 ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
1018 ; GFX10-NEXT: s_addc_u32 s1, s1, -1
1019 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1020 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1021 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1022 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1023 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1024 ; GFX10-NEXT: s_endpgm
1025 %gep = getelementptr i8, i8* %p, i64 -8192
1026 %load = load volatile i8, i8* %gep, align 1
1027 store i8 %load, i8* undef
; Kernel case with a byte offset of -16384 from the pointer argument: a
; volatile i8 is loaded from %p - 16384 and stored to an undef address.
; The gfx900 checks expect the whole constant (0xffffc000 low, -1 high) to be
; added with VALU adds and the load to carry no immediate offset. The gfx1010
; checks expect the same constant to be added with scalar adds.
1031 define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) {
1032 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
1034 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1035 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1036 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1037 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1038 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
1039 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1040 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
1041 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1042 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1043 ; GFX9-NEXT: s_endpgm
1045 ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
1047 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1048 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1049 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1050 ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000
1051 ; GFX10-NEXT: s_addc_u32 s1, s1, -1
1052 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1053 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1054 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1055 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1056 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1057 ; GFX10-NEXT: s_endpgm
1058 %gep = getelementptr i8, i8* %p, i64 -16384
1059 %load = load volatile i8, i8* %gep, align 1
1060 store i8 %load, i8* undef
1064 ; 64-bit offset with all 11 low bits set: (1ull << 33) | 2047 = 8589936639.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect 0 to be added to the low dword and 2 (with carry)
; to the high dword, with the low 2047 placed in the load's immediate offset
; field. The gfx1010 checks expect scalar adds of 0x7ff (low) and 2 (high)
; and a load with no immediate offset.
1065 define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) {
1066 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
1068 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1069 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1071 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
1072 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1073 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
1074 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1075 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1076 ; GFX9-NEXT: s_endpgm
1078 ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
1080 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1081 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1082 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1083 ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
1084 ; GFX10-NEXT: s_addc_u32 s1, s1, 2
1085 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1086 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1087 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1088 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1089 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1090 ; GFX10-NEXT: s_endpgm
1091 %gep = getelementptr i8, i8* %p, i64 8589936639
1092 %load = load volatile i8, i8* %gep, align 1
1093 store i8 %load, i8* undef
1097 ; 64-bit offset one past the 11-bit maximum: (1ull << 33) | 2048 = 8589936640.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect 0 to be added to the low dword and 2 (with carry)
; to the high dword, with the low 2048 placed in the load's immediate offset
; field. The gfx1010 checks expect scalar adds of 0x800 (low) and 2 (high)
; and a load with no immediate offset.
1098 define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) {
1099 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
1101 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1102 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1103 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1104 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
1105 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1106 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
1107 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1108 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1109 ; GFX9-NEXT: s_endpgm
1111 ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
1113 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1114 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1115 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1116 ; GFX10-NEXT: s_add_u32 s0, s0, 0x800
1117 ; GFX10-NEXT: s_addc_u32 s1, s1, 2
1118 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1119 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1120 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1121 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1122 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1123 ; GFX10-NEXT: s_endpgm
1124 %gep = getelementptr i8, i8* %p, i64 8589936640
1125 %load = load volatile i8, i8* %gep, align 1
1126 store i8 %load, i8* undef
1130 ; 64-bit offset with all 12 low bits set: (1ull << 33) | 4095 = 8589938687.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect 0 to be added to the low dword and 2 (with carry)
; to the high dword, with the low 4095 placed in the load's immediate offset
; field. The gfx1010 checks expect scalar adds of 0xfff (low) and 2 (high)
; and a load with no immediate offset.
1131 define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) {
1132 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
1134 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1135 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1136 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1137 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
1138 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1139 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
1140 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1141 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1142 ; GFX9-NEXT: s_endpgm
1144 ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
1146 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1147 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1148 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1149 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
1150 ; GFX10-NEXT: s_addc_u32 s1, s1, 2
1151 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1152 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1153 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1154 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1155 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1156 ; GFX10-NEXT: s_endpgm
1157 %gep = getelementptr i8, i8* %p, i64 8589938687
1158 %load = load volatile i8, i8* %gep, align 1
1159 store i8 %load, i8* undef
1163 ; 64-bit offset one past the 12-bit maximum: (1ull << 33) | 4096 = 8589938688.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect 0x1000 to be added to the low dword and 2 (with
; carry) to the high dword using VALU adds, and a load with no immediate
; offset. The gfx1010 checks expect scalar adds of 0x1000 (low) and 2 (high)
; and a load with no immediate offset.
1164 define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) {
1165 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
1167 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1170 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1171 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
1172 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1173 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
1174 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1175 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1176 ; GFX9-NEXT: s_endpgm
1178 ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
1180 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1181 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1182 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1183 ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
1184 ; GFX10-NEXT: s_addc_u32 s1, s1, 2
1185 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1186 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1187 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1188 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1189 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1190 ; GFX10-NEXT: s_endpgm
1191 %gep = getelementptr i8, i8* %p, i64 8589938688
1192 %load = load volatile i8, i8* %gep, align 1
1193 store i8 %load, i8* undef
1197 ; 64-bit offset with all 13 low bits set: (1ull << 33) | 8191 = 8589942783.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect 0x1000 to be added to the low dword and 2 (with
; carry) to the high dword using VALU adds, with the remaining 4095 placed in
; the load's immediate offset field. The gfx1010 checks expect scalar adds of
; 0x1fff (low) and 2 (high) and a load with no immediate offset.
1198 define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) {
1199 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
1201 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1202 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1203 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1204 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1205 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
1206 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1207 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
1208 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1209 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1210 ; GFX9-NEXT: s_endpgm
1212 ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
1214 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1215 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1216 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1217 ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
1218 ; GFX10-NEXT: s_addc_u32 s1, s1, 2
1219 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1220 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1221 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1222 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1223 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1224 ; GFX10-NEXT: s_endpgm
1225 %gep = getelementptr i8, i8* %p, i64 8589942783
1226 %load = load volatile i8, i8* %gep, align 1
1227 store i8 %load, i8* undef
1231 ; 64-bit offset one past the 13-bit maximum: (1ull << 33) | 8192 = 8589942784.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect 0x2000 to be added to the low dword and 2 (with
; carry) to the high dword using VALU adds, and a load with no immediate
; offset. The gfx1010 checks expect scalar adds of 0x2000 (low) and 2 (high)
; and a load with no immediate offset.
1232 define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) {
1233 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
1235 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1236 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1238 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1239 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
1240 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1241 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
1242 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1243 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1244 ; GFX9-NEXT: s_endpgm
1246 ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
1248 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1249 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1250 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1251 ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
1252 ; GFX10-NEXT: s_addc_u32 s1, s1, 2
1253 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1254 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1255 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1256 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1257 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1258 ; GFX10-NEXT: s_endpgm
1259 %gep = getelementptr i8, i8* %p, i64 8589942784
1260 %load = load volatile i8, i8* %gep, align 1
1261 store i8 %load, i8* undef
1265 ; Sign bit set with all 11 low bits set: (1ull << 63) | 2047, written in the IR
; as the i64 constant -9223372036854773761.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect the high-dword addend to be materialized with
; v_bfrev_b32 of 1, 0 to be added to the low dword, and the low 2047 placed
; in the load's immediate offset field. The gfx1010 checks expect scalar adds
; of 0x7ff (low) and 0x80000000 (high) and a load with no immediate offset.
1266 define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) {
1267 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
1269 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1270 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
1271 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1272 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1273 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
1274 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1275 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
1276 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1277 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1278 ; GFX9-NEXT: s_endpgm
1280 ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
1282 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1283 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1284 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1285 ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
1286 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
1287 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1288 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1289 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1290 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1291 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1292 ; GFX10-NEXT: s_endpgm
1293 %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
1294 %load = load volatile i8, i8* %gep, align 1
1295 store i8 %load, i8* undef
1299 ; Sign bit set, low part one past the 11-bit maximum: (1ull << 63) | 2048,
; written in the IR as the i64 constant -9223372036854773760.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect the high-dword addend to be materialized with
; v_bfrev_b32 of 1, 0 to be added to the low dword, and the low 2048 placed
; in the load's immediate offset field. The gfx1010 checks expect scalar adds
; of 0x800 (low) and 0x80000000 (high) and a load with no immediate offset.
1300 define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) {
1301 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
1303 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1304 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
1305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1306 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1307 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
1308 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1309 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
1310 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1311 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1312 ; GFX9-NEXT: s_endpgm
1314 ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
1316 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1317 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1318 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1319 ; GFX10-NEXT: s_add_u32 s0, s0, 0x800
1320 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
1321 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1322 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1323 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1324 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1325 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1326 ; GFX10-NEXT: s_endpgm
1327 %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
1328 %load = load volatile i8, i8* %gep, align 1
1329 store i8 %load, i8* undef
1333 ; Sign bit set with all 12 low bits set: (1ull << 63) | 4095, written in the IR
; as the i64 constant -9223372036854771713.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect the high-dword addend to be materialized with
; v_bfrev_b32 of 1, 0 to be added to the low dword, and the low 4095 placed
; in the load's immediate offset field. The gfx1010 checks expect scalar adds
; of 0xfff (low) and 0x80000000 (high) and a load with no immediate offset.
1334 define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) {
1335 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
1337 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1338 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
1339 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1340 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1341 ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
1342 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1343 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
1344 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1345 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1346 ; GFX9-NEXT: s_endpgm
1348 ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
1350 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1351 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1352 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1353 ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
1354 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
1355 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1356 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1357 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1358 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1359 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1360 ; GFX10-NEXT: s_endpgm
1361 %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
1362 %load = load volatile i8, i8* %gep, align 1
1363 store i8 %load, i8* undef
1367 ; Sign bit set, low part one past the 12-bit maximum: (1ull << 63) | 4096,
; written in the IR as the i64 constant -9223372036854771712.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect the high-dword addend to be materialized with
; v_bfrev_b32 of 1, 0x1000 to be added to the low dword with a VALU add, and
; a load with no immediate offset. The gfx1010 checks expect scalar adds of
; 0x1000 (low) and 0x80000000 (high) and a load with no immediate offset.
1368 define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) {
1369 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
1371 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1372 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
1373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1374 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1375 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1376 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
1377 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1378 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
1379 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1380 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1381 ; GFX9-NEXT: s_endpgm
1383 ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
1385 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1386 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1387 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1388 ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
1389 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
1390 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1391 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1392 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1393 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1394 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1395 ; GFX10-NEXT: s_endpgm
1396 %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
1397 %load = load volatile i8, i8* %gep, align 1
1398 store i8 %load, i8* undef
1402 ; Sign bit set with all 13 low bits set: (1ull << 63) | 8191, written in the IR
; as the i64 constant -9223372036854767617.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect the high-dword addend to be materialized with
; v_bfrev_b32 of 1, 0x1000 to be added to the low dword with a VALU add, and
; the remaining 4095 placed in the load's immediate offset field. The gfx1010
; checks expect scalar adds of 0x1fff (low) and 0x80000000 (high) and a load
; with no immediate offset.
1403 define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) {
1404 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
1406 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1407 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
1408 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1409 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1410 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1411 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
1412 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1413 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
1414 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1415 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1416 ; GFX9-NEXT: s_endpgm
1418 ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
1420 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1421 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1422 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1423 ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
1424 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
1425 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1426 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1427 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1428 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1429 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1430 ; GFX10-NEXT: s_endpgm
1431 %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
1432 %load = load volatile i8, i8* %gep, align 1
1433 store i8 %load, i8* undef
1437 ; Sign bit set, low part one past the 13-bit maximum: (1ull << 63) | 8192,
; written in the IR as the i64 constant -9223372036854767616.
; The kernel loads a volatile i8 from %p plus that offset and stores it to an
; undef address.
; The gfx900 checks expect the high-dword addend to be materialized with
; v_bfrev_b32 of 1, 0x2000 to be added to the low dword with a VALU add, and
; a load with no immediate offset. The gfx1010 checks expect scalar adds of
; 0x2000 (low) and 0x80000000 (high) and a load with no immediate offset.
1438 define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) {
1439 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
1441 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1442 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
1443 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1444 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1445 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1446 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
1447 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1448 ; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
1449 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1450 ; GFX9-NEXT: flat_store_byte v[0:1], v0
1451 ; GFX9-NEXT: s_endpgm
1453 ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
1455 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1456 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1457 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1458 ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
1459 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
1460 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1461 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1462 ; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
1463 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1464 ; GFX10-NEXT: flat_store_byte v[0:1], v0
1465 ; GFX10-NEXT: s_endpgm
1466 %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
1467 %load = load volatile i8, i8* %gep, align 1
1468 store i8 %load, i8* undef