; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted

; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  store i8 456, ptr addrspace(1) %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  store i8 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 0, ptr addrspace(1) %out.gep.1
  store i16 0, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  store float 1.0, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  store i32 123, ptr addrspace(1) %out.gep.1
  store float 4.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 333, ptr addrspace(1) %out.gep.3
  store i32 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 8.0, ptr addrspace(1) %out
  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store i32 11, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store i32 17, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN-NOT: buffer_store_dword
define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
  %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
  %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out.gep.2
  store i64 333, ptr addrspace(1) %out.gep.3
  store i64 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %lo = load i32, ptr addrspace(1) %in.gep.0
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out.gep.0
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %hi, ptr addrspace(1) %out
  store i32 %lo, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword
; CI-DAG: buffer_load_dwordx3

; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3

  %x = load float, ptr addrspace(1) %in
  %y = load float, ptr addrspace(1) %in.gep.1
  %z = load float, ptr addrspace(1) %in.gep.2
  %w = load float, ptr addrspace(1) %in.gep.3

  store float %x, ptr addrspace(1) %out
  store float %y, ptr addrspace(1) %out.gep.1
  store float %z, ptr addrspace(1) %out.gep.2
  store float %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10

  %x = load i32, ptr addrspace(1) %in.gep.0
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out.gep.0
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]

; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out.gep.3
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %x, ptr addrspace(1) %out
  ret void
}

; TODO: Re-packing of the loaded register is required; maybe an IR pass should handle this.

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v

; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out
  store i32 %z, ptr addrspace(1) %out.gep.1
  store i32 %y, ptr addrspace(1) %out.gep.2
  store i32 %x, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in, align 4
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out, align 4
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %vec = load <4 x i32>, ptr addrspace(1) %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1

  store i8 123, ptr addrspace(3) %out.gep.1
  store i8 456, ptr addrspace(3) %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out.gep.2
  store i32 333, ptr addrspace(3) %out.gep.3
  store i32 1234, ptr addrspace(3) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
  store i32 9, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 12, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 16, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 -12, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 11, ptr addrspace(1) %idx4, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
  store i32 13, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 15, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 62, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 63, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 11, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 123, ptr addrspace(1) %idx5, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx2
; CI: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
  store i32 34, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 999, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 65, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 33, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 98, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 91, ptr addrspace(1) %idx5, align 4
  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
  store i32 212, ptr addrspace(1) %idx6, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
  store i32 34, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 999, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 65, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 33, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 98, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 91, ptr addrspace(1) %idx5, align 4
  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
  store i32 212, ptr addrspace(1) %idx6, align 4
  %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
  store i32 999, ptr addrspace(1) %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: s_waitcnt vmcnt

; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
  store <3 x i32> %vec, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}

; GCN: s_waitcnt vmcnt

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
  store <3 x i64> %vec, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: s_waitcnt vmcnt

; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x float>, ptr addrspace(1) %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}

; GCN: s_waitcnt vmcnt

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x double>, ptr addrspace(1) %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, ptr addrspace(1) %out
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }