1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck %s
2 ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
4 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
6 ; TODO: Vector element tests
7 ; TODO: Non-zero base offset for load and store combinations
8 ; TODO: Same base addrspacecasted
; Two adjacent i8 constant stores through an addrspace(1) pointer; the store
; to the base pointer carries align 2. The expected output is one <2 x i8>
; store with align 2. The literal 456 does not fit in i8 and appears as -56
; in the expected vector.
11 define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
12 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
13 ; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 2
14 ; CHECK-NEXT: ret void
16 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
18 store i8 123, ptr addrspace(1) %out.gep.1
19 store i8 456, ptr addrspace(1) %out, align 2
; Two adjacent i8 constant stores through an addrspace(1) pointer with no
; explicit alignment on either store. The expected output is one <2 x i8>
; store with align 1.
23 define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
24 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align(
25 ; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 1
26 ; CHECK-NEXT: ret void
28 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
30 store i8 123, ptr addrspace(1) %out.gep.1
31 store i8 456, ptr addrspace(1) %out
; Two adjacent i16 constant stores through an addrspace(1) pointer; the store
; to the base pointer carries align 4. The expected output is one <2 x i16>
; store with align 4.
35 define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
36 ; CHECK-LABEL: @merge_global_store_2_constants_i16(
37 ; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 4
38 ; CHECK-NEXT: ret void
40 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
42 store i16 123, ptr addrspace(1) %out.gep.1
43 store i16 456, ptr addrspace(1) %out, align 4
; Two adjacent i16 stores of the constant 0 through an addrspace(1) pointer;
; the store to the base pointer carries align 4. The expected output is one
; <2 x i16> zeroinitializer store with align 4.
47 define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
48 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16(
49 ; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(1) [[OUT:%.*]], align 4
50 ; CHECK-NEXT: ret void
52 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
54 store i16 0, ptr addrspace(1) %out.gep.1
55 store i16 0, ptr addrspace(1) %out, align 4
; Two adjacent i16 constant stores through an addrspace(1) pointer with no
; explicit alignment on either store. The expected output is one <2 x i16>
; store with align 2.
59 define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
60 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align(
61 ; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 2
62 ; CHECK-NEXT: ret void
64 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
66 store i16 123, ptr addrspace(1) %out.gep.1
67 store i16 456, ptr addrspace(1) %out
; Two adjacent i16 constant stores through an addrspace(1) pointer, both
; explicitly under-aligned with align 1. The expected output is still one
; <2 x i16> store, with align 1.
71 define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(ptr addrspace(1) %out) #0 {
72 ; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1(
73 ; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 1
74 ; CHECK-NEXT: ret void
76 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
78 store i16 123, ptr addrspace(1) %out.gep.1, align 1
79 store i16 456, ptr addrspace(1) %out, align 1
; Two adjacent half constant stores (1.0 at element 0, 2.0 at element 1)
; through an addrspace(1) pointer with no explicit alignment. The expected
; output is one <2 x half> store with align 2.
83 define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(ptr addrspace(1) %out) #0 {
84 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align(
85 ; CHECK-NEXT: store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 2
86 ; CHECK-NEXT: ret void
88 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
90 store half 2.0, ptr addrspace(1) %out.gep.1
91 store half 1.0, ptr addrspace(1) %out
; Two adjacent half constant stores (1.0 at element 0, 2.0 at element 1)
; through an addrspace(1) pointer, both explicitly align 1. The expected
; output is one <2 x half> store with align 1.
95 define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(ptr addrspace(1) %out) #0 {
96 ; CHECK-LABEL: @merge_global_store_2_constants_half_align_1(
97 ; CHECK-NEXT: store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 1
98 ; CHECK-NEXT: ret void
100 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
102 store half 2.0, ptr addrspace(1) %out.gep.1, align 1
103 store half 1.0, ptr addrspace(1) %out, align 1
; Two adjacent i32 constant stores through an addrspace(1) pointer with no
; explicit alignment. The expected output is one <2 x i32> store with
; align 4.
107 define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
108 ; CHECK-LABEL: @merge_global_store_2_constants_i32(
109 ; CHECK-NEXT: store <2 x i32> <i32 456, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
110 ; CHECK-NEXT: ret void
112 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
114 store i32 123, ptr addrspace(1) %out.gep.1
115 store i32 456, ptr addrspace(1) %out
; Mixed element types: i32 456 is stored at element 0 and float 1.0 at
; element 1 (the GEP is typed i32). The expected output is one <2 x i32>
; store in which the float appears as its bit pattern, 1065353216.
119 define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
120 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32(
121 ; CHECK-NEXT: store <2 x i32> <i32 456, i32 1065353216>, ptr addrspace(1) [[OUT:%.*]], align 4
122 ; CHECK-NEXT: ret void
124 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
125 store float 1.0, ptr addrspace(1) %out.gep.1
126 store i32 456, ptr addrspace(1) %out
; Mixed element types: float 4.0 is stored at element 0 and i32 123 at
; element 1 (the GEP is typed float). The expected output is one <2 x i32>
; store in which the float appears as its bit pattern, 1082130432.
130 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
131 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32(
132 ; CHECK-NEXT: store <2 x i32> <i32 1082130432, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
133 ; CHECK-NEXT: ret void
135 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
136 store i32 123, ptr addrspace(1) %out.gep.1
137 store float 4.0, ptr addrspace(1) %out
; Four adjacent i32 constant stores through an addrspace(1) pointer. Elements
; 1-3 are stored first and element 0 (%out) last. The expected output is one
; <4 x i32> store, in address order, with align 4.
141 define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
142 ; CHECK-LABEL: @merge_global_store_4_constants_i32(
143 ; CHECK-NEXT: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, ptr addrspace(1) [[OUT:%.*]], align 4
144 ; CHECK-NEXT: ret void
146 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
147 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
148 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
150 store i32 123, ptr addrspace(1) %out.gep.1
151 store i32 456, ptr addrspace(1) %out.gep.2
152 store i32 333, ptr addrspace(1) %out.gep.3
153 store i32 1234, ptr addrspace(1) %out
; Four adjacent float constant stores through an addrspace(1) pointer, issued
; in increasing address order (elements 0, 1, 2, 3). The expected output is
; one <4 x float> store with align 4.
157 define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
158 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order(
159 ; CHECK-NEXT: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4
160 ; CHECK-NEXT: ret void
162 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
163 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
164 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
166 store float 8.0, ptr addrspace(1) %out
167 store float 1.0, ptr addrspace(1) %out.gep.1
168 store float 2.0, ptr addrspace(1) %out.gep.2
169 store float 4.0, ptr addrspace(1) %out.gep.3
173 ; The store to the first element (%out) is out of order: it is issued last, after the stores to elements 1-3.
; The expected output is one <4 x float> store, in address order, with
; align 4.
174 define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
175 ; CHECK-LABEL: @merge_global_store_4_constants_f32(
176 ; CHECK-NEXT: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4
177 ; CHECK-NEXT: ret void
179 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
180 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
181 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
183 store float 1.0, ptr addrspace(1) %out.gep.1
184 store float 2.0, ptr addrspace(1) %out.gep.2
185 store float 4.0, ptr addrspace(1) %out.gep.3
186 store float 8.0, ptr addrspace(1) %out
; Four adjacent 32-bit constant stores alternating between float (elements 0
; and 2) and i32 (elements 1 and 3). The expected output is one <4 x i32>
; store in which the floats appear as their bit patterns (8.0 as 1090519040,
; 2.0 as 1073741824).
190 define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
191 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32(
192 ; CHECK-NEXT: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, ptr addrspace(1) [[OUT:%.*]], align 4
193 ; CHECK-NEXT: ret void
195 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
196 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
197 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
200 store i32 11, ptr addrspace(1) %out.gep.1
201 store float 2.0, ptr addrspace(1) %out.gep.2
202 store i32 17, ptr addrspace(1) %out.gep.3
203 store float 8.0, ptr addrspace(1) %out
; Three adjacent i32 constant stores through an addrspace(1) pointer, with
; element 0 stored last. The expected output is one <3 x i32> store with
; align 4.
207 define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
208 ; CHECK-LABEL: @merge_global_store_3_constants_i32(
209 ; CHECK-NEXT: store <3 x i32> <i32 1234, i32 123, i32 456>, ptr addrspace(1) [[OUT:%.*]], align 4
210 ; CHECK-NEXT: ret void
212 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
213 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
215 store i32 123, ptr addrspace(1) %out.gep.1
216 store i32 456, ptr addrspace(1) %out.gep.2
217 store i32 1234, ptr addrspace(1) %out
; Two adjacent i64 constant stores through an addrspace(1) pointer with no
; explicit alignment. The expected output is one <2 x i64> store with
; align 8.
221 define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
222 ; CHECK-LABEL: @merge_global_store_2_constants_i64(
223 ; CHECK-NEXT: store <2 x i64> <i64 456, i64 123>, ptr addrspace(1) [[OUT:%.*]], align 8
224 ; CHECK-NEXT: ret void
226 %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
228 store i64 123, ptr addrspace(1) %out.gep.1
229 store i64 456, ptr addrspace(1) %out
; Four adjacent i64 constant stores through an addrspace(1) pointer. The
; expected output is not a single 4-wide store but two <2 x i64> stores: one
; at element 2 (elements 2-3) followed by one at %out (elements 0-1).
233 define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
234 ; CHECK-LABEL: @merge_global_store_4_constants_i64(
235 ; CHECK-NEXT: [[OUT_GEP_2:%.*]] = getelementptr i64, ptr addrspace(1) [[OUT:%.*]], i64 2
236 ; CHECK-NEXT: store <2 x i64> <i64 456, i64 333>, ptr addrspace(1) [[OUT_GEP_2]], align 8
237 ; CHECK-NEXT: store <2 x i64> <i64 1234, i64 123>, ptr addrspace(1) [[OUT]], align 8
238 ; CHECK-NEXT: ret void
240 %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
241 %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
242 %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3
244 store i64 123, ptr addrspace(1) %out.gep.1
245 store i64 456, ptr addrspace(1) %out.gep.2
246 store i64 333, ptr addrspace(1) %out.gep.3
247 store i64 1234, ptr addrspace(1) %out
; Copies two adjacent i32 values from %in to the same element positions of
; %out. The expected output is one <2 x i32> load, extractelement /
; insertelement of both lanes in the same order, and one <2 x i32> store.
251 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
252 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32(
253 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
254 ; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
255 ; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
256 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
257 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1
258 ; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
259 ; CHECK-NEXT: ret void
261 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
262 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
264 %lo = load i32, ptr addrspace(1) %in
265 %hi = load i32, ptr addrspace(1) %in.gep.1
267 store i32 %lo, ptr addrspace(1) %out
268 store i32 %hi, ptr addrspace(1) %out.gep.1
; Copies two adjacent i32 values where both the source and the destination
; start at element offset 2 rather than at the base pointer. The expected
; output keeps the two offset-2 GEPs and uses them as the addresses of one
; <2 x i32> load and one <2 x i32> store.
272 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
273 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base(
274 ; CHECK-NEXT: [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 2
275 ; CHECK-NEXT: [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 2
276 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4
277 ; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
278 ; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
279 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
280 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1
281 ; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT_GEP_0]], align 4
282 ; CHECK-NEXT: ret void
284 %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
285 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3
287 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
288 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
289 %lo = load i32, ptr addrspace(1) %in.gep.0
290 %hi = load i32, ptr addrspace(1) %in.gep.1
292 store i32 %lo, ptr addrspace(1) %out.gep.0
293 store i32 %hi, ptr addrspace(1) %out.gep.1
; Loads two adjacent i32 values and stores them swapped: %hi goes to element
; 0 and %lo to element 1. The expected output is one <2 x i32> load and one
; <2 x i32> store, with the insertelement sequence placing the lanes in
; swapped order.
297 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
298 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32(
299 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
300 ; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
301 ; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
302 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[HI2]], i32 0
303 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LO1]], i32 1
304 ; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
305 ; CHECK-NEXT: ret void
307 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
308 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
310 %lo = load i32, ptr addrspace(1) %in
311 %hi = load i32, ptr addrspace(1) %in.gep.1
313 store i32 %hi, ptr addrspace(1) %out
314 store i32 %lo, ptr addrspace(1) %out.gep.1
; Copies four adjacent i32 values from %in to the same element positions of
; %out. The expected output is one <4 x i32> load, per-lane extractelement /
; insertelement in the same order, and one <4 x i32> store.
318 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
319 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32(
320 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
321 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
322 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
323 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
324 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
325 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
326 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
327 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
328 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
329 ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
330 ; CHECK-NEXT: ret void
332 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
333 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
334 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
335 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
336 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
337 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
339 %x = load i32, ptr addrspace(1) %in
340 %y = load i32, ptr addrspace(1) %in.gep.1
341 %z = load i32, ptr addrspace(1) %in.gep.2
342 %w = load i32, ptr addrspace(1) %in.gep.3
344 store i32 %x, ptr addrspace(1) %out
345 store i32 %y, ptr addrspace(1) %out.gep.1
346 store i32 %z, ptr addrspace(1) %out.gep.2
347 store i32 %w, ptr addrspace(1) %out.gep.3
; Copies three adjacent i32 values from %in to the same element positions of
; %out. The expected output is one <3 x i32> load and one <3 x i32> store,
; both with align 4.
351 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
352 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32(
353 ; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
354 ; CHECK-NEXT: [[X1:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
355 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 1
356 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <3 x i32> [[TMP1]], i32 2
357 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[X1]], i32 0
358 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[Y2]], i32 1
359 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[Z3]], i32 2
360 ; CHECK-NEXT: store <3 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
361 ; CHECK-NEXT: ret void
363 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
364 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
365 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
366 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
368 %x = load i32, ptr addrspace(1) %in
369 %y = load i32, ptr addrspace(1) %in.gep.1
370 %z = load i32, ptr addrspace(1) %in.gep.2
372 store i32 %x, ptr addrspace(1) %out
373 store i32 %y, ptr addrspace(1) %out.gep.1
374 store i32 %z, ptr addrspace(1) %out.gep.2
; Copies four adjacent float values from %in to the same element positions
; of %out. The expected output is one <4 x float> load and one <4 x float>
; store, both with align 4.
378 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
379 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32(
380 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[IN:%.*]], align 4
381 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
382 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
383 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
384 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
385 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[X1]], i32 0
386 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[Y2]], i32 1
387 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Z3]], i32 2
388 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[W4]], i32 3
389 ; CHECK-NEXT: store <4 x float> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
390 ; CHECK-NEXT: ret void
392 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
393 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
394 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
395 %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
396 %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
397 %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3
399 %x = load float, ptr addrspace(1) %in
400 %y = load float, ptr addrspace(1) %in.gep.1
401 %z = load float, ptr addrspace(1) %in.gep.2
402 %w = load float, ptr addrspace(1) %in.gep.3
404 store float %x, ptr addrspace(1) %out
405 store float %y, ptr addrspace(1) %out.gep.1
406 store float %z, ptr addrspace(1) %out.gep.2
407 store float %w, ptr addrspace(1) %out.gep.3
; Copies four adjacent i32 values with different non-zero starting offsets:
; the source starts at element 11 of %in and the destination at element 7 of
; %out. The expected output keeps those two GEPs and uses them as the
; addresses of one <4 x i32> load and one <4 x i32> store.
411 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
412 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base(
413 ; CHECK-NEXT: [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 11
414 ; CHECK-NEXT: [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 7
415 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4
416 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
417 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
418 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
419 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
420 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
421 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
422 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
423 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
424 ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT_GEP_0]], align 4
425 ; CHECK-NEXT: ret void
427 %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
428 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
429 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
430 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
431 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
432 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
433 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
434 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10
436 %x = load i32, ptr addrspace(1) %in.gep.0
437 %y = load i32, ptr addrspace(1) %in.gep.1
438 %z = load i32, ptr addrspace(1) %in.gep.2
439 %w = load i32, ptr addrspace(1) %in.gep.3
441 store i32 %x, ptr addrspace(1) %out.gep.0
442 store i32 %y, ptr addrspace(1) %out.gep.1
443 store i32 %z, ptr addrspace(1) %out.gep.2
444 store i32 %w, ptr addrspace(1) %out.gep.3
; Loads four adjacent i32 values, executes an s.barrier, then stores each
; value to its matching element of %out in decreasing address order (3, 2,
; 1, 0). The expected output has one <4 x i32> load before the barrier and
; one <4 x i32> store after it, with the lanes in their original order.
448 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
449 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32(
450 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
451 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
452 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
453 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
454 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
455 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() #[[ATTR3:[0-9]+]]
456 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
457 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
458 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
459 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
460 ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
461 ; CHECK-NEXT: ret void
463 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
464 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
465 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
466 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
467 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
468 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
470 %x = load i32, ptr addrspace(1) %in
471 %y = load i32, ptr addrspace(1) %in.gep.1
472 %z = load i32, ptr addrspace(1) %in.gep.2
473 %w = load i32, ptr addrspace(1) %in.gep.3
475 ; Make sure the barrier between the loads and the stores doesn't stop either group from being vectorized
476 tail call void @llvm.amdgcn.s.barrier() #1
478 store i32 %w, ptr addrspace(1) %out.gep.3
479 store i32 %z, ptr addrspace(1) %out.gep.2
480 store i32 %y, ptr addrspace(1) %out.gep.1
481 store i32 %x, ptr addrspace(1) %out
; Loads four adjacent i32 values, executes an s.barrier, then stores them in
; reversed lane order (%w to element 0, %z to 1, %y to 2, %x to 3). The
; expected output has one <4 x i32> load before the barrier and one
; <4 x i32> store after it, with the insertelement sequence reversing the
; lanes.
486 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
487 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32(
488 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
489 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
490 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
491 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
492 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
493 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() #[[ATTR3]]
494 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[W4]], i32 0
495 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z3]], i32 1
496 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 2
497 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X1]], i32 3
498 ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
499 ; CHECK-NEXT: ret void
501 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
502 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
503 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
504 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
505 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
506 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
508 %x = load i32, ptr addrspace(1) %in
509 %y = load i32, ptr addrspace(1) %in.gep.1
510 %z = load i32, ptr addrspace(1) %in.gep.2
511 %w = load i32, ptr addrspace(1) %in.gep.3
513 ; Make sure the barrier between the loads and the stores doesn't stop either group from being vectorized
514 tail call void @llvm.amdgcn.s.barrier() #1
516 store i32 %w, ptr addrspace(1) %out
517 store i32 %z, ptr addrspace(1) %out.gep.1
518 store i32 %y, ptr addrspace(1) %out.gep.2
519 store i32 %x, ptr addrspace(1) %out.gep.3
; Copies four adjacent i8 values; only the first load and the first store
; carry an explicit align 4. The expected output is one <4 x i8> load and
; one <4 x i8> store, both with align 4.
524 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
525 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8(
526 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 4
527 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
528 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
529 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
530 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
531 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
532 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1
533 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2
534 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3
535 ; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
536 ; CHECK-NEXT: ret void
538 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
539 %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
540 %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
541 %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
542 %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
543 %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
545 %x = load i8, ptr addrspace(1) %in, align 4
546 %y = load i8, ptr addrspace(1) %in.gep.1
547 %z = load i8, ptr addrspace(1) %in.gep.2
548 %w = load i8, ptr addrspace(1) %in.gep.3
550 store i8 %x, ptr addrspace(1) %out, align 4
551 store i8 %y, ptr addrspace(1) %out.gep.1
552 store i8 %z, ptr addrspace(1) %out.gep.2
553 store i8 %w, ptr addrspace(1) %out.gep.3
; Copies four adjacent i8 values with no explicit alignment on any access.
; The expected output is one <4 x i8> load and one <4 x i8> store, both with
; align 1.
557 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
558 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align(
559 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 1
560 ; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
561 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
562 ; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
563 ; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
564 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
565 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1
566 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2
567 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3
568 ; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 1
569 ; CHECK-NEXT: ret void
571 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
572 %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
573 %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
574 %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
575 %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
576 %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
578 %x = load i8, ptr addrspace(1) %in
579 %y = load i8, ptr addrspace(1) %in.gep.1
580 %z = load i8, ptr addrspace(1) %in.gep.2
581 %w = load i8, ptr addrspace(1) %in.gep.3
583 store i8 %x, ptr addrspace(1) %out
584 store i8 %y, ptr addrspace(1) %out.gep.1
585 store i8 %z, ptr addrspace(1) %out.gep.2
586 store i8 %w, ptr addrspace(1) %out.gep.3
; The input already loads a whole <4 x i32>; its four extracted elements are
; written back with four scalar i32 stores to adjacent elements of %out. The
; expected output keeps the vector load (shown with align 16) and the
; extractelements, and merges the scalar stores into one <4 x i32> store
; with align 4.
590 define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
591 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32(
592 ; CHECK-NEXT: [[VEC:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 16
593 ; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0
594 ; CHECK-NEXT: [[Y:%.*]] = extractelement <4 x i32> [[VEC]], i32 1
595 ; CHECK-NEXT: [[Z:%.*]] = extractelement <4 x i32> [[VEC]], i32 2
596 ; CHECK-NEXT: [[W:%.*]] = extractelement <4 x i32> [[VEC]], i32 3
597 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
598 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Y]], i32 1
599 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z]], i32 2
600 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[W]], i32 3
601 ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
602 ; CHECK-NEXT: ret void
604 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
605 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
606 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
607 %vec = load <4 x i32>, ptr addrspace(1) %in
609 %x = extractelement <4 x i32> %vec, i32 0
610 %y = extractelement <4 x i32> %vec, i32 1
611 %z = extractelement <4 x i32> %vec, i32 2
612 %w = extractelement <4 x i32> %vec, i32 3
614 store i32 %x, ptr addrspace(1) %out
615 store i32 %y, ptr addrspace(1) %out.gep.1
616 store i32 %z, ptr addrspace(1) %out.gep.2
617 store i32 %w, ptr addrspace(1) %out.gep.3
; Two adjacent i8 constant stores through an addrspace(3) pointer; the store
; to the base pointer carries align 2. The expected output is one <2 x i8>
; store with align 2. The literal 456 does not fit in i8 and appears as -56
; in the expected vector.
621 define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
622 ; CHECK-LABEL: @merge_local_store_2_constants_i8(
623 ; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, ptr addrspace(3) [[OUT:%.*]], align 2
624 ; CHECK-NEXT: ret void
626 %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1
628 store i8 123, ptr addrspace(3) %out.gep.1
629 store i8 456, ptr addrspace(3) %out, align 2
; Two adjacent i32 constant stores through an addrspace(3) pointer with no
; explicit alignment. The expected output is one <2 x i32> store with
; align 4.
633 define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
634 ; CHECK-LABEL: @merge_local_store_2_constants_i32(
635 ; CHECK-NEXT: store <2 x i32> <i32 456, i32 123>, ptr addrspace(3) [[OUT:%.*]], align 4
636 ; CHECK-NEXT: ret void
638 %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
640 store i32 123, ptr addrspace(3) %out.gep.1
641 store i32 456, ptr addrspace(3) %out
; Negative test: two adjacent i32 constant stores through an addrspace(3)
; pointer, both explicitly align 2. The expected output leaves the GEP and
; both scalar stores unchanged, i.e. nothing is merged.
645 define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(ptr addrspace(3) %out) #0 {
646 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2(
647 ; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 1
648 ; CHECK-NEXT: store i32 123, ptr addrspace(3) [[OUT_GEP_1]], align 2
649 ; CHECK-NEXT: store i32 456, ptr addrspace(3) [[OUT]], align 2
650 ; CHECK-NEXT: ret void
652 %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
654 store i32 123, ptr addrspace(3) %out.gep.1, align 2
655 store i32 456, ptr addrspace(3) %out, align 2
; Four adjacent i32 constant stores through an addrspace(3) pointer. The
; expected output is not a single 4-wide store but two <2 x i32> stores: one
; at element 2 (elements 2-3) followed by one at %out (elements 0-1).
659 define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
660 ; CHECK-LABEL: @merge_local_store_4_constants_i32(
661 ; CHECK-NEXT: [[OUT_GEP_2:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 2
662 ; CHECK-NEXT: store <2 x i32> <i32 456, i32 333>, ptr addrspace(3) [[OUT_GEP_2]], align 4
663 ; CHECK-NEXT: store <2 x i32> <i32 1234, i32 123>, ptr addrspace(3) [[OUT]], align 4
664 ; CHECK-NEXT: ret void
666 %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
667 %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
668 %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3
670 store i32 123, ptr addrspace(3) %out.gep.1
671 store i32 456, ptr addrspace(3) %out.gep.2
672 store i32 333, ptr addrspace(3) %out.gep.3
673 store i32 1234, ptr addrspace(3) %out
; Five adjacent i32 constant stores (all align 4) through an addrspace(1)
; pointer, issued in address order. The expected output is one <4 x i32>
; store for elements 0-3, with the store to element 4 left as a scalar i32
; store.
677 define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
678 ; CHECK-LABEL: @merge_global_store_5_constants_i32(
679 ; CHECK-NEXT: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, ptr addrspace(1) [[OUT:%.*]], align 4
680 ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
681 ; CHECK-NEXT: store i32 11, ptr addrspace(1) [[IDX4]], align 4
682 ; CHECK-NEXT: ret void
684 store i32 9, ptr addrspace(1) %out, align 4
685 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
686 store i32 12, ptr addrspace(1) %idx1, align 4
687 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
688 store i32 16, ptr addrspace(1) %idx2, align 4
689 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
690 store i32 -12, ptr addrspace(1) %idx3, align 4
691 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
692 store i32 11, ptr addrspace(1) %idx4, align 4
; Six i32 constant stores to consecutive elements 0..5 of %out in
; addrspace(1), all `align 4` and in ascending order. The expected output
; is a <4 x i32> store for elements 0-3 followed by a <2 x i32> store for
; elements 4-5, the latter addressed through the GEP to index 4.
696 define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
697 ; CHECK-LABEL: @merge_global_store_6_constants_i32(
698 ; CHECK-NEXT: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, ptr addrspace(1) [[OUT:%.*]], align 4
699 ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
700 ; CHECK-NEXT: store <2 x i32> <i32 11, i32 123>, ptr addrspace(1) [[IDX4]], align 4
701 ; CHECK-NEXT: ret void
703 store i32 13, ptr addrspace(1) %out, align 4
704 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
705 store i32 15, ptr addrspace(1) %idx1, align 4
706 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
707 store i32 62, ptr addrspace(1) %idx2, align 4
708 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
709 store i32 63, ptr addrspace(1) %idx3, align 4
710 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
711 store i32 11, ptr addrspace(1) %idx4, align 4
712 %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
713 store i32 123, ptr addrspace(1) %idx5, align 4
; Seven i32 constant stores to consecutive elements 0..6 of %out in
; addrspace(1), all `align 4` and in ascending order. The expected output
; is a <4 x i32> store for elements 0-3 followed by a <3 x i32> store for
; elements 4-6, the latter addressed through the GEP to index 4.
717 define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
718 ; CHECK-LABEL: @merge_global_store_7_constants_i32(
719 ; CHECK-NEXT: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4
720 ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
721 ; CHECK-NEXT: store <3 x i32> <i32 98, i32 91, i32 212>, ptr addrspace(1) [[IDX4]], align 4
722 ; CHECK-NEXT: ret void
724 store i32 34, ptr addrspace(1) %out, align 4
725 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
726 store i32 999, ptr addrspace(1) %idx1, align 4
727 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
728 store i32 65, ptr addrspace(1) %idx2, align 4
729 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
730 store i32 33, ptr addrspace(1) %idx3, align 4
731 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
732 store i32 98, ptr addrspace(1) %idx4, align 4
733 %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
734 store i32 91, ptr addrspace(1) %idx5, align 4
735 %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
736 store i32 212, ptr addrspace(1) %idx6, align 4
; Eight i32 constant stores to consecutive elements 0..7 of %out in
; addrspace(1), all `align 4` and in ascending order. The expected output
; is two <4 x i32> stores: elements 0-3 through %out and elements 4-7
; through the GEP to index 4. No single 8-element store is expected.
740 define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
741 ; CHECK-LABEL: @merge_global_store_8_constants_i32(
742 ; CHECK-NEXT: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4
743 ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
744 ; CHECK-NEXT: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, ptr addrspace(1) [[IDX4]], align 4
745 ; CHECK-NEXT: ret void
747 store i32 34, ptr addrspace(1) %out, align 4
748 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
749 store i32 999, ptr addrspace(1) %idx1, align 4
750 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
751 store i32 65, ptr addrspace(1) %idx2, align 4
752 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
753 store i32 33, ptr addrspace(1) %idx3, align 4
754 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
755 store i32 98, ptr addrspace(1) %idx4, align 4
756 %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
757 store i32 91, ptr addrspace(1) %idx5, align 4
758 %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
759 store i32 212, ptr addrspace(1) %idx6, align 4
760 %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
761 store i32 999, ptr addrspace(1) %idx7, align 4
; Copies one <3 x i32> from %in to %out (both noalias, addrspace(1)). The
; load is `align 4`; the store has no explicit alignment. The expected
; output leaves the load/store pair as-is, with the store printed as
; `align 16`, which matches the `v96:128` entry in this file's datalayout.
765 define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
766 ; CHECK-LABEL: @copy_v3i32_align4(
767 ; CHECK-NEXT: [[VEC:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
768 ; CHECK-NEXT: store <3 x i32> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 16
769 ; CHECK-NEXT: ret void
771 %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
772 store <3 x i32> %vec, ptr addrspace(1) %out
; Copies one <3 x i64> from %in to %out (both noalias, addrspace(1)). The
; load is `align 4`; the store has no explicit alignment. The expected
; output leaves the load/store pair as-is, with the store printed as
; `align 32`, which matches the `v192:256` entry in this file's datalayout.
776 define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
777 ; CHECK-LABEL: @copy_v3i64_align4(
778 ; CHECK-NEXT: [[VEC:%.*]] = load <3 x i64>, ptr addrspace(1) [[IN:%.*]], align 4
779 ; CHECK-NEXT: store <3 x i64> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 32
780 ; CHECK-NEXT: ret void
782 %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
783 store <3 x i64> %vec, ptr addrspace(1) %out
; Loads a <3 x float> from %in (`align 4`), adds <1.0, 2.0, 4.0>
; element-wise, and stores the sum to %out with no explicit alignment.
; The expected output keeps the load, fadd and store unchanged, with the
; store printed as `align 16` (matching `v96:128` in the datalayout).
787 define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
788 ; CHECK-LABEL: @copy_v3f32_align4(
789 ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr addrspace(1) [[IN:%.*]], align 4
790 ; CHECK-NEXT: [[FADD:%.*]] = fadd <3 x float> [[VEC]], <float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
791 ; CHECK-NEXT: store <3 x float> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 16
792 ; CHECK-NEXT: ret void
794 %vec = load <3 x float>, ptr addrspace(1) %in, align 4
795 %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
796 store <3 x float> %fadd, ptr addrspace(1) %out
; Loads a <3 x double> from %in (`align 4`), adds <1.0, 2.0, 4.0>
; element-wise, and stores the sum to %out with no explicit alignment.
; The expected output keeps the load, fadd and store unchanged, with the
; store printed as `align 32` (matching `v192:256` in the datalayout).
800 define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
801 ; CHECK-LABEL: @copy_v3f64_align4(
802 ; CHECK-NEXT: [[VEC:%.*]] = load <3 x double>, ptr addrspace(1) [[IN:%.*]], align 4
803 ; CHECK-NEXT: [[FADD:%.*]] = fadd <3 x double> [[VEC]], <double 1.000000e+00, double 2.000000e+00, double 4.000000e+00>
804 ; CHECK-NEXT: store <3 x double> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 32
805 ; CHECK-NEXT: ret void
807 %vec = load <3 x double>, ptr addrspace(1) %in, align 4
808 %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
809 store <3 x double> %fadd, ptr addrspace(1) %out
813 ; Verify that we no longer hit asserts for this test case. No change expected.
; Copies two neighbouring <2 x ptr> values (indices 1 and 0) from %in to
; %out. In each pair the index-1 access has no explicit alignment and the
; index-0 access is `align 4`. The expected output keeps all four memory
; operations separate and in their original order; the accesses without an
; explicit alignment are printed as `align 16`.
; Note: the function signature is split across two lines, with the
; autogenerated check lines sitting between them.
814 define amdgpu_kernel void @copy_vec_of_ptrs(ptr addrspace(1) %out,
815 ; CHECK-LABEL: @copy_vec_of_ptrs(
816 ; CHECK-NEXT: [[IN_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[IN:%.*]], i32 1
817 ; CHECK-NEXT: [[VEC1:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN_GEP_1]], align 16
818 ; CHECK-NEXT: [[VEC2:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN]], align 4
819 ; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[OUT:%.*]], i32 1
820 ; CHECK-NEXT: store <2 x ptr> [[VEC1]], ptr addrspace(1) [[OUT_GEP_1]], align 16
821 ; CHECK-NEXT: store <2 x ptr> [[VEC2]], ptr addrspace(1) [[OUT]], align 4
822 ; CHECK-NEXT: ret void
824 ptr addrspace(1) %in ) #0 {
825 %in.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %in, i32 1
826 %vec1 = load <2 x ptr>, ptr addrspace(1) %in.gep.1
827 %vec2 = load <2 x ptr>, ptr addrspace(1) %in, align 4
829 %out.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %out, i32 1
830 store <2 x ptr> %vec1, ptr addrspace(1) %out.gep.1
831 store <2 x ptr> %vec2, ptr addrspace(1) %out, align 4
; External declaration of the barrier intrinsic, using attribute group #1.
835 declare void @llvm.amdgcn.s.barrier() #1
; Attribute groups referenced by `#0` / `#1` on the definitions and
; declaration in this file.
837 attributes #0 = { nounwind }
838 attributes #1 = { convergent nounwind }