1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; Eight float loads from the addrspace(3) pointer %arg at element indices
; 0, 100, ..., 700 (a 400-byte stride), accumulated with fadd and stored to
; %arg1. The checks expect the loads to be paired into four ds_read2_b32
; instructions with offset1:100: one addressed by the register copied from the
; kernel argument and three addressed by registers produced by v_add_u32_e32.
; For gfx900 the added constants are checked literally (0x320, 0x640, 0x960,
; i.e. 800, 1600 and 2400 bytes); for tonga only an SGPR operand is required.
4 ; GCN-LABEL: ds_read32_combine_stride_400:
5 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
6 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
8 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
9 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
10 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
12 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
13 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
14 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
16 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
17 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
18 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
19 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
20 define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
22 %tmp = load float, float addrspace(3)* %arg, align 4
23 %tmp2 = fadd float %tmp, 0.000000e+00
24 %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
25 %tmp4 = load float, float addrspace(3)* %tmp3, align 4
26 %tmp5 = fadd float %tmp2, %tmp4
27 %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
28 %tmp7 = load float, float addrspace(3)* %tmp6, align 4
29 %tmp8 = fadd float %tmp5, %tmp7
30 %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
31 %tmp10 = load float, float addrspace(3)* %tmp9, align 4
32 %tmp11 = fadd float %tmp8, %tmp10
33 %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
34 %tmp13 = load float, float addrspace(3)* %tmp12, align 4
35 %tmp14 = fadd float %tmp11, %tmp13
36 %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
37 %tmp16 = load float, float addrspace(3)* %tmp15, align 4
38 %tmp17 = fadd float %tmp14, %tmp16
39 %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
40 %tmp19 = load float, float addrspace(3)* %tmp18, align 4
41 %tmp20 = fadd float %tmp17, %tmp19
42 %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
43 %tmp22 = load float, float addrspace(3)* %tmp21, align 4
44 %tmp23 = fadd float %tmp20, %tmp22
45 store float %tmp23, float *%arg1, align 4
; Same eight float loads as the forward stride-400 read test, but issued in
; descending index order (700 down to 0). The expected output is unchanged:
; four ds_read2_b32 with offset1:100, one on the argument base and three on
; bases formed by v_add_u32_e32 (0x320, 0x640, 0x960 checked on gfx900).
49 ; GCN-LABEL: ds_read32_combine_stride_400_back:
50 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
51 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
53 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
54 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
55 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
57 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
58 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
59 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
61 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
62 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
63 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
64 ; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
65 define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
67 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
68 %tmp2 = load float, float addrspace(3)* %tmp, align 4
69 %tmp3 = fadd float %tmp2, 0.000000e+00
70 %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
71 %tmp5 = load float, float addrspace(3)* %tmp4, align 4
72 %tmp6 = fadd float %tmp3, %tmp5
73 %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
74 %tmp8 = load float, float addrspace(3)* %tmp7, align 4
75 %tmp9 = fadd float %tmp6, %tmp8
76 %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
77 %tmp11 = load float, float addrspace(3)* %tmp10, align 4
78 %tmp12 = fadd float %tmp9, %tmp11
79 %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
80 %tmp14 = load float, float addrspace(3)* %tmp13, align 4
81 %tmp15 = fadd float %tmp12, %tmp14
82 %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
83 %tmp17 = load float, float addrspace(3)* %tmp16, align 4
84 %tmp18 = fadd float %tmp15, %tmp17
85 %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
86 %tmp20 = load float, float addrspace(3)* %tmp19, align 4
87 %tmp21 = fadd float %tmp18, %tmp20
88 %tmp22 = load float, float addrspace(3)* %arg, align 4
89 %tmp23 = fadd float %tmp21, %tmp22
90 store float %tmp23, float *%arg1, align 4
; Eight float loads from the addrspace(3) pointer %arg at element indices
; 0, 2048, ..., 14336 (an 8192-byte stride), accumulated with fadd and stored
; to %arg1. The checks expect four ds_read2st64_b32 instructions that all use
; the register copied from the kernel argument as their address, with offset
; pairs 0/32, 64/96, 128/160 and 192/224.
94 ; GCN-LABEL: ds_read32_combine_stride_8192:
95 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
96 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
97 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32
98 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
99 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
100 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
101 define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
103 %tmp = load float, float addrspace(3)* %arg, align 4
104 %tmp2 = fadd float %tmp, 0.000000e+00
105 %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
106 %tmp4 = load float, float addrspace(3)* %tmp3, align 4
107 %tmp5 = fadd float %tmp2, %tmp4
108 %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
109 %tmp7 = load float, float addrspace(3)* %tmp6, align 4
110 %tmp8 = fadd float %tmp5, %tmp7
111 %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
112 %tmp10 = load float, float addrspace(3)* %tmp9, align 4
113 %tmp11 = fadd float %tmp8, %tmp10
114 %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
115 %tmp13 = load float, float addrspace(3)* %tmp12, align 4
116 %tmp14 = fadd float %tmp11, %tmp13
117 %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
118 %tmp16 = load float, float addrspace(3)* %tmp15, align 4
119 %tmp17 = fadd float %tmp14, %tmp16
120 %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
121 %tmp19 = load float, float addrspace(3)* %tmp18, align 4
122 %tmp20 = fadd float %tmp17, %tmp19
123 %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
124 %tmp22 = load float, float addrspace(3)* %tmp21, align 4
125 %tmp23 = fadd float %tmp20, %tmp22
126 store float %tmp23, float *%arg1, align 4
; Six float loads from the addrspace(3) pointer %arg at element indices
; 2, 2050, ..., 10242: the 8192-byte stride pattern shifted by two elements
; (8 bytes). The checks expect three ds_read2st64_b32 with offset1:32, each on
; its own base produced by v_add_u32_e32. The first base is the argument plus
; 8 on both targets; on gfx900 the other two add 0x4008 and 0x8008, while on
; tonga only an SGPR operand is required for them.
130 ; GCN-LABEL: ds_read32_combine_stride_8192_shifted:
131 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
132 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
134 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
135 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
136 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
138 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
139 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
140 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
142 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
143 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
144 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
145 define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
147 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
148 %tmp2 = load float, float addrspace(3)* %tmp, align 4
149 %tmp3 = fadd float %tmp2, 0.000000e+00
150 %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
151 %tmp5 = load float, float addrspace(3)* %tmp4, align 4
152 %tmp6 = fadd float %tmp3, %tmp5
153 %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
154 %tmp8 = load float, float addrspace(3)* %tmp7, align 4
155 %tmp9 = fadd float %tmp6, %tmp8
156 %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
157 %tmp11 = load float, float addrspace(3)* %tmp10, align 4
158 %tmp12 = fadd float %tmp9, %tmp11
159 %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
160 %tmp14 = load float, float addrspace(3)* %tmp13, align 4
161 %tmp15 = fadd float %tmp12, %tmp14
162 %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
163 %tmp17 = load float, float addrspace(3)* %tmp16, align 4
164 %tmp18 = fadd float %tmp15, %tmp17
165 store float %tmp18, float *%arg1, align 4
; Eight double loads from the addrspace(3) pointer %arg at element indices
; 0, 50, ..., 350 (a 400-byte stride), accumulated with fadd and stored to
; %arg1. The checks expect four ds_read2_b64 instructions: three on the
; register copied from the kernel argument with offset pairs 0/50, 100/150 and
; 200/250, and a fourth with offset1:50 on a second base produced by
; v_add_u32_e32 (argument plus 0x960 = 2400 bytes on gfx900, an SGPR operand
; on tonga).
169 ; GCN-LABEL: ds_read64_combine_stride_400:
170 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
171 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
173 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
174 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
176 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
177 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
178 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
179 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
180 define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
182 %tmp = load double, double addrspace(3)* %arg, align 8
183 %tmp2 = fadd double %tmp, 0.000000e+00
184 %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
185 %tmp4 = load double, double addrspace(3)* %tmp3, align 8
186 %tmp5 = fadd double %tmp2, %tmp4
187 %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
188 %tmp7 = load double, double addrspace(3)* %tmp6, align 8
189 %tmp8 = fadd double %tmp5, %tmp7
190 %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
191 %tmp10 = load double, double addrspace(3)* %tmp9, align 8
192 %tmp11 = fadd double %tmp8, %tmp10
193 %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
194 %tmp13 = load double, double addrspace(3)* %tmp12, align 8
195 %tmp14 = fadd double %tmp11, %tmp13
196 %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
197 %tmp16 = load double, double addrspace(3)* %tmp15, align 8
198 %tmp17 = fadd double %tmp14, %tmp16
199 %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
200 %tmp19 = load double, double addrspace(3)* %tmp18, align 8
201 %tmp20 = fadd double %tmp17, %tmp19
202 %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
203 %tmp22 = load double, double addrspace(3)* %tmp21, align 8
204 %tmp23 = fadd double %tmp20, %tmp22
205 store double %tmp23, double *%arg1, align 8
; Six double loads from the addrspace(3) pointer %arg at element indices
; 1, 1025, ..., 5121: an 8192-byte stride shifted by one element (8 bytes).
; The checks expect three ds_read2st64_b64 with offset1:16, each on its own
; base produced by v_add_u32_e32. The first base is the argument plus 8 on
; both targets; on gfx900 the other two add 0x4008 and 0x8008, while on tonga
; only an SGPR operand is required for them.
209 ; GCN-LABEL: ds_read64_combine_stride_8192_shifted:
210 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
211 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
213 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
214 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
215 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
217 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
218 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
219 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
221 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
222 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
223 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
224 define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
226 %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
227 %tmp2 = load double, double addrspace(3)* %tmp, align 8
228 %tmp3 = fadd double %tmp2, 0.000000e+00
229 %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
230 %tmp5 = load double, double addrspace(3)* %tmp4, align 8
231 %tmp6 = fadd double %tmp3, %tmp5
232 %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
233 %tmp8 = load double, double addrspace(3)* %tmp7, align 8
234 %tmp9 = fadd double %tmp6, %tmp8
235 %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
236 %tmp11 = load double, double addrspace(3)* %tmp10, align 8
237 %tmp12 = fadd double %tmp9, %tmp11
238 %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
239 %tmp14 = load double, double addrspace(3)* %tmp13, align 8
240 %tmp15 = fadd double %tmp12, %tmp14
241 %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
242 %tmp17 = load double, double addrspace(3)* %tmp16, align 8
243 %tmp18 = fadd double %tmp15, %tmp17
244 store double %tmp18, double *%arg1, align 8
; Eight stores of float 1.0 through the addrspace(3) pointer %arg at element
; indices 0, 100, ..., 700 (a 400-byte stride). The checks expect the stores
; to be paired into four ds_write2_b32 instructions with offset1:100: one
; addressed by the register copied from the kernel argument and three by
; registers produced by v_add_u32_e32. For gfx900 the added constants are
; checked literally (0x320, 0x640, 0x960); for tonga only an SGPR operand is
; required.
248 ; GCN-LABEL: ds_write32_combine_stride_400:
249 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
250 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
252 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
253 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
254 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
256 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
257 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
258 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
260 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
261 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
262 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
263 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
264 define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
266 store float 1.000000e+00, float addrspace(3)* %arg, align 4
267 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
268 store float 1.000000e+00, float addrspace(3)* %tmp, align 4
269 %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
270 store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
271 %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
272 store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
273 %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
274 store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
275 %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
276 store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
277 %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
278 store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
279 %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
280 store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
; Same eight float stores as the forward stride-400 write test, but issued in
; descending index order (700 down to 0). The expected output is unchanged:
; four ds_write2_b32 with offset1:100, one on the argument base and three on
; bases formed by v_add_u32_e32 (0x320, 0x640, 0x960 checked on gfx900).
284 ; GCN-LABEL: ds_write32_combine_stride_400_back:
285 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
286 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
288 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
289 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
290 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
292 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
293 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
294 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
296 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
297 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
298 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
299 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
300 define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
302 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
303 store float 1.000000e+00, float addrspace(3)* %tmp, align 4
304 %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
305 store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
306 %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
307 store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
308 %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
309 store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
310 %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
311 store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
312 %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
313 store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
314 %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
315 store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
316 store float 1.000000e+00, float addrspace(3)* %arg, align 4
; Eight stores of float 1.0 through the addrspace(3) pointer %arg at element
; indices 0, 2048, ..., 14336 (an 8192-byte stride). The checks expect four
; ds_write2st64_b32 instructions that all use the register copied from the
; kernel argument as their address, with offset pairs 0/32, 64/96, 128/160
; and 192/224.
320 ; GCN-LABEL: ds_write32_combine_stride_8192:
321 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
322 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
323 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
324 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
325 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
326 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
327 define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
329 store float 1.000000e+00, float addrspace(3)* %arg, align 4
330 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
331 store float 1.000000e+00, float addrspace(3)* %tmp, align 4
332 %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
333 store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
334 %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
335 store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
336 %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
337 store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
338 %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
339 store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
340 %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
341 store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
342 %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
343 store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
; Six stores of float 1.0 through the addrspace(3) pointer %arg at element
; indices 1, 2049, ..., 10241: the 8192-byte stride pattern shifted by one
; element (4 bytes). The checks expect three ds_write2st64_b32 with
; offset1:32, each on its own base produced by v_add_u32_e32. The first base
; is the argument plus 4 on both targets; on gfx900 the other two add 0x4004
; and 0x8004, while on tonga only an SGPR operand is required for them.
347 ; GCN-LABEL: ds_write32_combine_stride_8192_shifted:
348 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
349 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
351 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
352 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
353 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
355 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
356 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
357 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]
359 ; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
360 ; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
361 ; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
362 define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
364 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
365 store float 1.000000e+00, float addrspace(3)* %tmp, align 4
366 %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
367 store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
368 %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
369 store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
370 %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
371 store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
372 %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
373 store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
374 %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
375 store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
; Eight stores of double 1.0 through the addrspace(3) pointer %arg at element
; indices 0, 50, ..., 350 (a 400-byte stride). The checks expect four
; ds_write2_b64 instructions: three on the register copied from the kernel
; argument with offset pairs 0/50, 100/150 and 200/250, and a fourth with
; offset1:50 on a second base produced by v_add_u32_e32 (argument plus
; 0x960 = 2400 bytes on gfx900, an SGPR operand on tonga).
379 ; GCN-LABEL: ds_write64_combine_stride_400:
380 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
381 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
383 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
384 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
386 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
387 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
388 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
389 ; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
390 define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
392 store double 1.000000e+00, double addrspace(3)* %arg, align 8
393 %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
394 store double 1.000000e+00, double addrspace(3)* %tmp, align 8
395 %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
396 store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
397 %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
398 store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
399 %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
400 store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
401 %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
402 store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
403 %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
404 store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
405 %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
406 store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
; Six stores of double 1.0 through the addrspace(3) pointer %arg at element
; indices 1, 1025, ..., 5121: an 8192-byte stride shifted by one element
; (8 bytes). The checks expect three ds_write2st64_b64 with offset1:16, each
; on its own base produced by v_add_u32_e32. The first base is the argument
; plus 8 on both targets; on gfx900 the other two add 0x4008 and 0x8008,
; while on tonga only an SGPR operand is required for them.
410 ; GCN-LABEL: ds_write64_combine_stride_8192_shifted:
411 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
412 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
414 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
415 ; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
416 ; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
418 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
419 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
420 ; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
422 ; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
423 ; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
424 ; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
425 define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
427 %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
428 store double 1.000000e+00, double addrspace(3)* %tmp, align 8
429 %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
430 store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
431 %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
432 store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
433 %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
434 store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
435 %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
436 store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
437 %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
438 store double 1.000000e+00, double addrspace(3)* %tmp5, align 8