; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
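
; Check that the SLP vectorizer combines pairs of adjacent scalar f16
; operations into <2 x half> operations. Packed f16 instructions are only
; available on gfx9 and newer, so the FIXMEs below mark cases where
; vectorizing on gfx8 is not expected to be profitable.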
; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fadd_combine_v2f16
; GCN: fadd <2 x half>
define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fadd half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fadd half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fsub_combine_v2f16
; GCN: fsub <2 x half>
define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fsub half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fsub half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fmul_combine_v2f16
; GCN: fmul <2 x half>
define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fmul half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fmul half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @fdiv_combine_v2f16
; GCN: fdiv <2 x half>
define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fdiv half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fdiv half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @frem_combine_v2f16
; GCN: frem <2 x half>
define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = frem half %tmp3, 1.000000e+00
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = frem half %tmp7, 1.000000e+00
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fma_combine_v2f16
; GCN: call <2 x half> @llvm.fma.v2f16
define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should not vectorize on gfx8
; GCN-LABEL: @fmuladd_combine_v2f16
; GCN: call <2 x half> @llvm.fmuladd.v2f16
define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

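; Unlike the arithmetic cases above, minnum/maxnum already get target-dependent
; treatment: they stay scalar on gfx8 and combine into the v2f16 intrinsics on
; gfx9 and newer.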
; GCN-LABEL: @minnum_combine_v2f16
; GFX8: call half @llvm.minnum.f16(
; GFX8: call half @llvm.minnum.f16(

; GFX9: call <2 x half> @llvm.minnum.v2f16
define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @maxnum_combine_v2f16
; GFX8: call half @llvm.maxnum.f16(
; GFX8: call half @llvm.maxnum.f16(

; GFX9: call <2 x half> @llvm.maxnum.v2f16
define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; FIXME: Should vectorize
; GCN-LABEL: @minimum_combine_v2f16
; GCN: call half @llvm.minimum.f16(
; GCN: call half @llvm.minimum.f16(
define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @maximum_combine_v2f16
; GCN: call half @llvm.maximum.f16(
; GCN: call half @llvm.maximum.f16(
define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @canonicalize_combine_v2f16
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @fabs_combine_v2f16
; GCN: call <2 x half> @llvm.fabs.v2f16(
define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.fabs.f16(half %tmp3)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.fabs.f16(half %tmp7)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @fneg_combine_v2f16
; GCN: fneg <2 x half>
define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = fneg half %tmp3
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = fneg half %tmp7
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

; GCN-LABEL: @copysign_combine_v2f16
; GCN: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2
  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2
  ret void
}

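; The v4 cases below cover four adjacent lanes, which are expected to be
; combined pairwise into <2 x half> operations; the FIXME tracks the second
; copysign pair, which is not yet vectorized on gfx8.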
; FIXME: Should always vectorize
; GCN-LABEL: @copysign_combine_v4f16
; GCN: call <2 x half> @llvm.copysign.v2f16(

; GFX8: call half @llvm.copysign.f16(
; GFX8: call half @llvm.copysign.f16(

; GFX9: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64

  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2

  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2

  %tmp9 = add nuw nsw i64 %tmp1, 2
  %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
  %tmp11 = load half, ptr addrspace(1) %tmp10, align 2
  %tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign)
  store half %tmp12, ptr addrspace(1) %tmp10, align 2

  %tmp13 = add nuw nsw i64 %tmp1, 3
  %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
  %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
  %tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign)
  store half %tmp16, ptr addrspace(1) %tmp14, align 2
  ret void
}

; GCN-LABEL: @canonicalize_combine_v4f16
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64

  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
  %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
  store half %tmp4, ptr addrspace(1) %tmp2, align 2

  %tmp5 = add nuw nsw i64 %tmp1, 1
  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
  %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
  store half %tmp8, ptr addrspace(1) %tmp6, align 2

  %tmp9 = add nuw nsw i64 %tmp1, 2
  %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
  %tmp11 = load half, ptr addrspace(1) %tmp10, align 2
  %tmp12 = call half @llvm.canonicalize.f16(half %tmp11)
  store half %tmp12, ptr addrspace(1) %tmp10, align 2

  %tmp13 = add nuw nsw i64 %tmp1, 3
  %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
  %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
  %tmp16 = call half @llvm.canonicalize.f16(half %tmp15)
  store half %tmp16, ptr addrspace(1) %tmp14, align 2
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.fmuladd.f16(half, half, half)
declare half @llvm.minnum.f16(half, half)
declare half @llvm.maxnum.f16(half, half)
declare half @llvm.minimum.f16(half, half)
declare half @llvm.maximum.f16(half, half)
declare half @llvm.canonicalize.f16(half)
declare half @llvm.fabs.f16(half)
declare half @llvm.copysign.f16(half, half)