1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic -mattr=+fullfp16 < %s | FileCheck --check-prefixes=CHECK,FULLFP16 %s
3 ; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic < %s | FileCheck %s --check-prefixes=CHECK,CHECKNOFP16
; Fast fadd reduction of a <2 x float> vector with a -0.0 start value.
; Both RUN lines expect a single scalar pairwise add (faddp s0, v0.2s).
5 define float @add_HalfS(<2 x float> %bin.rdx) {
6 ; CHECK-LABEL: add_HalfS:
8 ; CHECK-NEXT: faddp s0, v0.2s
10 %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
; Fast fadd reduction of a <4 x half> vector with a -0.0 start value.
; With +fullfp16 the expected code is a vector faddp followed by a scalar
; faddp. Without it the expected code is scalarised: each lane is moved out,
; converted to single precision, added, and the partial sum is rounded back
; to half after every add.
14 define half @add_HalfH(<4 x half> %bin.rdx) {
15 ; FULLFP16-LABEL: add_HalfH:
17 ; FULLFP16-NEXT: faddp v0.4h, v0.4h, v0.4h
18 ; FULLFP16-NEXT: faddp h0, v0.2h
21 ; CHECKNOFP16-LABEL: add_HalfH:
22 ; CHECKNOFP16: // %bb.0:
23 ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
24 ; CHECKNOFP16-NEXT: mov h1, v0.h[1]
25 ; CHECKNOFP16-NEXT: fcvt s2, h0
26 ; CHECKNOFP16-NEXT: fcvt s1, h1
27 ; CHECKNOFP16-NEXT: fadd s1, s2, s1
28 ; CHECKNOFP16-NEXT: mov h2, v0.h[2]
29 ; CHECKNOFP16-NEXT: mov h0, v0.h[3]
30 ; CHECKNOFP16-NEXT: fcvt h1, s1
31 ; CHECKNOFP16-NEXT: fcvt s2, h2
32 ; CHECKNOFP16-NEXT: fcvt s0, h0
33 ; CHECKNOFP16-NEXT: fcvt s1, h1
34 ; CHECKNOFP16-NEXT: fadd s1, s1, s2
35 ; CHECKNOFP16-NEXT: fcvt h1, s1
36 ; CHECKNOFP16-NEXT: fcvt s1, h1
37 ; CHECKNOFP16-NEXT: fadd s0, s1, s0
38 ; CHECKNOFP16-NEXT: fcvt h0, s0
39 ; CHECKNOFP16-NEXT: ret
40 %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
; Fast fadd reduction of a full 128-bit <8 x half> vector with a -0.0 start
; value. With +fullfp16 the expected code is two vector faddp steps followed
; by a scalar faddp. Without it the expected code walks lanes 1..7 one at a
; time, adding in single precision and rounding back to half after each add.
45 define half @add_H(<8 x half> %bin.rdx) {
46 ; FULLFP16-LABEL: add_H:
48 ; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h
49 ; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h
50 ; FULLFP16-NEXT: faddp h0, v0.2h
53 ; CHECKNOFP16-LABEL: add_H:
54 ; CHECKNOFP16: // %bb.0:
55 ; CHECKNOFP16-NEXT: mov h1, v0.h[1]
56 ; CHECKNOFP16-NEXT: fcvt s2, h0
57 ; CHECKNOFP16-NEXT: fcvt s1, h1
58 ; CHECKNOFP16-NEXT: fadd s1, s2, s1
59 ; CHECKNOFP16-NEXT: mov h2, v0.h[2]
60 ; CHECKNOFP16-NEXT: fcvt h1, s1
61 ; CHECKNOFP16-NEXT: fcvt s2, h2
62 ; CHECKNOFP16-NEXT: fcvt s1, h1
63 ; CHECKNOFP16-NEXT: fadd s1, s1, s2
64 ; CHECKNOFP16-NEXT: mov h2, v0.h[3]
65 ; CHECKNOFP16-NEXT: fcvt h1, s1
66 ; CHECKNOFP16-NEXT: fcvt s2, h2
67 ; CHECKNOFP16-NEXT: fcvt s1, h1
68 ; CHECKNOFP16-NEXT: fadd s1, s1, s2
69 ; CHECKNOFP16-NEXT: mov h2, v0.h[4]
70 ; CHECKNOFP16-NEXT: fcvt h1, s1
71 ; CHECKNOFP16-NEXT: fcvt s2, h2
72 ; CHECKNOFP16-NEXT: fcvt s1, h1
73 ; CHECKNOFP16-NEXT: fadd s1, s1, s2
74 ; CHECKNOFP16-NEXT: mov h2, v0.h[5]
75 ; CHECKNOFP16-NEXT: fcvt h1, s1
76 ; CHECKNOFP16-NEXT: fcvt s2, h2
77 ; CHECKNOFP16-NEXT: fcvt s1, h1
78 ; CHECKNOFP16-NEXT: fadd s1, s1, s2
79 ; CHECKNOFP16-NEXT: mov h2, v0.h[6]
80 ; CHECKNOFP16-NEXT: mov h0, v0.h[7]
81 ; CHECKNOFP16-NEXT: fcvt h1, s1
82 ; CHECKNOFP16-NEXT: fcvt s2, h2
83 ; CHECKNOFP16-NEXT: fcvt s0, h0
84 ; CHECKNOFP16-NEXT: fcvt s1, h1
85 ; CHECKNOFP16-NEXT: fadd s1, s1, s2
86 ; CHECKNOFP16-NEXT: fcvt h1, s1
87 ; CHECKNOFP16-NEXT: fcvt s1, h1
88 ; CHECKNOFP16-NEXT: fadd s0, s1, s0
89 ; CHECKNOFP16-NEXT: fcvt h0, s0
90 ; CHECKNOFP16-NEXT: ret
91 %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
; Fast fadd reduction of a <4 x float> vector with a -0.0 start value.
; Both RUN lines expect a vector faddp followed by a scalar faddp.
95 define float @add_S(<4 x float> %bin.rdx) {
98 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
99 ; CHECK-NEXT: faddp s0, v0.2s
101 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
; Fast fadd reduction of a <2 x double> vector with a -0.0 start value.
; Both RUN lines expect a single scalar pairwise add (faddp d0, v0.2d).
105 define double @add_D(<2 x double> %bin.rdx) {
106 ; CHECK-LABEL: add_D:
108 ; CHECK-NEXT: faddp d0, v0.2d
110 %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
; Fast fadd reduction of a <16 x half> vector (passed in v0 and v1 per the
; expected code) with a -0.0 start value. With +fullfp16 the two registers
; are first combined with one vector fadd and then reduced with the same
; faddp sequence as the <8 x half> case. Without +fullfp16 the expected code
; is fully scalarised, converting lanes to single precision for each add and
; rounding the partial sums back to half.
114 define half @add_2H(<16 x half> %bin.rdx) {
115 ; FULLFP16-LABEL: add_2H:
116 ; FULLFP16: // %bb.0:
117 ; FULLFP16-NEXT: fadd v0.8h, v0.8h, v1.8h
118 ; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h
119 ; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h
120 ; FULLFP16-NEXT: faddp h0, v0.2h
123 ; CHECKNOFP16-LABEL: add_2H:
124 ; CHECKNOFP16: // %bb.0:
125 ; CHECKNOFP16-NEXT: mov h2, v1.h[1]
126 ; CHECKNOFP16-NEXT: mov h3, v0.h[1]
127 ; CHECKNOFP16-NEXT: fcvt s4, h1
128 ; CHECKNOFP16-NEXT: fcvt s5, h0
129 ; CHECKNOFP16-NEXT: fcvt s2, h2
130 ; CHECKNOFP16-NEXT: fcvt s3, h3
131 ; CHECKNOFP16-NEXT: fadd s4, s5, s4
132 ; CHECKNOFP16-NEXT: mov h5, v0.h[2]
133 ; CHECKNOFP16-NEXT: fadd s2, s3, s2
134 ; CHECKNOFP16-NEXT: mov h3, v1.h[2]
135 ; CHECKNOFP16-NEXT: fcvt h4, s4
136 ; CHECKNOFP16-NEXT: fcvt s5, h5
137 ; CHECKNOFP16-NEXT: fcvt h2, s2
138 ; CHECKNOFP16-NEXT: fcvt s3, h3
139 ; CHECKNOFP16-NEXT: fcvt s4, h4
140 ; CHECKNOFP16-NEXT: fcvt s2, h2
141 ; CHECKNOFP16-NEXT: fadd s3, s5, s3
142 ; CHECKNOFP16-NEXT: mov h5, v0.h[3]
143 ; CHECKNOFP16-NEXT: fadd s2, s4, s2
144 ; CHECKNOFP16-NEXT: mov h4, v1.h[3]
145 ; CHECKNOFP16-NEXT: fcvt h3, s3
146 ; CHECKNOFP16-NEXT: fcvt s5, h5
147 ; CHECKNOFP16-NEXT: fcvt h2, s2
148 ; CHECKNOFP16-NEXT: fcvt s4, h4
149 ; CHECKNOFP16-NEXT: fcvt s3, h3
150 ; CHECKNOFP16-NEXT: fcvt s2, h2
151 ; CHECKNOFP16-NEXT: fadd s4, s5, s4
152 ; CHECKNOFP16-NEXT: mov h5, v0.h[4]
153 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
154 ; CHECKNOFP16-NEXT: mov h3, v1.h[4]
155 ; CHECKNOFP16-NEXT: fcvt h4, s4
156 ; CHECKNOFP16-NEXT: fcvt s5, h5
157 ; CHECKNOFP16-NEXT: fcvt h2, s2
158 ; CHECKNOFP16-NEXT: fcvt s3, h3
159 ; CHECKNOFP16-NEXT: fcvt s4, h4
160 ; CHECKNOFP16-NEXT: fcvt s2, h2
161 ; CHECKNOFP16-NEXT: fadd s3, s5, s3
162 ; CHECKNOFP16-NEXT: mov h5, v0.h[5]
163 ; CHECKNOFP16-NEXT: fadd s2, s2, s4
164 ; CHECKNOFP16-NEXT: mov h4, v1.h[5]
165 ; CHECKNOFP16-NEXT: fcvt h3, s3
166 ; CHECKNOFP16-NEXT: fcvt s5, h5
167 ; CHECKNOFP16-NEXT: fcvt h2, s2
168 ; CHECKNOFP16-NEXT: fcvt s4, h4
169 ; CHECKNOFP16-NEXT: fcvt s3, h3
170 ; CHECKNOFP16-NEXT: fcvt s2, h2
171 ; CHECKNOFP16-NEXT: fadd s4, s5, s4
172 ; CHECKNOFP16-NEXT: mov h5, v0.h[6]
173 ; CHECKNOFP16-NEXT: mov h0, v0.h[7]
174 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
175 ; CHECKNOFP16-NEXT: fcvt h3, s4
176 ; CHECKNOFP16-NEXT: mov h4, v1.h[6]
177 ; CHECKNOFP16-NEXT: fcvt s5, h5
178 ; CHECKNOFP16-NEXT: mov h1, v1.h[7]
179 ; CHECKNOFP16-NEXT: fcvt s0, h0
180 ; CHECKNOFP16-NEXT: fcvt h2, s2
181 ; CHECKNOFP16-NEXT: fcvt s3, h3
182 ; CHECKNOFP16-NEXT: fcvt s4, h4
183 ; CHECKNOFP16-NEXT: fcvt s1, h1
184 ; CHECKNOFP16-NEXT: fcvt s2, h2
185 ; CHECKNOFP16-NEXT: fadd s0, s0, s1
186 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
187 ; CHECKNOFP16-NEXT: fadd s3, s5, s4
188 ; CHECKNOFP16-NEXT: fcvt h0, s0
189 ; CHECKNOFP16-NEXT: fcvt h2, s2
190 ; CHECKNOFP16-NEXT: fcvt h3, s3
191 ; CHECKNOFP16-NEXT: fcvt s0, h0
192 ; CHECKNOFP16-NEXT: fcvt s2, h2
193 ; CHECKNOFP16-NEXT: fcvt s3, h3
194 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
195 ; CHECKNOFP16-NEXT: fcvt h1, s2
196 ; CHECKNOFP16-NEXT: fcvt s1, h1
197 ; CHECKNOFP16-NEXT: fadd s0, s1, s0
198 ; CHECKNOFP16-NEXT: fcvt h0, s0
199 ; CHECKNOFP16-NEXT: ret
200 %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
; Fast fadd reduction of an <8 x float> vector with a -0.0 start value.
; Both RUN lines expect the two 128-bit halves to be combined with one vector
; fadd, followed by the same faddp pair as the <4 x float> case.
204 define float @add_2S(<8 x float> %bin.rdx) {
205 ; CHECK-LABEL: add_2S:
207 ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
208 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
209 ; CHECK-NEXT: faddp s0, v0.2s
211 %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
; Fast fadd reduction of a <4 x double> vector with a -0.0 start value.
; Both RUN lines expect one vector fadd of the two 128-bit halves followed by
; a scalar pairwise add.
215 define double @add_2D(<4 x double> %bin.rdx) {
216 ; CHECK-LABEL: add_2D:
218 ; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
219 ; CHECK-NEXT: faddp d0, v0.2d
221 %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
225 ; Reduction whose start value is not -0.0 (here 42.0).
; The expected code materialises 42.0 through w8 (0x42280000) and adds it to
; the reduced vector with a separate scalar fadd.
226 define float @add_S_init_42(<4 x float> %bin.rdx) {
227 ; CHECK-LABEL: add_S_init_42:
229 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
230 ; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
231 ; CHECK-NEXT: fmov s1, w8
232 ; CHECK-NEXT: faddp s0, v0.2s
233 ; CHECK-NEXT: fadd s0, s0, s1
235 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
239 ; The faddp.4s in the loop should not use v0.4s as second operand,
240 ; because this introduces an unnecessary cross-iteration dependency.
; In the expected code s0 carries the running sum (%red) across iterations,
; so the per-iteration reduction is done entirely in v1/s1 and only the final
; scalar fadd touches s0.
241 define float @fadd_reduction_v4f32_in_loop(ptr %ptr.start) {
242 ; CHECK-LABEL: fadd_reduction_v4f32_in_loop:
243 ; CHECK: // %bb.0: // %entry
244 ; CHECK-NEXT: movi d0, #0000000000000000
245 ; CHECK-NEXT: mov x8, xzr
246 ; CHECK-NEXT: .LBB9_1: // %loop
247 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
248 ; CHECK-NEXT: ldr q1, [x0, x8]
249 ; CHECK-NEXT: add x8, x8, #16
250 ; CHECK-NEXT: cmp w8, #112
251 ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
252 ; CHECK-NEXT: faddp s1, v1.2s
253 ; CHECK-NEXT: fadd s0, s1, s0
254 ; CHECK-NEXT: b.ne .LBB9_1
255 ; CHECK-NEXT: // %bb.2: // %exit
; %iv counts 1..7 (exit when %iv == 7), %ptr walks the input, and %red is the
; running sum starting at 0.0.
261 %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
262 %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
263 %red = phi float [ 0.000000e+00, %entry ], [ %red.next, %loop ]
264 %lv = load <4 x float>, ptr %ptr, align 4
265 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %lv)
266 %red.next = fadd fast float %r, %red
267 %ec = icmp eq i32 %iv, 7
; Advance by 4 floats (16 bytes), i.e. exactly one loaded vector.
268 %ptr.next = getelementptr inbounds float, ptr %ptr, i64 4
269 %iv.next= add nuw nsw i32 %iv, 1
270 br i1 %ec, label %exit, label %loop
276 ; The faddp.4h in the loop should not use v0.4h as second operand,
277 ; because this introduces an unnecessary cross-iteration dependency.
; In the expected +fullfp16 code h0 carries the running sum (%red), so the
; per-iteration reduction stays in v1/h1. Without +fullfp16 the expected loop
; body is scalarised through single precision, rounding back to half after
; each add.
278 define half @fadd_reduction_v4f16_in_loop(ptr %ptr.start) {
279 ; FULLFP16-LABEL: fadd_reduction_v4f16_in_loop:
280 ; FULLFP16: // %bb.0: // %entry
281 ; FULLFP16-NEXT: movi d0, #0000000000000000
282 ; FULLFP16-NEXT: mov x8, xzr
283 ; FULLFP16-NEXT: .LBB10_1: // %loop
284 ; FULLFP16-NEXT: // =>This Inner Loop Header: Depth=1
285 ; FULLFP16-NEXT: ldr d1, [x0, x8]
286 ; FULLFP16-NEXT: add x8, x8, #8
287 ; FULLFP16-NEXT: cmp w8, #56
288 ; FULLFP16-NEXT: faddp v1.4h, v1.4h, v1.4h
289 ; FULLFP16-NEXT: faddp h1, v1.2h
290 ; FULLFP16-NEXT: fadd h0, h1, h0
291 ; FULLFP16-NEXT: b.ne .LBB10_1
292 ; FULLFP16-NEXT: // %bb.2: // %exit
295 ; CHECKNOFP16-LABEL: fadd_reduction_v4f16_in_loop:
296 ; CHECKNOFP16: // %bb.0: // %entry
297 ; CHECKNOFP16-NEXT: movi d0, #0000000000000000
298 ; CHECKNOFP16-NEXT: mov x8, xzr
299 ; CHECKNOFP16-NEXT: .LBB10_1: // %loop
300 ; CHECKNOFP16-NEXT: // =>This Inner Loop Header: Depth=1
301 ; CHECKNOFP16-NEXT: ldr d1, [x0, x8]
302 ; CHECKNOFP16-NEXT: fcvt s0, h0
303 ; CHECKNOFP16-NEXT: add x8, x8, #8
304 ; CHECKNOFP16-NEXT: cmp w8, #56
305 ; CHECKNOFP16-NEXT: mov h2, v1.h[1]
306 ; CHECKNOFP16-NEXT: fcvt s3, h1
307 ; CHECKNOFP16-NEXT: fcvt s2, h2
308 ; CHECKNOFP16-NEXT: fadd s2, s3, s2
309 ; CHECKNOFP16-NEXT: mov h3, v1.h[2]
310 ; CHECKNOFP16-NEXT: mov h1, v1.h[3]
311 ; CHECKNOFP16-NEXT: fcvt h2, s2
312 ; CHECKNOFP16-NEXT: fcvt s3, h3
313 ; CHECKNOFP16-NEXT: fcvt s1, h1
314 ; CHECKNOFP16-NEXT: fcvt s2, h2
315 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
316 ; CHECKNOFP16-NEXT: fcvt h2, s2
317 ; CHECKNOFP16-NEXT: fcvt s2, h2
318 ; CHECKNOFP16-NEXT: fadd s1, s2, s1
319 ; CHECKNOFP16-NEXT: fcvt h1, s1
320 ; CHECKNOFP16-NEXT: fcvt s1, h1
321 ; CHECKNOFP16-NEXT: fadd s0, s1, s0
322 ; CHECKNOFP16-NEXT: fcvt h0, s0
323 ; CHECKNOFP16-NEXT: b.ne .LBB10_1
324 ; CHECKNOFP16-NEXT: // %bb.2: // %exit
325 ; CHECKNOFP16-NEXT: ret
; %iv counts 1..7 (exit when %iv == 7), %ptr walks the input, and %red is the
; running sum starting at 0.0.
330 %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
331 %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
332 %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
333 %lv = load <4 x half>, ptr %ptr, align 4
334 %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %lv)
335 %red.next = fadd fast half %r, %red
336 %ec = icmp eq i32 %iv, 7
; Advance by 4 halves (8 bytes), i.e. exactly one loaded vector.
337 %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
338 %iv.next= add nuw nsw i32 %iv, 1
339 br i1 %ec, label %exit, label %loop
345 ; The faddp.8h in the loop should not use v0.8h as second operand,
346 ; because this introduces an unnecessary cross-iteration dependency.
; In the expected +fullfp16 code h0 carries the running sum (%red), so the
; per-iteration reduction uses only v1/v2. Without +fullfp16 the expected
; loop body is scalarised through single precision, rounding back to half
; after each add.
347 define half @fadd_reduction_v8f16_in_loop(ptr %ptr.start) {
348 ; FULLFP16-LABEL: fadd_reduction_v8f16_in_loop:
349 ; FULLFP16: // %bb.0: // %entry
350 ; FULLFP16-NEXT: movi d0, #0000000000000000
351 ; FULLFP16-NEXT: mov x8, xzr
352 ; FULLFP16-NEXT: .LBB11_1: // %loop
353 ; FULLFP16-NEXT: // =>This Inner Loop Header: Depth=1
354 ; FULLFP16-NEXT: ldr q1, [x0, x8]
355 ; FULLFP16-NEXT: add x8, x8, #8
356 ; FULLFP16-NEXT: cmp w8, #56
357 ; FULLFP16-NEXT: faddp v2.8h, v1.8h, v1.8h
358 ; FULLFP16-NEXT: faddp v1.8h, v2.8h, v1.8h
359 ; FULLFP16-NEXT: faddp h1, v1.2h
360 ; FULLFP16-NEXT: fadd h0, h1, h0
361 ; FULLFP16-NEXT: b.ne .LBB11_1
362 ; FULLFP16-NEXT: // %bb.2: // %exit
365 ; CHECKNOFP16-LABEL: fadd_reduction_v8f16_in_loop:
366 ; CHECKNOFP16: // %bb.0: // %entry
367 ; CHECKNOFP16-NEXT: movi d0, #0000000000000000
368 ; CHECKNOFP16-NEXT: mov x8, xzr
369 ; CHECKNOFP16-NEXT: .LBB11_1: // %loop
370 ; CHECKNOFP16-NEXT: // =>This Inner Loop Header: Depth=1
371 ; CHECKNOFP16-NEXT: ldr q1, [x0, x8]
372 ; CHECKNOFP16-NEXT: fcvt s0, h0
373 ; CHECKNOFP16-NEXT: add x8, x8, #8
374 ; CHECKNOFP16-NEXT: cmp w8, #56
375 ; CHECKNOFP16-NEXT: mov h2, v1.h[1]
376 ; CHECKNOFP16-NEXT: fcvt s3, h1
377 ; CHECKNOFP16-NEXT: fcvt s2, h2
378 ; CHECKNOFP16-NEXT: fadd s2, s3, s2
379 ; CHECKNOFP16-NEXT: mov h3, v1.h[2]
380 ; CHECKNOFP16-NEXT: fcvt h2, s2
381 ; CHECKNOFP16-NEXT: fcvt s3, h3
382 ; CHECKNOFP16-NEXT: fcvt s2, h2
383 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
384 ; CHECKNOFP16-NEXT: mov h3, v1.h[3]
385 ; CHECKNOFP16-NEXT: fcvt h2, s2
386 ; CHECKNOFP16-NEXT: fcvt s3, h3
387 ; CHECKNOFP16-NEXT: fcvt s2, h2
388 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
389 ; CHECKNOFP16-NEXT: mov h3, v1.h[4]
390 ; CHECKNOFP16-NEXT: fcvt h2, s2
391 ; CHECKNOFP16-NEXT: fcvt s3, h3
392 ; CHECKNOFP16-NEXT: fcvt s2, h2
393 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
394 ; CHECKNOFP16-NEXT: mov h3, v1.h[5]
395 ; CHECKNOFP16-NEXT: fcvt h2, s2
396 ; CHECKNOFP16-NEXT: fcvt s3, h3
397 ; CHECKNOFP16-NEXT: fcvt s2, h2
398 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
399 ; CHECKNOFP16-NEXT: mov h3, v1.h[6]
400 ; CHECKNOFP16-NEXT: mov h1, v1.h[7]
401 ; CHECKNOFP16-NEXT: fcvt h2, s2
402 ; CHECKNOFP16-NEXT: fcvt s3, h3
403 ; CHECKNOFP16-NEXT: fcvt s1, h1
404 ; CHECKNOFP16-NEXT: fcvt s2, h2
405 ; CHECKNOFP16-NEXT: fadd s2, s2, s3
406 ; CHECKNOFP16-NEXT: fcvt h2, s2
407 ; CHECKNOFP16-NEXT: fcvt s2, h2
408 ; CHECKNOFP16-NEXT: fadd s1, s2, s1
409 ; CHECKNOFP16-NEXT: fcvt h1, s1
410 ; CHECKNOFP16-NEXT: fcvt s1, h1
411 ; CHECKNOFP16-NEXT: fadd s0, s1, s0
412 ; CHECKNOFP16-NEXT: fcvt h0, s0
413 ; CHECKNOFP16-NEXT: b.ne .LBB11_1
414 ; CHECKNOFP16-NEXT: // %bb.2: // %exit
415 ; CHECKNOFP16-NEXT: ret
; %iv counts 1..7 (exit when %iv == 7), %ptr walks the input, and %red is the
; running sum starting at 0.0.
420 %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
421 %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
422 %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
423 %lv = load <8 x half>, ptr %ptr, align 4
424 %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %lv)
425 %red.next = fadd fast half %r, %red
426 %ec = icmp eq i32 %iv, 7
; NOTE(review): the pointer advances by 4 halves (8 bytes) although each
; iteration loads <8 x half> (16 bytes), so consecutive loads overlap. This
; matches the "add x8, x8, #8" / "cmp w8, #56" lines expected above and looks
; copied from the v4f16 variant - confirm whether a stride of 8 was intended.
; Changing it requires regenerating the assertions.
427 %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
428 %iv.next= add nuw nsw i32 %iv, 1
429 br i1 %ec, label %exit, label %loop
; Sum of two independent fast <8 x half> reductions (both with -0.0 start).
; With +fullfp16 the expected code reassociates this into one vector fadd of
; %a and %b followed by a single faddp reduction sequence. Without +fullfp16
; the expected code keeps two interleaved scalar chains (one per input,
; computed through single precision) and adds their results at the end.
436 define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
437 ; FULLFP16-LABEL: fadd_reduct_reassoc_v8f16:
438 ; FULLFP16: // %bb.0:
439 ; FULLFP16-NEXT: fadd v0.8h, v0.8h, v1.8h
440 ; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h
441 ; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h
442 ; FULLFP16-NEXT: faddp h0, v0.2h
445 ; CHECKNOFP16-LABEL: fadd_reduct_reassoc_v8f16:
446 ; CHECKNOFP16: // %bb.0:
447 ; CHECKNOFP16-NEXT: mov h2, v0.h[1]
448 ; CHECKNOFP16-NEXT: mov h3, v1.h[1]
449 ; CHECKNOFP16-NEXT: fcvt s4, h0
450 ; CHECKNOFP16-NEXT: fcvt s5, h1
451 ; CHECKNOFP16-NEXT: fcvt s2, h2
452 ; CHECKNOFP16-NEXT: fcvt s3, h3
453 ; CHECKNOFP16-NEXT: fadd s2, s4, s2
454 ; CHECKNOFP16-NEXT: fadd s3, s5, s3
455 ; CHECKNOFP16-NEXT: mov h4, v0.h[2]
456 ; CHECKNOFP16-NEXT: mov h5, v1.h[2]
457 ; CHECKNOFP16-NEXT: fcvt h2, s2
458 ; CHECKNOFP16-NEXT: fcvt h3, s3
459 ; CHECKNOFP16-NEXT: fcvt s4, h4
460 ; CHECKNOFP16-NEXT: fcvt s5, h5
461 ; CHECKNOFP16-NEXT: fcvt s2, h2
462 ; CHECKNOFP16-NEXT: fcvt s3, h3
463 ; CHECKNOFP16-NEXT: fadd s2, s2, s4
464 ; CHECKNOFP16-NEXT: fadd s3, s3, s5
465 ; CHECKNOFP16-NEXT: mov h4, v0.h[3]
466 ; CHECKNOFP16-NEXT: mov h5, v1.h[3]
467 ; CHECKNOFP16-NEXT: fcvt h2, s2
468 ; CHECKNOFP16-NEXT: fcvt h3, s3
469 ; CHECKNOFP16-NEXT: fcvt s4, h4
470 ; CHECKNOFP16-NEXT: fcvt s5, h5
471 ; CHECKNOFP16-NEXT: fcvt s2, h2
472 ; CHECKNOFP16-NEXT: fcvt s3, h3
473 ; CHECKNOFP16-NEXT: fadd s2, s2, s4
474 ; CHECKNOFP16-NEXT: fadd s3, s3, s5
475 ; CHECKNOFP16-NEXT: mov h4, v0.h[4]
476 ; CHECKNOFP16-NEXT: mov h5, v1.h[4]
477 ; CHECKNOFP16-NEXT: fcvt h2, s2
478 ; CHECKNOFP16-NEXT: fcvt h3, s3
479 ; CHECKNOFP16-NEXT: fcvt s4, h4
480 ; CHECKNOFP16-NEXT: fcvt s5, h5
481 ; CHECKNOFP16-NEXT: fcvt s2, h2
482 ; CHECKNOFP16-NEXT: fcvt s3, h3
483 ; CHECKNOFP16-NEXT: fadd s2, s2, s4
484 ; CHECKNOFP16-NEXT: fadd s3, s3, s5
485 ; CHECKNOFP16-NEXT: mov h4, v0.h[5]
486 ; CHECKNOFP16-NEXT: mov h5, v1.h[5]
487 ; CHECKNOFP16-NEXT: fcvt h2, s2
488 ; CHECKNOFP16-NEXT: fcvt h3, s3
489 ; CHECKNOFP16-NEXT: fcvt s4, h4
490 ; CHECKNOFP16-NEXT: fcvt s5, h5
491 ; CHECKNOFP16-NEXT: fcvt s2, h2
492 ; CHECKNOFP16-NEXT: fcvt s3, h3
493 ; CHECKNOFP16-NEXT: fadd s2, s2, s4
494 ; CHECKNOFP16-NEXT: fadd s3, s3, s5
495 ; CHECKNOFP16-NEXT: mov h4, v0.h[6]
496 ; CHECKNOFP16-NEXT: mov h5, v1.h[6]
497 ; CHECKNOFP16-NEXT: mov h1, v1.h[7]
498 ; CHECKNOFP16-NEXT: mov h0, v0.h[7]
499 ; CHECKNOFP16-NEXT: fcvt h2, s2
500 ; CHECKNOFP16-NEXT: fcvt h3, s3
501 ; CHECKNOFP16-NEXT: fcvt s4, h4
502 ; CHECKNOFP16-NEXT: fcvt s5, h5
503 ; CHECKNOFP16-NEXT: fcvt s0, h0
504 ; CHECKNOFP16-NEXT: fcvt s1, h1
505 ; CHECKNOFP16-NEXT: fcvt s2, h2
506 ; CHECKNOFP16-NEXT: fcvt s3, h3
507 ; CHECKNOFP16-NEXT: fadd s2, s2, s4
508 ; CHECKNOFP16-NEXT: fadd s3, s3, s5
509 ; CHECKNOFP16-NEXT: fcvt h2, s2
510 ; CHECKNOFP16-NEXT: fcvt h3, s3
511 ; CHECKNOFP16-NEXT: fcvt s2, h2
512 ; CHECKNOFP16-NEXT: fcvt s3, h3
513 ; CHECKNOFP16-NEXT: fadd s0, s2, s0
514 ; CHECKNOFP16-NEXT: fadd s1, s3, s1
515 ; CHECKNOFP16-NEXT: fcvt h0, s0
516 ; CHECKNOFP16-NEXT: fcvt h1, s1
517 ; CHECKNOFP16-NEXT: fcvt s1, h1
518 ; CHECKNOFP16-NEXT: fcvt s0, h0
519 ; CHECKNOFP16-NEXT: fadd s0, s0, s1
520 ; CHECKNOFP16-NEXT: fcvt h0, s0
521 ; CHECKNOFP16-NEXT: ret
522 %r1 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %a)
523 %r2 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %b)
524 %r = fadd fast half %r1, %r2
; Sum of two independent fast <8 x float> reductions (both with -0.0 start).
; The expected code reassociates this into three vector fadds that fold all
; four input registers into v0, followed by a single faddp reduction.
528 define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
529 ; CHECK-LABEL: fadd_reduct_reassoc_v8f32:
531 ; CHECK-NEXT: fadd v2.4s, v2.4s, v3.4s
532 ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
533 ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
534 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
535 ; CHECK-NEXT: faddp s0, v0.2s
537 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
538 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
539 %r = fadd fast float %r1, %r2
; Sum of two independent fast <4 x float> reductions (both with -0.0 start).
; The expected code adds the two vectors first and then performs one faddp
; reduction on the result.
543 define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
544 ; CHECK-LABEL: fadd_reduct_reassoc_v4f32:
546 ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
547 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
548 ; CHECK-NEXT: faddp s0, v0.2s
550 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
551 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
552 %r = fadd fast float %r1, %r2
; Same shape as the v4f32 reassociation test, but the first reduction uses
; the non-constant start value %i. The expected code does not merge the two
; reductions here: each vector is reduced with its own faddp pair and the
; results are added to s0 (%i) with two scalar fadds.
556 define float @fadd_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
557 ; CHECK-LABEL: fadd_reduct_reassoc_v4f32_init:
559 ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
560 ; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s
561 ; CHECK-NEXT: faddp s1, v1.2s
562 ; CHECK-NEXT: fadd s0, s0, s1
563 ; CHECK-NEXT: faddp s1, v2.2s
564 ; CHECK-NEXT: fadd s0, s0, s1
566 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %i, <4 x float> %a)
567 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
568 %r = fadd fast float %r1, %r2
; Sum of a fast <4 x float> reduction and a fast <8 x float> reduction (both
; with -0.0 start). The expected code folds the two halves of %b together,
; adds %a, and finishes with one faddp reduction.
572 define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
573 ; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32:
575 ; CHECK-NEXT: fadd v1.4s, v1.4s, v2.4s
576 ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
577 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
578 ; CHECK-NEXT: faddp s0, v0.2s
580 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
581 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
582 %r = fadd fast float %r1, %r2
; Sum of two independent fast <4 x double> reductions (both with -0.0 start).
; The expected code folds all four input registers into v0 with three vector
; fadds and finishes with one scalar pairwise add.
586 define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
587 ; CHECK-LABEL: fadd_reduct_reassoc_v4f64:
589 ; CHECK-NEXT: fadd v2.2d, v2.2d, v3.2d
590 ; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
591 ; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d
592 ; CHECK-NEXT: faddp d0, v0.2d
594 %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a)
595 %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b)
596 %r = fadd fast double %r1, %r2
; Variant of the v4f32 reassociation test in which the first reduction result
; %r1 has a second use (the fmul below). The expected code keeps the two
; reductions separate, so s0 still holds %r1 for the multiply.
600 define float @fadd_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
601 ; CHECK-LABEL: fadd_reduct_reassoc_v4f32_extrause:
603 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
604 ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
605 ; CHECK-NEXT: faddp s0, v0.2s
606 ; CHECK-NEXT: faddp s1, v1.2s
607 ; CHECK-NEXT: fadd s1, s0, s1
608 ; CHECK-NEXT: fmul s0, s1, s0
610 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
611 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
612 %r = fadd fast float %r1, %r2
; Extra use of %r1; note this fmul carries no fast-math flags.
613 %p = fmul float %r, %r1
617 ; Declarations of the llvm.vector.reduce.fadd intrinsics called by the tests in this file.
; Each takes a scalar start value followed by the vector to reduce; no
; attributes are spelled out on these declarations.
618 declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
619 declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
620 declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
621 declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
622 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
623 declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
624 declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
625 declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)