1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
3 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
4 ; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
5 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
; Fast-math fadd reduction of a <2 x float> vector with start value -0.0.
; All four run configurations share one expectation: a single pairwise
; add (faddp) over the two lanes.
7 define float @add_HalfS(<2 x float> %bin.rdx) {
8 ; CHECK-LABEL: add_HalfS:
10 ; CHECK-NEXT: faddp s0, v0.2s
12 %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
; Fast-math fadd reduction of a <4 x half> vector with start value -0.0.
; Expected code differs per configuration:
;  * default selector without +fullfp16: each lane is extracted, converted
;    to single precision, added, and the partial sum is converted back to
;    half after every add;
;  * GlobalISel without +fullfp16: the vector is widened once (fcvtl),
;    reduced with pairwise adds in single precision, then narrowed once;
;  * either selector with +fullfp16: two pairwise adds directly on half lanes.
16 define half @add_HalfH(<4 x half> %bin.rdx) {
17 ; CHECK-SD-NOFP16-LABEL: add_HalfH:
18 ; CHECK-SD-NOFP16: // %bb.0:
19 ; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
20 ; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
21 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
22 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
23 ; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
24 ; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
25 ; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3]
26 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
27 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
28 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
29 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
30 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
31 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
32 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
33 ; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
34 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
35 ; CHECK-SD-NOFP16-NEXT: ret
37 ; CHECK-SD-FP16-LABEL: add_HalfH:
38 ; CHECK-SD-FP16: // %bb.0:
39 ; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
40 ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
41 ; CHECK-SD-FP16-NEXT: ret
43 ; CHECK-GI-NOFP16-LABEL: add_HalfH:
44 ; CHECK-GI-NOFP16: // %bb.0:
45 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
46 ; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
47 ; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
48 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
49 ; CHECK-GI-NOFP16-NEXT: ret
51 ; CHECK-GI-FP16-LABEL: add_HalfH:
52 ; CHECK-GI-FP16: // %bb.0:
53 ; CHECK-GI-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
54 ; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
55 ; CHECK-GI-FP16-NEXT: ret
56 %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
; Fast-math fadd reduction of a full 128-bit <8 x half> vector, start -0.0.
; Expected code per configuration:
;  * default selector without +fullfp16: a lane-by-lane chain over all
;    eight lanes, converting to single precision for each add and back to
;    half after it;
;  * GlobalISel without +fullfp16: both halves are widened (fcvtl/fcvtl2),
;    added as 4s vectors, reduced pairwise, and narrowed once at the end;
;  * either selector with +fullfp16: two vector pairwise adds on 8h
;    followed by one scalar pairwise add.
61 define half @add_H(<8 x half> %bin.rdx) {
62 ; CHECK-SD-NOFP16-LABEL: add_H:
63 ; CHECK-SD-NOFP16: // %bb.0:
64 ; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
65 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
66 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
67 ; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
68 ; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
69 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
70 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
71 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
72 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
73 ; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[3]
74 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
75 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
76 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
77 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
78 ; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[4]
79 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
80 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
81 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
82 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
83 ; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[5]
84 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
85 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
86 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
87 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
88 ; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
89 ; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
90 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
91 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
92 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
93 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
94 ; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
95 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
96 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
97 ; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
98 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
99 ; CHECK-SD-NOFP16-NEXT: ret
101 ; CHECK-SD-FP16-LABEL: add_H:
102 ; CHECK-SD-FP16: // %bb.0:
103 ; CHECK-SD-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
104 ; CHECK-SD-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
105 ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
106 ; CHECK-SD-FP16-NEXT: ret
108 ; CHECK-GI-NOFP16-LABEL: add_H:
109 ; CHECK-GI-NOFP16: // %bb.0:
110 ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
111 ; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
112 ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s
113 ; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
114 ; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
115 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
116 ; CHECK-GI-NOFP16-NEXT: ret
118 ; CHECK-GI-FP16-LABEL: add_H:
119 ; CHECK-GI-FP16: // %bb.0:
120 ; CHECK-GI-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
121 ; CHECK-GI-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
122 ; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
123 ; CHECK-GI-FP16-NEXT: ret
124 %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
; Fast-math fadd reduction of a <4 x float> vector with start value -0.0.
; Every configuration expects one vector pairwise add followed by one
; scalar pairwise add.
128 define float @add_S(<4 x float> %bin.rdx) {
129 ; CHECK-LABEL: add_S:
131 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
132 ; CHECK-NEXT: faddp s0, v0.2s
134 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
; Fast-math fadd reduction of a <2 x double> vector with start value -0.0.
; Every configuration expects a single scalar pairwise add over the two
; double lanes.
138 define double @add_D(<2 x double> %bin.rdx) {
139 ; CHECK-LABEL: add_D:
141 ; CHECK-NEXT: faddp d0, v0.2d
143 %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
; Fast-math fadd reduction of a <16 x half> vector (two 128-bit registers,
; v0 and v1), start value -0.0. Expected code per configuration:
;  * default selector without +fullfp16: both registers are widened to
;    single precision, added element-wise, narrowed back into one 8h vector
;    (fcvtn/fcvtn2), and that vector is then reduced lane by lane with a
;    half<->single round trip around every add;
;  * GlobalISel without +fullfp16: all four widened quarters are combined
;    with three vector adds, reduced pairwise, and narrowed once;
;  * either selector with +fullfp16: one 8h vector add combines the two
;    registers, followed by the same pairwise sequence as the 8-lane case.
147 define half @add_2H(<16 x half> %bin.rdx) {
148 ; CHECK-SD-NOFP16-LABEL: add_2H:
149 ; CHECK-SD-NOFP16: // %bb.0:
150 ; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
151 ; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
152 ; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
153 ; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
154 ; CHECK-SD-NOFP16-NEXT: fadd v2.4s, v3.4s, v2.4s
155 ; CHECK-SD-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
156 ; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
157 ; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v0.4s
158 ; CHECK-SD-NOFP16-NEXT: mov h0, v1.h[1]
159 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
160 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
161 ; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0
162 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
163 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
164 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
165 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
166 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
167 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
168 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
169 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
170 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
171 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
172 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
173 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
174 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
175 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
176 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
177 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
178 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
179 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
180 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
181 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
182 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
183 ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
184 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
185 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
186 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
187 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
188 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
189 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
190 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
191 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
192 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
193 ; CHECK-SD-NOFP16-NEXT: ret
195 ; CHECK-SD-FP16-LABEL: add_2H:
196 ; CHECK-SD-FP16: // %bb.0:
197 ; CHECK-SD-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
198 ; CHECK-SD-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
199 ; CHECK-SD-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
200 ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
201 ; CHECK-SD-FP16-NEXT: ret
203 ; CHECK-GI-NOFP16-LABEL: add_2H:
204 ; CHECK-GI-NOFP16: // %bb.0:
205 ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
206 ; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
207 ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
208 ; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
209 ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v2.4s, v0.4s
210 ; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v3.4s, v1.4s
211 ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
212 ; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
213 ; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
214 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
215 ; CHECK-GI-NOFP16-NEXT: ret
217 ; CHECK-GI-FP16-LABEL: add_2H:
218 ; CHECK-GI-FP16: // %bb.0:
219 ; CHECK-GI-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
220 ; CHECK-GI-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
221 ; CHECK-GI-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
222 ; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
223 ; CHECK-GI-FP16-NEXT: ret
224 %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
; Fast-math fadd reduction of a <8 x float> vector (two 128-bit registers),
; start value -0.0. Every configuration expects one vector add that
; combines the two registers, then the two-step pairwise reduction.
228 define float @add_2S(<8 x float> %bin.rdx) {
229 ; CHECK-LABEL: add_2S:
231 ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
232 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
233 ; CHECK-NEXT: faddp s0, v0.2s
235 %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
; Fast-math fadd reduction of a <4 x double> vector (two 128-bit registers),
; start value -0.0. Every configuration expects one vector add that
; combines the two registers, then a single scalar pairwise add.
239 define double @add_2D(<4 x double> %bin.rdx) {
240 ; CHECK-LABEL: add_2D:
242 ; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
243 ; CHECK-NEXT: faddp d0, v0.2d
245 %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
249 ; Test a reduction whose start value is not the neutral element -0.0.
; Fast-math fadd reduction of a <4 x float> vector with start value 42.0.
; The expected code materializes the constant through a general-purpose
; register (0x42280000 is the single-precision bit pattern of 42.0) and adds
; it with a scalar fadd after the pairwise reduction of the vector.
250 define float @add_S_init_42(<4 x float> %bin.rdx) {
251 ; CHECK-LABEL: add_S_init_42:
253 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
254 ; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
255 ; CHECK-NEXT: fmov s1, w8
256 ; CHECK-NEXT: faddp s0, v0.2s
257 ; CHECK-NEXT: fadd s0, s0, s1
259 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
263 ; The faddp.4s in the loop should not use v0.4s as second operand,
264 ; because this introduces an unnecessary cross-iteration dependency.
; Loop that loads a <4 x float> per iteration, reduces it with a fast-math
; fadd reduction (start -0.0) and accumulates the result into the scalar
; phi %red, which starts at 0.0. %iv runs from 1 and the loop exits once
; %iv == 7, i.e. seven iterations; the expected `cmp w8, #112` is
; 7 iterations * 16 bytes. In the expected loop body both pairwise adds
; operate on v1 only, and the accumulator in s0 is touched solely by the
; final scalar fadd.
265 define float @fadd_reduction_v4f32_in_loop(ptr %ptr.start) {
266 ; CHECK-LABEL: fadd_reduction_v4f32_in_loop:
267 ; CHECK: // %bb.0: // %entry
268 ; CHECK-NEXT: movi d0, #0000000000000000
269 ; CHECK-NEXT: mov x8, xzr
270 ; CHECK-NEXT: .LBB9_1: // %loop
271 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
272 ; CHECK-NEXT: ldr q1, [x0, x8]
273 ; CHECK-NEXT: add x8, x8, #16
274 ; CHECK-NEXT: cmp w8, #112
275 ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
276 ; CHECK-NEXT: faddp s1, v1.2s
277 ; CHECK-NEXT: fadd s0, s1, s0
278 ; CHECK-NEXT: b.ne .LBB9_1
279 ; CHECK-NEXT: // %bb.2: // %exit
285 %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
286 %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
287 %red = phi float [ 0.000000e+00, %entry ], [ %red.next, %loop ]
288 %lv = load <4 x float>, ptr %ptr, align 4
289 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %lv)
290 %red.next = fadd fast float %r, %red
291 %ec = icmp eq i32 %iv, 7
292 %ptr.next = getelementptr inbounds float, ptr %ptr, i64 4
293 %iv.next= add nuw nsw i32 %iv, 1
294 br i1 %ec, label %exit, label %loop
300 ; The faddp.4h in the loop should not use v0.4h as second operand,
301 ; because this introduces an unnecessary cross-iteration dependency.
; Half-precision variant of the in-loop reduction: each iteration loads a
; <4 x half>, reduces it with a fast-math fadd reduction (start -0.0) and
; accumulates into the scalar phi %red (initially 0.0). The loop runs seven
; times (%iv from 1 until %iv == 7); the expected `cmp w8, #56` is
; 7 iterations * 8 bytes.
; Expected code per configuration:
;  * default selector without +fullfp16: lane-by-lane chain in single
;    precision with a conversion back to half after every add;
;  * GlobalISel without +fullfp16: widened pairwise reduction; the
;    accumulator is carried across iterations in w9 (fmov to/from s-regs);
;  * either selector with +fullfp16: two pairwise adds on v1 followed by a
;    scalar half fadd into the accumulator h0.
302 define half @fadd_reduction_v4f16_in_loop(ptr %ptr.start) {
303 ; CHECK-SD-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
304 ; CHECK-SD-NOFP16: // %bb.0: // %entry
305 ; CHECK-SD-NOFP16-NEXT: movi d0, #0000000000000000
306 ; CHECK-SD-NOFP16-NEXT: mov x8, xzr
307 ; CHECK-SD-NOFP16-NEXT: .LBB10_1: // %loop
308 ; CHECK-SD-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
309 ; CHECK-SD-NOFP16-NEXT: ldr d1, [x0, x8]
310 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
311 ; CHECK-SD-NOFP16-NEXT: add x8, x8, #8
312 ; CHECK-SD-NOFP16-NEXT: cmp w8, #56
313 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
314 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h1
315 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
316 ; CHECK-SD-NOFP16-NEXT: fadd s2, s3, s2
317 ; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[2]
318 ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[3]
319 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
320 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
321 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
322 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
323 ; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
324 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
325 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
326 ; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
327 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
328 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
329 ; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
330 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
331 ; CHECK-SD-NOFP16-NEXT: b.ne .LBB10_1
332 ; CHECK-SD-NOFP16-NEXT: // %bb.2: // %exit
333 ; CHECK-SD-NOFP16-NEXT: ret
335 ; CHECK-SD-FP16-LABEL: fadd_reduction_v4f16_in_loop:
336 ; CHECK-SD-FP16: // %bb.0: // %entry
337 ; CHECK-SD-FP16-NEXT: movi d0, #0000000000000000
338 ; CHECK-SD-FP16-NEXT: mov x8, xzr
339 ; CHECK-SD-FP16-NEXT: .LBB10_1: // %loop
340 ; CHECK-SD-FP16-NEXT: // =>This Inner Loop Header: Depth=1
341 ; CHECK-SD-FP16-NEXT: ldr d1, [x0, x8]
342 ; CHECK-SD-FP16-NEXT: add x8, x8, #8
343 ; CHECK-SD-FP16-NEXT: cmp w8, #56
344 ; CHECK-SD-FP16-NEXT: faddp v1.4h, v1.4h, v1.4h
345 ; CHECK-SD-FP16-NEXT: faddp h1, v1.2h
346 ; CHECK-SD-FP16-NEXT: fadd h0, h1, h0
347 ; CHECK-SD-FP16-NEXT: b.ne .LBB10_1
348 ; CHECK-SD-FP16-NEXT: // %bb.2: // %exit
349 ; CHECK-SD-FP16-NEXT: ret
351 ; CHECK-GI-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
352 ; CHECK-GI-NOFP16: // %bb.0: // %entry
353 ; CHECK-GI-NOFP16-NEXT: mov x8, xzr
354 ; CHECK-GI-NOFP16-NEXT: mov w9, #0 // =0x0
355 ; CHECK-GI-NOFP16-NEXT: .LBB10_1: // %loop
356 ; CHECK-GI-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
357 ; CHECK-GI-NOFP16-NEXT: ldr d0, [x0, x8]
358 ; CHECK-GI-NOFP16-NEXT: fmov s1, w9
359 ; CHECK-GI-NOFP16-NEXT: add x8, x8, #8
360 ; CHECK-GI-NOFP16-NEXT: cmp w8, #56
361 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
362 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
363 ; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
364 ; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
365 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
366 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
367 ; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
368 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
369 ; CHECK-GI-NOFP16-NEXT: fmov w9, s0
370 ; CHECK-GI-NOFP16-NEXT: b.ne .LBB10_1
371 ; CHECK-GI-NOFP16-NEXT: // %bb.2: // %exit
372 ; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
373 ; CHECK-GI-NOFP16-NEXT: ret
375 ; CHECK-GI-FP16-LABEL: fadd_reduction_v4f16_in_loop:
376 ; CHECK-GI-FP16: // %bb.0: // %entry
377 ; CHECK-GI-FP16-NEXT: movi d0, #0000000000000000
378 ; CHECK-GI-FP16-NEXT: mov x8, xzr
379 ; CHECK-GI-FP16-NEXT: .LBB10_1: // %loop
380 ; CHECK-GI-FP16-NEXT: // =>This Inner Loop Header: Depth=1
381 ; CHECK-GI-FP16-NEXT: ldr d1, [x0, x8]
382 ; CHECK-GI-FP16-NEXT: add x8, x8, #8
383 ; CHECK-GI-FP16-NEXT: cmp w8, #56
384 ; CHECK-GI-FP16-NEXT: faddp v1.4h, v1.4h, v1.4h
385 ; CHECK-GI-FP16-NEXT: faddp h1, v1.2h
386 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0
387 ; CHECK-GI-FP16-NEXT: b.ne .LBB10_1
388 ; CHECK-GI-FP16-NEXT: // %bb.2: // %exit
389 ; CHECK-GI-FP16-NEXT: ret
394 %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
395 %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
396 %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
397 %lv = load <4 x half>, ptr %ptr, align 4
398 %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %lv)
399 %red.next = fadd fast half %r, %red
400 %ec = icmp eq i32 %iv, 7
401 %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
402 %iv.next= add nuw nsw i32 %iv, 1
403 br i1 %ec, label %exit, label %loop
409 ; The faddp.8h in the loop should not use v0.8h as second operand,
410 ; because this introduces an unnecessary cross-iteration dependency.
; In-loop reduction of a <8 x half> load: each iteration reduces the loaded
; vector with a fast-math fadd reduction (start -0.0) and accumulates into
; the scalar phi %red (initially 0.0). The loop runs seven times (%iv from 1
; until %iv == 7).
; Expected code per configuration:
;  * default selector without +fullfp16: lane-by-lane chain over eight
;    lanes in single precision with a conversion back to half after every add;
;  * GlobalISel without +fullfp16: widened (fcvtl/fcvtl2) vector add plus
;    pairwise reduction; the accumulator is carried across iterations in w9;
;  * either selector with +fullfp16: pairwise adds that read only the
;    loaded value (v1/v2), then a scalar half fadd into the accumulator h0.
;
; NOTE(review): %ptr.next below advances by 4 half elements (8 bytes, which
; matches the expected `add x8, x8, #8` and `cmp w8, #56`), while each
; iteration loads a full <8 x half> (16 bytes, `ldr q`). Consecutive loads
; therefore overlap. This looks carried over from the v4f16 variant --
; confirm whether it is intentional; changing the stride requires
; regenerating the expected assembly.
411 define half @fadd_reduction_v8f16_in_loop(ptr %ptr.start) {
412 ; CHECK-SD-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
413 ; CHECK-SD-NOFP16: // %bb.0: // %entry
414 ; CHECK-SD-NOFP16-NEXT: movi d0, #0000000000000000
415 ; CHECK-SD-NOFP16-NEXT: mov x8, xzr
416 ; CHECK-SD-NOFP16-NEXT: .LBB11_1: // %loop
417 ; CHECK-SD-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
418 ; CHECK-SD-NOFP16-NEXT: ldr q1, [x0, x8]
419 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
420 ; CHECK-SD-NOFP16-NEXT: add x8, x8, #8
421 ; CHECK-SD-NOFP16-NEXT: cmp w8, #56
422 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
423 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h1
424 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
425 ; CHECK-SD-NOFP16-NEXT: fadd s2, s3, s2
426 ; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[2]
427 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
428 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
429 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
430 ; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
431 ; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[3]
432 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
433 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
434 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
435 ; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
436 ; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[4]
437 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
438 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
439 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
440 ; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
441 ; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[5]
442 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
443 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
444 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
445 ; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
446 ; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[6]
447 ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
448 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
449 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
450 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
451 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
452 ; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
453 ; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
454 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
455 ; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
456 ; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
457 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
458 ; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
459 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
460 ; CHECK-SD-NOFP16-NEXT: b.ne .LBB11_1
461 ; CHECK-SD-NOFP16-NEXT: // %bb.2: // %exit
462 ; CHECK-SD-NOFP16-NEXT: ret
464 ; CHECK-SD-FP16-LABEL: fadd_reduction_v8f16_in_loop:
465 ; CHECK-SD-FP16: // %bb.0: // %entry
466 ; CHECK-SD-FP16-NEXT: movi d0, #0000000000000000
467 ; CHECK-SD-FP16-NEXT: mov x8, xzr
468 ; CHECK-SD-FP16-NEXT: .LBB11_1: // %loop
469 ; CHECK-SD-FP16-NEXT: // =>This Inner Loop Header: Depth=1
470 ; CHECK-SD-FP16-NEXT: ldr q1, [x0, x8]
471 ; CHECK-SD-FP16-NEXT: add x8, x8, #8
472 ; CHECK-SD-FP16-NEXT: cmp w8, #56
473 ; CHECK-SD-FP16-NEXT: faddp v2.8h, v1.8h, v1.8h
474 ; CHECK-SD-FP16-NEXT: faddp v1.8h, v2.8h, v1.8h
475 ; CHECK-SD-FP16-NEXT: faddp h1, v1.2h
476 ; CHECK-SD-FP16-NEXT: fadd h0, h1, h0
477 ; CHECK-SD-FP16-NEXT: b.ne .LBB11_1
478 ; CHECK-SD-FP16-NEXT: // %bb.2: // %exit
479 ; CHECK-SD-FP16-NEXT: ret
481 ; CHECK-GI-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
482 ; CHECK-GI-NOFP16: // %bb.0: // %entry
483 ; CHECK-GI-NOFP16-NEXT: mov x8, xzr
484 ; CHECK-GI-NOFP16-NEXT: mov w9, #0 // =0x0
485 ; CHECK-GI-NOFP16-NEXT: .LBB11_1: // %loop
486 ; CHECK-GI-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
487 ; CHECK-GI-NOFP16-NEXT: ldr q0, [x0, x8]
488 ; CHECK-GI-NOFP16-NEXT: add x8, x8, #8
489 ; CHECK-GI-NOFP16-NEXT: cmp w8, #56
490 ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
491 ; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
492 ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s
493 ; CHECK-GI-NOFP16-NEXT: fmov s1, w9
494 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
495 ; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
496 ; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
497 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
498 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
499 ; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
500 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
501 ; CHECK-GI-NOFP16-NEXT: fmov w9, s0
502 ; CHECK-GI-NOFP16-NEXT: b.ne .LBB11_1
503 ; CHECK-GI-NOFP16-NEXT: // %bb.2: // %exit
504 ; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
505 ; CHECK-GI-NOFP16-NEXT: ret
507 ; CHECK-GI-FP16-LABEL: fadd_reduction_v8f16_in_loop:
508 ; CHECK-GI-FP16: // %bb.0: // %entry
509 ; CHECK-GI-FP16-NEXT: movi d0, #0000000000000000
510 ; CHECK-GI-FP16-NEXT: mov x8, xzr
511 ; CHECK-GI-FP16-NEXT: .LBB11_1: // %loop
512 ; CHECK-GI-FP16-NEXT: // =>This Inner Loop Header: Depth=1
513 ; CHECK-GI-FP16-NEXT: ldr q1, [x0, x8]
514 ; CHECK-GI-FP16-NEXT: add x8, x8, #8
515 ; CHECK-GI-FP16-NEXT: cmp w8, #56
516 ; CHECK-GI-FP16-NEXT: faddp v2.8h, v1.8h, v1.8h
517 ; CHECK-GI-FP16-NEXT: faddp v1.8h, v2.8h, v1.8h
518 ; CHECK-GI-FP16-NEXT: faddp h1, v1.2h
519 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0
520 ; CHECK-GI-FP16-NEXT: b.ne .LBB11_1
521 ; CHECK-GI-FP16-NEXT: // %bb.2: // %exit
522 ; CHECK-GI-FP16-NEXT: ret
527 %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
528 %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
529 %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
530 %lv = load <8 x half>, ptr %ptr, align 4
531 %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %lv)
532 %red.next = fadd fast half %r, %red
533 %ec = icmp eq i32 %iv, 7
534 %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
535 %iv.next= add nuw nsw i32 %iv, 1
536 br i1 %ec, label %exit, label %loop
; Sum of two independent fast-math fadd reductions over <8 x half> inputs
; %a and %b (both with start value -0.0).
; Expected code per configuration:
;  * default selector: the two inputs are first added element-wise and a
;    single reduction is performed on the result (directly on 8h with
;    +fullfp16; via widening to single precision, narrowing, and a
;    lane-by-lane chain without it);
;  * GlobalISel: each input is reduced separately and the two scalar
;    results are added at the end.
543 define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
544 ; CHECK-SD-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
545 ; CHECK-SD-NOFP16: // %bb.0:
546 ; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
547 ; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
548 ; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
549 ; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
550 ; CHECK-SD-NOFP16-NEXT: fadd v2.4s, v3.4s, v2.4s
551 ; CHECK-SD-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
552 ; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
553 ; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v0.4s
554 ; CHECK-SD-NOFP16-NEXT: mov h0, v1.h[1]
555 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
556 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
557 ; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0
558 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
559 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
560 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
561 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
562 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
563 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
564 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
565 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
566 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
567 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
568 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
569 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
570 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
571 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
572 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
573 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
574 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
575 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
576 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
577 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
578 ; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
579 ; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
580 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
581 ; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
582 ; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
583 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
584 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
585 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
586 ; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
587 ; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
588 ; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
589 ; CHECK-SD-NOFP16-NEXT: ret
591 ; CHECK-SD-FP16-LABEL: fadd_reduct_reassoc_v8f16:
592 ; CHECK-SD-FP16: // %bb.0:
593 ; CHECK-SD-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
594 ; CHECK-SD-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
595 ; CHECK-SD-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
596 ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
597 ; CHECK-SD-FP16-NEXT: ret
599 ; CHECK-GI-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
600 ; CHECK-GI-NOFP16: // %bb.0:
601 ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
602 ; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
603 ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
604 ; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
605 ; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v2.4s, v0.4s
606 ; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v3.4s, v1.4s
607 ; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
608 ; CHECK-GI-NOFP16-NEXT: faddp v1.4s, v1.4s, v1.4s
609 ; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
610 ; CHECK-GI-NOFP16-NEXT: faddp s1, v1.2s
611 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
612 ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
613 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
614 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
615 ; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
616 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
617 ; CHECK-GI-NOFP16-NEXT: ret
619 ; CHECK-GI-FP16-LABEL: fadd_reduct_reassoc_v8f16:
620 ; CHECK-GI-FP16: // %bb.0:
621 ; CHECK-GI-FP16-NEXT: faddp v2.8h, v0.8h, v0.8h
622 ; CHECK-GI-FP16-NEXT: faddp v3.8h, v1.8h, v1.8h
623 ; CHECK-GI-FP16-NEXT: faddp v0.8h, v2.8h, v0.8h
624 ; CHECK-GI-FP16-NEXT: faddp v1.8h, v3.8h, v1.8h
625 ; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
626 ; CHECK-GI-FP16-NEXT: faddp h1, v1.2h
627 ; CHECK-GI-FP16-NEXT: fadd h0, h0, h1
628 ; CHECK-GI-FP16-NEXT: ret
629 %r1 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %a)
630 %r2 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %b)
631 %r = fadd fast half %r1, %r2
; Sum of two fast-math fadd reductions over <8 x float> inputs %a and %b
; (each occupying two 128-bit registers, both with start value -0.0).
;  * default selector: all four registers are combined with three vector
;    adds, followed by a single pairwise reduction;
;  * GlobalISel: each input is reduced on its own and the scalar results
;    are added at the end.
635 define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
636 ; CHECK-SD-LABEL: fadd_reduct_reassoc_v8f32:
637 ; CHECK-SD: // %bb.0:
638 ; CHECK-SD-NEXT: fadd v2.4s, v2.4s, v3.4s
639 ; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
640 ; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s
641 ; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
642 ; CHECK-SD-NEXT: faddp s0, v0.2s
645 ; CHECK-GI-LABEL: fadd_reduct_reassoc_v8f32:
646 ; CHECK-GI: // %bb.0:
647 ; CHECK-GI-NEXT: fadd v0.4s, v0.4s, v1.4s
648 ; CHECK-GI-NEXT: fadd v1.4s, v2.4s, v3.4s
649 ; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
650 ; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s
651 ; CHECK-GI-NEXT: faddp s0, v0.2s
652 ; CHECK-GI-NEXT: faddp s1, v1.2s
653 ; CHECK-GI-NEXT: fadd s0, s0, s1
655 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
656 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
657 %r = fadd fast float %r1, %r2
; Sum of two fast-math fadd reductions over <4 x float> inputs %a and %b
; (both with start value -0.0).
;  * default selector: one vector add of the two inputs, then a single
;    pairwise reduction;
;  * GlobalISel: two separate pairwise reductions and a final scalar add.
661 define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
662 ; CHECK-SD-LABEL: fadd_reduct_reassoc_v4f32:
663 ; CHECK-SD: // %bb.0:
664 ; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
665 ; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
666 ; CHECK-SD-NEXT: faddp s0, v0.2s
669 ; CHECK-GI-LABEL: fadd_reduct_reassoc_v4f32:
670 ; CHECK-GI: // %bb.0:
671 ; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
672 ; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s
673 ; CHECK-GI-NEXT: faddp s0, v0.2s
674 ; CHECK-GI-NEXT: faddp s1, v1.2s
675 ; CHECK-GI-NEXT: fadd s0, s0, s1
677 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
678 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
679 %r = fadd fast float %r1, %r2
; Sum of two fast-math fadd reductions over <4 x float> inputs where the
; first reduction uses the scalar argument %i as its start value and the
; second uses -0.0. All configurations share one expectation: the two
; vectors are reduced separately, and each scalar result is added in turn
; to the incoming start value in s0.
683 define float @fadd_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
684 ; CHECK-LABEL: fadd_reduct_reassoc_v4f32_init:
686 ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
687 ; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s
688 ; CHECK-NEXT: faddp s1, v1.2s
689 ; CHECK-NEXT: fadd s0, s0, s1
690 ; CHECK-NEXT: faddp s1, v2.2s
691 ; CHECK-NEXT: fadd s0, s0, s1
693 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %i, <4 x float> %a)
694 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
695 %r = fadd fast float %r1, %r2
; Sum of two fast-math fadd reductions over inputs of different widths:
; %a is <4 x float> and %b is <8 x float> (both with start value -0.0).
;  * default selector: the two registers of %b are added, the result is
;    added to %a, and a single pairwise reduction follows;
;  * GlobalISel: %b is narrowed to one register with a vector add, then
;    %a and %b are reduced separately and the scalars are added.
699 define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
700 ; CHECK-SD-LABEL: fadd_reduct_reassoc_v4v8f32:
701 ; CHECK-SD: // %bb.0:
702 ; CHECK-SD-NEXT: fadd v1.4s, v1.4s, v2.4s
703 ; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
704 ; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
705 ; CHECK-SD-NEXT: faddp s0, v0.2s
708 ; CHECK-GI-LABEL: fadd_reduct_reassoc_v4v8f32:
709 ; CHECK-GI: // %bb.0:
710 ; CHECK-GI-NEXT: fadd v1.4s, v1.4s, v2.4s
711 ; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
712 ; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s
713 ; CHECK-GI-NEXT: faddp s0, v0.2s
714 ; CHECK-GI-NEXT: faddp s1, v1.2s
715 ; CHECK-GI-NEXT: fadd s0, s0, s1
717 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
718 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
719 %r = fadd fast float %r1, %r2
; Sum of two fast-math fadd reductions over <4 x double> inputs %a and %b
; (each occupying two 128-bit registers, both with start value -0.0).
;  * default selector: three vector adds combine all four registers,
;    followed by one scalar pairwise add;
;  * GlobalISel: each input is combined and reduced separately, and the
;    two scalar results are added at the end.
723 define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
724 ; CHECK-SD-LABEL: fadd_reduct_reassoc_v4f64:
725 ; CHECK-SD: // %bb.0:
726 ; CHECK-SD-NEXT: fadd v2.2d, v2.2d, v3.2d
727 ; CHECK-SD-NEXT: fadd v0.2d, v0.2d, v1.2d
728 ; CHECK-SD-NEXT: fadd v0.2d, v0.2d, v2.2d
729 ; CHECK-SD-NEXT: faddp d0, v0.2d
732 ; CHECK-GI-LABEL: fadd_reduct_reassoc_v4f64:
733 ; CHECK-GI: // %bb.0:
734 ; CHECK-GI-NEXT: fadd v0.2d, v0.2d, v1.2d
735 ; CHECK-GI-NEXT: fadd v1.2d, v2.2d, v3.2d
736 ; CHECK-GI-NEXT: faddp d0, v0.2d
737 ; CHECK-GI-NEXT: faddp d1, v1.2d
738 ; CHECK-GI-NEXT: fadd d0, d0, d1
740 %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a)
741 %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b)
742 %r = fadd fast double %r1, %r2
; Sum of two fast-math fadd reductions over <4 x float> inputs where the
; first reduction result %r1 has a second use: it is also an operand of the
; trailing fmul (which carries no fast-math flags). All configurations share
; one expectation: both vectors are reduced separately, so the value of %r1
; stays available in s0 for the multiply.
746 define float @fadd_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
747 ; CHECK-LABEL: fadd_reduct_reassoc_v4f32_extrause:
749 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
750 ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
751 ; CHECK-NEXT: faddp s0, v0.2s
752 ; CHECK-NEXT: faddp s1, v1.2s
753 ; CHECK-NEXT: fadd s1, s0, s1
754 ; CHECK-NEXT: fmul s0, s1, s0
756 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
757 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
758 %r = fadd fast float %r1, %r2
759 %p = fmul float %r, %r1
763 ; Function Attrs: nounwind readnone
; Declarations of the llvm.vector.reduce.fadd intrinsic overloads called by
; the tests in this file, grouped by element type (half, float, double).
; Each takes the scalar start value followed by the vector to reduce.
764 declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
765 declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
766 declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
767 declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
768 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
769 declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
770 declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
771 declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)