; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+sve -o - | FileCheck %s

target triple = "aarch64"

%"class.std::complex" = type { { double, double } }

; Zero initialized reduction
;
; complex<double> x = 0.0 + 0.0i;
; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
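;
; A sketch of the scalar math this loop performs (our annotation, not part
; of the checked output):
;   x.re += a[i].re * b[i].re - a[i].im * b[i].im;
;   x.im += a[i].re * b[i].im + a[i].im * b[i].re;
; The ComplexDeinterleaving pass is expected to match this multiply-accumulate
; onto FCMLA pairs (rotations #0 and #90), with the exit block using
; uzp1/uzp2 to split the interleaved accumulators back into real and
; imaginary lanes before the faddv reductions.
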
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    neg x10, x9
; CHECK-NEXT:    mov w11, #100 // =0x64
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x10, x11
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x12, x0, x8
; CHECK-NEXT:    add x13, x1, x8
; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT:    subs x10, x10, x9
; CHECK-NEXT:    add x8, x8, x11
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z2.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 5
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %14, %vector.body ]
  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  %9 = fmul fast <vscale x 2 x double> %8, %4
  %10 = fmul fast <vscale x 2 x double> %7, %5
  %11 = fmul fast <vscale x 2 x double> %7, %4
  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
  %13 = fmul fast <vscale x 2 x double> %8, %5
  %14 = fsub fast <vscale x 2 x double> %12, %13
  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
  %16 = fadd fast <vscale x 2 x double> %15, %9
  %lsr.iv.next28 = add i64 %lsr.iv27, %2
  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
  %17 = icmp eq i64 %lsr.iv.next32, 0
  br i1 %17, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Fixed value initialized reduction
;
; complex<double> x = 2.0 + 1.0i;
; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
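;
; Only lane 0 of each accumulator carries the nonzero seed (note the
; insertelement constants in the phis below); the remaining lanes start at
; zero, so the faddv reductions still fold the seed into the final sum.
; This is what the vl1 predicate (ptrue p0.d, vl1) plus the sel/mov
; prologue in the CHECK lines materialises.
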
define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    fmov d0, #1.00000000
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    fmov d2, #2.00000000
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    mov w11, #100 // =0x64
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    neg x10, x9
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x10, x11
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
; CHECK-NEXT:    mov z1.d, p0/m, z2.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    zip2 z0.d, z1.d, z3.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z3.d
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x12, x0, x8
; CHECK-NEXT:    add x13, x1, x8
; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
; CHECK-NEXT:    subs x10, x10, x9
; CHECK-NEXT:    add x8, x8, x11
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z2.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 5
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv31 = phi i64 [ %lsr.iv.next32, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv27 = phi i64 [ %lsr.iv.next28, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 1.000000e+00, i32 0), %entry ], [ %16, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> zeroinitializer, double 2.000000e+00, i32 0), %entry ], [ %14, %vector.body ]
  %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
  %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
  %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
  %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
  %9 = fmul fast <vscale x 2 x double> %8, %4
  %10 = fmul fast <vscale x 2 x double> %7, %5
  %11 = fmul fast <vscale x 2 x double> %7, %4
  %12 = fadd fast <vscale x 2 x double> %11, %vec.phi12
  %13 = fmul fast <vscale x 2 x double> %8, %5
  %14 = fsub fast <vscale x 2 x double> %12, %13
  %15 = fadd fast <vscale x 2 x double> %10, %vec.phi
  %16 = fadd fast <vscale x 2 x double> %15, %9
  %lsr.iv.next28 = add i64 %lsr.iv27, %2
  %lsr.iv.next32 = sub i64 %lsr.iv31, %1
  %17 = icmp eq i64 %lsr.iv.next32, 0
  br i1 %17, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  %18 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %14)
  %19 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %16)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %18, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %19, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Loop unrolled with factor 2
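;
; A sketch of the source this corresponds to (our annotation); the
; vectorised loop keeps two accumulator pairs live and combines them with
; vector fadds before the final faddv reductions:
;   complex<double> x = 0.0 + 0.0i;
;   for (int i = 0; i < 1000; ++i)
;     x += a[i] * b[i];
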
define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-LABEL: complex_mul_v2f64_unrolled:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z1.d, #0 // =0x0
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    cntw x9
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    neg x10, x9
; CHECK-NEXT:    mov w11, #1000 // =0x3e8
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x10, x11
; CHECK-NEXT:    rdvl x11, #4
; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
; CHECK-NEXT:    addvl x12, x1, #2
; CHECK-NEXT:    addvl x13, x0, #2
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    mov z3.d, z0.d
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x14, x0, x8
; CHECK-NEXT:    add x15, x13, x8
; CHECK-NEXT:    add x16, x1, x8
; CHECK-NEXT:    add x17, x12, x8
; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x0, x8]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x14, #1, mul vl]
; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x13, x8]
; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT:    ld1b { z16.b }, p1/z, [x1, x8]
; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x16, #1, mul vl]
; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x12, x8]
; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x17, #1, mul vl]
; CHECK-NEXT:    subs x10, x10, x9
; CHECK-NEXT:    add x8, x8, x11
; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z4.d, #0
; CHECK-NEXT:    fcmla z0.d, p0/m, z17.d, z5.d, #0
; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #0
; CHECK-NEXT:    fcmla z3.d, p0/m, z19.d, z7.d, #0
; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z4.d, #90
; CHECK-NEXT:    fcmla z0.d, p0/m, z17.d, z5.d, #90
; CHECK-NEXT:    fcmla z2.d, p0/m, z18.d, z6.d, #90
; CHECK-NEXT:    fcmla z3.d, p0/m, z19.d, z7.d, #90
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %exit.block
; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
; CHECK-NEXT:    uzp1 z5.d, z1.d, z0.d
; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
; CHECK-NEXT:    uzp2 z0.d, z1.d, z0.d
; CHECK-NEXT:    fadd z1.d, z4.d, z5.d
; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
; CHECK-NEXT:    faddv d0, p0, z1.d
; CHECK-NEXT:    faddv d1, p0, z2.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 2
  %n.mod.vf = urem i64 1000, %1
  %n.vec = sub i64 1000, %n.mod.vf
  %2 = shl nuw nsw i64 %0, 6
  %3 = shl nuw nsw i64 %0, 5
  %scevgep61 = getelementptr i8, ptr %b, i64 %3
  %scevgep63 = getelementptr i8, ptr %a, i64 %3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %lsr.iv38 = phi i64 [ %lsr.iv.next39, %vector.body ], [ %n.vec, %entry ]
  %lsr.iv34 = phi i64 [ %lsr.iv.next35, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %30, %vector.body ]
  %vec.phi12 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %31, %vector.body ]
  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %26, %vector.body ]
  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %27, %vector.body ]
  %scevgep57 = getelementptr i8, ptr %a, i64 %lsr.iv34
  %scevgep64 = getelementptr i8, ptr %scevgep63, i64 %lsr.iv34
  %scevgep58 = getelementptr i8, ptr %b, i64 %lsr.iv34
  %scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
  %wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
  %wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
  %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
  %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
  %wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
  %wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
  %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
  %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
  %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
  %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
  %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
  %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
  %16 = fmul fast <vscale x 2 x double> %14, %6
  %17 = fmul fast <vscale x 2 x double> %15, %7
  %18 = fmul fast <vscale x 2 x double> %12, %8
  %19 = fmul fast <vscale x 2 x double> %13, %9
  %20 = fmul fast <vscale x 2 x double> %12, %6
  %21 = fmul fast <vscale x 2 x double> %13, %7
  %22 = fadd fast <vscale x 2 x double> %20, %vec.phi13
  %23 = fadd fast <vscale x 2 x double> %21, %vec.phi14
  %24 = fmul fast <vscale x 2 x double> %14, %8
  %25 = fmul fast <vscale x 2 x double> %15, %9
  %26 = fsub fast <vscale x 2 x double> %22, %24
  %27 = fsub fast <vscale x 2 x double> %23, %25
  %28 = fadd fast <vscale x 2 x double> %18, %vec.phi
  %29 = fadd fast <vscale x 2 x double> %19, %vec.phi12
  %30 = fadd fast <vscale x 2 x double> %28, %16
  %31 = fadd fast <vscale x 2 x double> %29, %17
  %lsr.iv.next35 = add i64 %lsr.iv34, %2
  %lsr.iv.next39 = sub i64 %lsr.iv38, %1
  %32 = icmp eq i64 %lsr.iv.next39, 0
  br i1 %32, label %exit.block, label %vector.body

exit.block:                                       ; preds = %vector.body
  %bin.rdx15 = fadd fast <vscale x 2 x double> %27, %26
  %33 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx15)
  %bin.rdx = fadd fast <vscale x 2 x double> %31, %30
  %34 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %bin.rdx)
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %33, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %34, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

; Integer and floating point complex number reduction in the same loop:
; complex<double> *a = ...;
; int *s = ...;
; complex<double> x = 0.0 + 0.0i;
; int sum = 0;
; for (int i = 0; i < N; ++i) {
;   x += a[i];
;   sum += s[i];
; }
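;
; Note (our annotation): because this loop only accumulates and never
; multiplies, no FCMLA is needed; the CHECK lines show plain fadds in the
; loop, with the integer partial sums reduced by uaddv and stored to
; *outs (x4) while the two floating-point accumulators are deinterleaved
; by uzp1/uzp2 and reduced by faddv.
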
define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 {
; CHECK-LABEL: reduction_mix:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.d, #0 // =0x0
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    neg x10, x9
; CHECK-NEXT:    mov w11, #100 // =0x64
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    and x10, x10, x11
; CHECK-NEXT:    rdvl x11, #2
; CHECK-NEXT:    zip2 z0.d, z2.d, z2.d
; CHECK-NEXT:    zip1 z1.d, z2.d, z2.d
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1w { z3.d }, p0/z, [x3, x8, lsl #2]
; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    add x8, x8, x9
; CHECK-NEXT:    add x0, x0, x11
; CHECK-NEXT:    cmp x10, x8
; CHECK-NEXT:    fadd z0.d, z5.d, z0.d
; CHECK-NEXT:    fadd z1.d, z4.d, z1.d
; CHECK-NEXT:    add z2.d, z3.d, z2.d
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %middle.block
; CHECK-NEXT:    uaddv d2, p0, z2.d
; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
; CHECK-NEXT:    uzp1 z1.d, z1.d, z0.d
; CHECK-NEXT:    fmov x8, d2
; CHECK-NEXT:    faddv d0, p0, z3.d
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
; CHECK-NEXT:    str w8, [x4]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 1
  %n.mod.vf = urem i64 100, %1
  %n.vec = sub nuw nsw i64 100, %n.mod.vf
  %2 = tail call i64 @llvm.vscale.i64()
  %3 = shl nuw nsw i64 %2, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <vscale x 2 x i32> [ zeroinitializer, %entry ], [ %5, %vector.body ]
  %vec.phi13 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %9, %vector.body ]
  %vec.phi14 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %4 = getelementptr inbounds i32, ptr %s, i64 %index
  %wide.load = load <vscale x 2 x i32>, ptr %4, align 4
  %5 = add <vscale x 2 x i32> %wide.load, %vec.phi
  %6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
  %wide.vec = load <vscale x 4 x double>, ptr %6, align 8
  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
  %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
  %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
  %9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
  %10 = fadd fast <vscale x 2 x double> %8, %vec.phi14
  %index.next = add nuw i64 %index, %3
  %11 = icmp eq i64 %index.next, %n.vec
  br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %12 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %10)
  %13 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> %9)
  %14 = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %5)
  store i32 %14, ptr %outs, align 4
  %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %12, 0, 0
  %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %13, 0, 1
  ret %"class.std::complex" %.fca.0.1.insert
}

declare i64 @llvm.vscale.i64()
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)