1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
; Generic AArch64 target; the complex-number and NEON features are enabled through
; --mattr on the RUN line rather than through the triple.
4 target triple = "aarch64"
; Return type of the complex_mul_* tests: a struct wrapping a pair of doubles.
; Those tests store the real sum in field {0,0} and the imaginary sum in {0,1}.
6 %"struct.std::complex" = type { { double, double } }
8 ; Zero-initialized reduction
10 ; complex<double> x = 0.0 + 0.0i;
11 ; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
; Multiply-accumulate reduction over interleaved complex doubles read from %a and
; %b, with both vector accumulators starting at zero. The CHECK lines expect each
; accumulator update to be emitted as an fcmla #0 / fcmla #90 pair.
14 define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
15 ; CHECK-LABEL: complex_mul_v2f64:
16 ; CHECK: // %bb.0: // %entry
17 ; CHECK-NEXT: movi v0.2d, #0000000000000000
18 ; CHECK-NEXT: movi v1.2d, #0000000000000000
19 ; CHECK-NEXT: mov x8, xzr
20 ; CHECK-NEXT: .LBB0_1: // %vector.body
21 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
22 ; CHECK-NEXT: add x9, x0, x8
23 ; CHECK-NEXT: add x10, x1, x8
24 ; CHECK-NEXT: add x8, x8, #32
25 ; CHECK-NEXT: ldp q3, q2, [x9]
26 ; CHECK-NEXT: cmp x8, #1600
27 ; CHECK-NEXT: ldp q5, q4, [x10]
28 ; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
29 ; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
30 ; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
31 ; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90
32 ; CHECK-NEXT: b.ne .LBB0_1
33 ; CHECK-NEXT: // %bb.2: // %middle.block
34 ; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d
35 ; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
36 ; CHECK-NEXT: faddp d0, v0.2d
37 ; CHECK-NEXT: faddp d1, v2.2d
42 vector.body: ; preds = %vector.body, %entry
; %lsr.iv is a byte offset applied to both %a and %b.
43 %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
; %vec.phi carries the running imaginary sums, %vec.phi27 the running real sums.
44 %vec.phi = phi <2 x double> [ zeroinitializer, %entry ], [ %7, %vector.body ]
45 %vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %5, %vector.body ]
46 %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
47 %scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
; De-interleave four doubles from %a: even lanes -> %strided.vec, odd lanes -> %strided.vec28.
48 %wide.vec = load <4 x double>, ptr %scevgep, align 8
49 %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
50 %strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Same de-interleave for %b: even lanes -> %strided.vec30, odd lanes -> %strided.vec31.
51 %wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
52 %strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
53 %strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; With "even"/"odd" naming the de-interleaved lanes of each input:
;   %5 = %vec.phi27 + b.even * a.even - b.odd * a.odd   (real accumulator)
;   %7 = %vec.phi   + b.even * a.odd  + b.odd * a.even  (imaginary accumulator)
54 %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
55 %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
56 %2 = fmul fast <2 x double> %strided.vec30, %strided.vec
57 %3 = fadd fast <2 x double> %2, %vec.phi27
58 %4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
59 %5 = fsub fast <2 x double> %3, %4
60 %6 = fadd fast <2 x double> %1, %vec.phi
61 %7 = fadd fast <2 x double> %6, %0
; Advance by 32 bytes (one <4 x double>) and leave the loop once the offset reaches 1600.
62 %lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
63 %8 = icmp eq i64 %lsr.iv.next, 1600
64 br i1 %8, label %middle.block, label %vector.body
66 middle.block: ; preds = %vector.body
; Sum the lanes of each accumulator and return the two scalars as fields {0,0} and {0,1}.
67 %9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
68 %10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
69 %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
70 %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
71 ret %"struct.std::complex" %.fca.0.1.insert
74 ; Fixed-value-initialized reduction
76 ; complex<double> x = 2.0 + 1.0i;
77 ; for (int i = 0; i < 100; ++i)
;   x += a[i] * b[i];
; Same multiply-accumulate loop as the zero-initialized test, but the accumulator
; phis start from constant vectors holding 2.0 (real) and 1.0 (imaginary) in lane 0.
; The CHECK lines expect one accumulator register to be loaded from the constant
; pool (.LCPI1_0), the other to be zeroed, and the loop to use fcmla #0 / #90 pairs.
80 define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
81 ; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
82 ; CHECK: // %bb.0: // %entry
83 ; CHECK-NEXT: movi v0.2d, #0000000000000000
84 ; CHECK-NEXT: adrp x8, .LCPI1_0
85 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
86 ; CHECK-NEXT: mov x8, xzr
87 ; CHECK-NEXT: .LBB1_1: // %vector.body
88 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
89 ; CHECK-NEXT: add x9, x0, x8
90 ; CHECK-NEXT: add x10, x1, x8
91 ; CHECK-NEXT: add x8, x8, #32
92 ; CHECK-NEXT: ldp q3, q2, [x9]
93 ; CHECK-NEXT: cmp x8, #1600
94 ; CHECK-NEXT: ldp q5, q4, [x10]
95 ; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
96 ; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
97 ; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
98 ; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90
99 ; CHECK-NEXT: b.ne .LBB1_1
100 ; CHECK-NEXT: // %bb.2: // %middle.block
101 ; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d
102 ; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d
103 ; CHECK-NEXT: faddp d0, v0.2d
104 ; CHECK-NEXT: faddp d1, v2.2d
107 br label %vector.body
109 vector.body: ; preds = %vector.body, %entry
; %lsr.iv is a byte offset applied to both %a and %b.
110 %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ 0, %entry ]
; Imaginary accumulator starts at <1.0, 0.0>, real accumulator at <2.0, 0.0>; only
; lane 0 carries the initial value, so the final lane sum includes it exactly once.
111 %vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %7, %vector.body ]
112 %vec.phi27 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %5, %vector.body ]
113 %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
114 %scevgep35 = getelementptr i8, ptr %b, i64 %lsr.iv
; De-interleave four doubles from %a: even lanes -> %strided.vec, odd lanes -> %strided.vec28.
115 %wide.vec = load <4 x double>, ptr %scevgep, align 8
116 %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
117 %strided.vec28 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Same de-interleave for %b: even lanes -> %strided.vec30, odd lanes -> %strided.vec31.
118 %wide.vec29 = load <4 x double>, ptr %scevgep35, align 8
119 %strided.vec30 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 0, i32 2>
120 %strided.vec31 = shufflevector <4 x double> %wide.vec29, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; With "even"/"odd" naming the de-interleaved lanes of each input:
;   %5 = %vec.phi27 + b.even * a.even - b.odd * a.odd   (real accumulator)
;   %7 = %vec.phi   + b.even * a.odd  + b.odd * a.even  (imaginary accumulator)
121 %0 = fmul fast <2 x double> %strided.vec31, %strided.vec
122 %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28
123 %2 = fmul fast <2 x double> %strided.vec30, %strided.vec
124 %3 = fadd fast <2 x double> %2, %vec.phi27
125 %4 = fmul fast <2 x double> %strided.vec31, %strided.vec28
126 %5 = fsub fast <2 x double> %3, %4
127 %6 = fadd fast <2 x double> %1, %vec.phi
128 %7 = fadd fast <2 x double> %6, %0
; Advance by 32 bytes (one <4 x double>) and leave the loop once the offset reaches 1600.
129 %lsr.iv.next = add nuw nsw i64 %lsr.iv, 32
130 %8 = icmp eq i64 %lsr.iv.next, 1600
131 br i1 %8, label %middle.block, label %vector.body
133 middle.block: ; preds = %vector.body
; Sum the lanes of each accumulator and return the two scalars as fields {0,0} and {0,1}.
134 %9 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %5)
135 %10 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %7)
136 %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %9, 0, 0
137 %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %10, 0, 1
138 ret %"struct.std::complex" %.fca.0.1.insert
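141 ; Loop unrolled with factor 2. The reduction starts from 2.0 + 1.0i: one real and
; one imaginary accumulator carry the initial value, the other two start at zero.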
; Two-way unrolled version of the complex multiply-accumulate loop: each iteration
; loads two <4 x double> vectors from each input and updates four accumulators
; (two real, two imaginary). The CHECK lines expect four fcmla #0 / #90 pairs in
; the loop and the paired accumulators to be added together in the middle block.
143 define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
144 ; CHECK-LABEL: complex_mul_v2f64_unrolled:
145 ; CHECK: // %bb.0: // %entry
146 ; CHECK-NEXT: movi v0.2d, #0000000000000000
147 ; CHECK-NEXT: movi v1.2d, #0000000000000000
148 ; CHECK-NEXT: adrp x8, .LCPI2_0
149 ; CHECK-NEXT: movi v3.2d, #0000000000000000
150 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
151 ; CHECK-NEXT: add x8, x0, #32
152 ; CHECK-NEXT: add x9, x1, #32
153 ; CHECK-NEXT: mov x10, #-100 // =0xffffffffffffff9c
154 ; CHECK-NEXT: .LBB2_1: // %vector.body
155 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
156 ; CHECK-NEXT: ldp q5, q4, [x8, #-32]
157 ; CHECK-NEXT: adds x10, x10, #4
158 ; CHECK-NEXT: ldp q7, q6, [x9, #-32]
159 ; CHECK-NEXT: ldp q17, q16, [x8], #64
160 ; CHECK-NEXT: ldp q19, q18, [x9], #64
161 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0
162 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0
163 ; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0
164 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0
165 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90
166 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90
167 ; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90
168 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90
169 ; CHECK-NEXT: b.ne .LBB2_1
170 ; CHECK-NEXT: // %bb.2: // %middle.block
171 ; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d
172 ; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d
173 ; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d
174 ; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d
175 ; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d
176 ; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d
177 ; CHECK-NEXT: faddp d0, v0.2d
178 ; CHECK-NEXT: faddp d1, v1.2d
; The loop pointers are pre-biased by 32 bytes; the body reads at offsets -32 and 0.
181 %scevgep = getelementptr i8, ptr %a, i64 32
182 %scevgep49 = getelementptr i8, ptr %b, i64 32
183 br label %vector.body
185 vector.body: ; preds = %vector.body, %entry
; %lsr.iv54 counts down from 100 in steps of 4; %lsr.iv50 / %lsr.iv walk %b / %a.
186 %lsr.iv54 = phi i64 [ %lsr.iv.next, %vector.body ], [ 100, %entry ]
187 %lsr.iv50 = phi ptr [ %scevgep51, %vector.body ], [ %scevgep49, %entry ]
188 %lsr.iv = phi ptr [ %scevgep48, %vector.body ], [ %scevgep, %entry ]
; Imaginary accumulators: %vec.phi (starts at <1.0, 0.0>) and %vec.phi27 (starts at zero).
; Real accumulators: %vec.phi28 (starts at <2.0, 0.0>) and %vec.phi29 (starts at zero).
189 %vec.phi = phi <2 x double> [ <double 1.000000e+00, double 0.000000e+00>, %entry ], [ %14, %vector.body ]
190 %vec.phi27 = phi <2 x double> [ zeroinitializer, %entry ], [ %15, %vector.body ]
191 %vec.phi28 = phi <2 x double> [ <double 2.000000e+00, double 0.000000e+00>, %entry ], [ %10, %vector.body ]
192 %vec.phi29 = phi <2 x double> [ zeroinitializer, %entry ], [ %11, %vector.body ]
193 %scevgep52 = getelementptr i8, ptr %lsr.iv, i64 -32
194 %scevgep53 = getelementptr i8, ptr %lsr.iv50, i64 -32
; Two loads from %a, each split into even lanes (%strided.vec, %strided.vec31)
; and odd lanes (%strided.vec32, %strided.vec33).
195 %wide.vec = load <4 x double>, ptr %scevgep52, align 8
196 %wide.vec30 = load <4 x double>, ptr %lsr.iv, align 8
197 %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 0, i32 2>
198 %strided.vec31 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 0, i32 2>
199 %strided.vec32 = shufflevector <4 x double> %wide.vec, <4 x double> poison, <2 x i32> <i32 1, i32 3>
200 %strided.vec33 = shufflevector <4 x double> %wide.vec30, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; Two loads from %b, each split into even lanes (%strided.vec36, %strided.vec37)
; and odd lanes (%strided.vec38, %strided.vec39).
201 %wide.vec34 = load <4 x double>, ptr %scevgep53, align 8
202 %wide.vec35 = load <4 x double>, ptr %lsr.iv50, align 8
203 %strided.vec36 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 0, i32 2>
204 %strided.vec37 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 0, i32 2>
205 %strided.vec38 = shufflevector <4 x double> %wide.vec34, <4 x double> poison, <2 x i32> <i32 1, i32 3>
206 %strided.vec39 = shufflevector <4 x double> %wide.vec35, <4 x double> poison, <2 x i32> <i32 1, i32 3>
; The two unrolled halves are interleaved statement by statement. Per half:
;   real (%10, %11) = real phi + b.even * a.even - b.odd * a.odd
;   imag (%14, %15) = imag phi + b.even * a.odd  + b.odd * a.even
207 %0 = fmul fast <2 x double> %strided.vec38, %strided.vec
208 %1 = fmul fast <2 x double> %strided.vec39, %strided.vec31
209 %2 = fmul fast <2 x double> %strided.vec36, %strided.vec32
210 %3 = fmul fast <2 x double> %strided.vec37, %strided.vec33
211 %4 = fmul fast <2 x double> %strided.vec36, %strided.vec
212 %5 = fmul fast <2 x double> %strided.vec37, %strided.vec31
213 %6 = fadd fast <2 x double> %4, %vec.phi28
214 %7 = fadd fast <2 x double> %5, %vec.phi29
215 %8 = fmul fast <2 x double> %strided.vec38, %strided.vec32
216 %9 = fmul fast <2 x double> %strided.vec39, %strided.vec33
217 %10 = fsub fast <2 x double> %6, %8
218 %11 = fsub fast <2 x double> %7, %9
219 %12 = fadd fast <2 x double> %2, %vec.phi
220 %13 = fadd fast <2 x double> %3, %vec.phi27
221 %14 = fadd fast <2 x double> %12, %0
222 %15 = fadd fast <2 x double> %13, %1
; Advance both pointers by 64 bytes (two <4 x double>) and exit when the counter hits zero.
223 %scevgep48 = getelementptr i8, ptr %lsr.iv, i64 64
224 %scevgep51 = getelementptr i8, ptr %lsr.iv50, i64 64
225 %lsr.iv.next = add nsw i64 %lsr.iv54, -4
226 %16 = icmp eq i64 %lsr.iv.next, 0
227 br i1 %16, label %middle.block, label %vector.body
229 middle.block: ; preds = %vector.body
; Combine the two real accumulators and the two imaginary accumulators, then sum
; the lanes of each and return the scalars as fields {0,0} and {0,1}.
230 %bin.rdx40 = fadd fast <2 x double> %11, %10
231 %17 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx40)
232 %bin.rdx = fadd fast <2 x double> %15, %14
233 %18 = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx)
234 %.fca.0.0.insert = insertvalue %"struct.std::complex" poison, double %17, 0, 0
235 %.fca.0.1.insert = insertvalue %"struct.std::complex" %.fca.0.0.insert, double %18, 0, 1
236 ret %"struct.std::complex" %.fca.0.1.insert
239 ; Reduced reproducer for the bug reported in D153355: a complex reduction was
; detected in a loop that does not contain one.
; Loop with two <4 x float> phis whose loop-carried inputs are constant splats,
; not values computed from the phis. The CHECK lines for the loop expect nothing
; but the tbz back-branch on %exitcond.not.
240 define void @incorrect_reduction_pattern(i1 %exitcond.not) {
241 ; CHECK-LABEL: incorrect_reduction_pattern:
242 ; CHECK: // %bb.0: // %entry
243 ; CHECK-NEXT: .LBB3_1: // %for.body
244 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
245 ; CHECK-NEXT: tbz w0, #0, .LBB3_1
246 ; CHECK-NEXT: // %bb.2: // %for.end.loopexit
251 for.body: ; preds = %for.body, %entry
252 %vec_r = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_r, %for.body ]
253 %vec_i = phi <4 x float> [ zeroinitializer, %entry ], [ %lane_i, %for.body ]
254 %add = fadd <4 x float> %vec_r, %vec_i
; An all-zero shuffle mask broadcasts lane 0 of the first operand (1.0); neither
; splat reads %vec_r, %vec_i or %add.
255 %lane_r = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
256 %lane_i = shufflevector <4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
; Loop back while %exitcond.not is false.
257 br i1 %exitcond.not, label %for.end.loopexit, label %for.body
259 for.end.loopexit: ; preds = %for.body
; Both results add the same %add to one of the splats.
260 %mul.r = fadd <4 x float> %lane_r, %add
261 %mul.i = fadd <4 x float> %lane_i, %add
; Floating-point add reduction intrinsic used by the middle.block of each
; complex_mul_* test: the first operand is the scalar start value (the callers
; pass -0.0), the second is the <2 x double> vector whose lanes are summed.
265 declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)