1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
; Module-level target description: little-endian ("e") layout with ELF name
; mangling ("m:e"), native integer widths of 32 and 64 bits ("n32:64") and a
; 128-bit natural stack alignment ("S128"), for an AArch64 Linux GNU triple.
4 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
5 target triple = "aarch64--linux-gnu"
; Build-vector pattern on <2 x i64>: the scalar input computes lane-wise add
; and sub of %v0 and %v1, then forms
;   result[0] = add[0] + add[1]
;   result[1] = sub[0] + sub[1]
; and assembles the two sums with insertelement. The CHECK lines expect a
; vector add and a vector sub, two shufflevectors that blend lanes of both,
; and a final vector add.
7 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
8 ; CHECK-LABEL: @build_vec_v2i64(
9 ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
10 ; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
11 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
12 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
13 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
14 ; CHECK-NEXT: ret <2 x i64> [[TMP5]]
; Scalarize both operands.
16 %v0.0 = extractelement <2 x i64> %v0, i32 0
17 %v0.1 = extractelement <2 x i64> %v0, i32 1
18 %v1.0 = extractelement <2 x i64> %v1, i32 0
19 %v1.1 = extractelement <2 x i64> %v1, i32 1
; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
20 %tmp0.0 = add i64 %v0.0, %v1.0
21 %tmp0.1 = add i64 %v0.1, %v1.1
22 %tmp1.0 = sub i64 %v0.0, %v1.0
23 %tmp1.1 = sub i64 %v0.1, %v1.1
; Horizontal sums: one over the add results, one over the sub results.
24 %tmp2.0 = add i64 %tmp0.0, %tmp0.1
25 %tmp2.1 = add i64 %tmp1.0, %tmp1.1
; Rebuild the <2 x i64> result from the two scalar sums.
26 %tmp3.0 = insertelement <2 x i64> poison, i64 %tmp2.0, i32 0
27 %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
; Store-chain variant of the two-lane add/sub pattern: elements 0 and 1 are
; loaded from %a and %b, combined as
;   c[0] = (a[0] + b[0]) + (a[1] + b[1])
;   c[1] = (a[0] - b[0]) + (a[1] - b[1])
; and written to two consecutive i64 slots starting at %c. The CHECK lines
; expect two <2 x i64> loads, vector add/sub, two blending shufflevectors, a
; vector add and a single <2 x i64> store.
31 define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) {
32 ; CHECK-LABEL: @store_chain_v2i64(
33 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
34 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
35 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
36 ; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
37 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 1, i32 2>
38 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
39 ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]]
40 ; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8
41 ; CHECK-NEXT: ret void
; Addresses of element 1 of each array (element 0 is the base pointer itself).
43 %a.1 = getelementptr i64, ptr %a, i64 1
44 %b.1 = getelementptr i64, ptr %b, i64 1
45 %c.1 = getelementptr i64, ptr %c, i64 1
; Scalar loads of two consecutive i64 elements from %a and %b.
46 %v0.0 = load i64, ptr %a, align 8
47 %v0.1 = load i64, ptr %a.1, align 8
48 %v1.0 = load i64, ptr %b, align 8
49 %v1.1 = load i64, ptr %b.1, align 8
; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
50 %tmp0.0 = add i64 %v0.0, %v1.0
51 %tmp0.1 = add i64 %v0.1, %v1.1
52 %tmp1.0 = sub i64 %v0.0, %v1.0
53 %tmp1.1 = sub i64 %v0.1, %v1.1
; Horizontal sums over the add results and over the sub results.
54 %tmp2.0 = add i64 %tmp0.0, %tmp0.1
55 %tmp2.1 = add i64 %tmp1.0, %tmp1.1
; Consecutive scalar stores that seed the store chain.
56 store i64 %tmp2.0, ptr %c, align 8
57 store i64 %tmp2.1, ptr %c.1, align 8
; Four-lane version of the add/sub build-vector pattern. With add[i] and
; sub[i] denoting the lane-wise add and sub of %v0 and %v1, the result is
;   result[0] = add[0] + add[1]
;   result[1] = sub[0] + sub[1]
;   result[2] = add[2] + add[3]
;   result[3] = sub[2] + sub[3]
; The CHECK lines expect vector add/sub followed by two shufflevectors with
; masks <0,5,3,6> and <1,4,2,7> (indices 4..7 select from the sub vector)
; and one vector add of the shuffled values.
61 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
62 ; CHECK-LABEL: @build_vec_v4i32(
63 ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
64 ; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
65 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
66 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
67 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
68 ; CHECK-NEXT: ret <4 x i32> [[TMP5]]
; Scalarize both operands.
70 %v0.0 = extractelement <4 x i32> %v0, i32 0
71 %v0.1 = extractelement <4 x i32> %v0, i32 1
72 %v0.2 = extractelement <4 x i32> %v0, i32 2
73 %v0.3 = extractelement <4 x i32> %v0, i32 3
74 %v1.0 = extractelement <4 x i32> %v1, i32 0
75 %v1.1 = extractelement <4 x i32> %v1, i32 1
76 %v1.2 = extractelement <4 x i32> %v1, i32 2
77 %v1.3 = extractelement <4 x i32> %v1, i32 3
; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
78 %tmp0.0 = add i32 %v0.0, %v1.0
79 %tmp0.1 = add i32 %v0.1, %v1.1
80 %tmp0.2 = add i32 %v0.2, %v1.2
81 %tmp0.3 = add i32 %v0.3, %v1.3
82 %tmp1.0 = sub i32 %v0.0, %v1.0
83 %tmp1.1 = sub i32 %v0.1, %v1.1
84 %tmp1.2 = sub i32 %v0.2, %v1.2
85 %tmp1.3 = sub i32 %v0.3, %v1.3
; Pairwise sums of adjacent lanes, alternating between add and sub results.
86 %tmp2.0 = add i32 %tmp0.0, %tmp0.1
87 %tmp2.1 = add i32 %tmp1.0, %tmp1.1
88 %tmp2.2 = add i32 %tmp0.2, %tmp0.3
89 %tmp2.3 = add i32 %tmp1.2, %tmp1.3
; Rebuild the <4 x i32> result from the four scalar sums.
90 %tmp3.0 = insertelement <4 x i32> poison, i32 %tmp2.0, i32 0
91 %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
92 %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
93 %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
; Build-vector with reused scalars: only two distinct values are computed
; from the <2 x i32> inputs,
;   %tmp2.0 = add[0] + add[1]
;   %tmp2.1 = sub[0] + sub[1]
; and each is inserted twice, giving <%tmp2.0, %tmp2.1, %tmp2.0, %tmp2.1>.
; The CHECK lines expect the computation to be done on <2 x i32> and then
; widened to <4 x i32> by a shufflevector with the repeating mask <0,1,0,1>.
97 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
98 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
99 ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
100 ; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
101 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
102 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
103 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
104 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
105 ; CHECK-NEXT: ret <4 x i32> [[TMP6]]
; Scalarize both operands.
107 %v0.0 = extractelement <2 x i32> %v0, i32 0
108 %v0.1 = extractelement <2 x i32> %v0, i32 1
109 %v1.0 = extractelement <2 x i32> %v1, i32 0
110 %v1.1 = extractelement <2 x i32> %v1, i32 1
; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
111 %tmp0.0 = add i32 %v0.0, %v1.0
112 %tmp0.1 = add i32 %v0.1, %v1.1
113 %tmp1.0 = sub i32 %v0.0, %v1.0
114 %tmp1.1 = sub i32 %v0.1, %v1.1
; Horizontal sums over the add results and over the sub results.
115 %tmp2.0 = add i32 %tmp0.0, %tmp0.1
116 %tmp2.1 = add i32 %tmp1.0, %tmp1.1
; Lanes 2 and 3 reuse the same scalars as lanes 0 and 1.
117 %tmp3.0 = insertelement <4 x i32> poison, i32 %tmp2.0, i32 0
118 %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
119 %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
120 %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
121 ret <4 x i32> %tmp3.3
; Build-vector with repeated and operand-swapped scalars. From the <2 x i32>
; inputs the scalar code computes add[i] and xor[i] per lane and then
;   result[0] = add[0] - add[1]
;   result[1] = add[0] - add[1]   (same expression as lane 0)
;   result[2] = xor[0] - xor[1]
;   result[3] = xor[1] - xor[0]   (operands swapped relative to lane 2)
; The CHECK lines record the expected output, in which one scalar add
; ([[TMP0_1]]) remains next to the <4 x i32> add/xor/sub operations.
124 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
125 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
126 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
127 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
128 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
129 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
130 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
131 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
132 ; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
133 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
134 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
135 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
136 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
137 ; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
138 ; CHECK-NEXT: ret <4 x i32> [[TMP11]]
; Scalarize both operands.
140 %v0.0 = extractelement <2 x i32> %v0, i32 0
141 %v0.1 = extractelement <2 x i32> %v0, i32 1
142 %v1.0 = extractelement <2 x i32> %v1, i32 0
143 %v1.1 = extractelement <2 x i32> %v1, i32 1
; Lane-wise add (%tmp0.0, %tmp0.1) and xor (%tmp0.2, %tmp0.3).
144 %tmp0.0 = add i32 %v0.0, %v1.0
145 %tmp0.1 = add i32 %v0.1, %v1.1
146 %tmp0.2 = xor i32 %v0.0, %v1.0
147 %tmp0.3 = xor i32 %v0.1, %v1.1
; %tmp1.0 and %tmp1.1 are the same expression; %tmp1.3 swaps the operands
; used by %tmp1.2.
148 %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
149 %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
150 %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
151 %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
; Rebuild the <4 x i32> result from the four differences.
152 %tmp2.0 = insertelement <4 x i32> poison, i32 %tmp1.0, i32 0
153 %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
154 %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
155 %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
156 ret <4 x i32> %tmp2.3
; Build-vector whose lanes mix three different binary opcodes (add, mul,
; xor) over the <2 x i32> inputs:
;   result[0] = add[0] + add[1]
;   result[1] = mul[0] + mul[1]
;   result[2] = xor[0] + xor[1]
;   result[3] = xor[0] + xor[1]   (%tmp1.2/%tmp1.3 repeat %tmp0.2/%tmp0.3)
; The CHECK lines expect two separate <2 x i32> computations (an add/mul
; half and an xor half, with the vector xor emitted twice) concatenated by
; a final shufflevector with mask <0,1,2,3>.
159 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
160 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
161 ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
162 ; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
163 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
164 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
165 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
166 ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
167 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
168 ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
169 ; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
170 ; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
171 ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]]
; Scalarize both operands.
173 %v0.0 = extractelement <2 x i32> %v0, i32 0
174 %v0.1 = extractelement <2 x i32> %v0, i32 1
175 %v1.0 = extractelement <2 x i32> %v1, i32 0
176 %v1.1 = extractelement <2 x i32> %v1, i32 1
; First group: lane-wise add (%tmp0.0, %tmp0.1) and xor (%tmp0.2, %tmp0.3).
177 %tmp0.0 = add i32 %v0.0, %v1.0
178 %tmp0.1 = add i32 %v0.1, %v1.1
179 %tmp0.2 = xor i32 %v0.0, %v1.0
180 %tmp0.3 = xor i32 %v0.1, %v1.1
; Second group: lane-wise mul (%tmp1.0, %tmp1.1) and the same xor again
; (%tmp1.2, %tmp1.3).
181 %tmp1.0 = mul i32 %v0.0, %v1.0
182 %tmp1.1 = mul i32 %v0.1, %v1.1
183 %tmp1.2 = xor i32 %v0.0, %v1.0
184 %tmp1.3 = xor i32 %v0.1, %v1.1
; Horizontal sums within each opcode pair.
185 %tmp2.0 = add i32 %tmp0.0, %tmp0.1
186 %tmp2.1 = add i32 %tmp1.0, %tmp1.1
187 %tmp2.2 = add i32 %tmp0.2, %tmp0.3
188 %tmp2.3 = add i32 %tmp1.2, %tmp1.3
; Rebuild the <4 x i32> result from the four scalar sums.
189 %tmp3.0 = insertelement <4 x i32> poison, i32 %tmp2.0, i32 0
190 %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
191 %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
192 %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
193 ret <4 x i32> %tmp3.3
; Reduction rooted at a chain of scalar adds. The first stage is the same
; four-lane add/sub pairing used by the build-vector tests:
;   %tmp2.0 = add[0] + add[1]      %tmp2.1 = sub[0] + sub[1]
;   %tmp2.2 = add[2] + add[3]      %tmp2.3 = sub[2] + sub[3]
; Each %tmp2.i is then run through the same per-lane sequence
;   t = ((x lshr 15) and 65537) mul nuw 65535;  y = (t + x) xor t
; and the four y values are summed by the %reduce.* add chain.
; The CHECK lines expect the whole sequence on <4 x i32> followed by a call
; to @llvm.vector.reduce.add.v4i32.
196 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
197 ; CHECK-LABEL: @reduction_v4i32(
198 ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
199 ; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
200 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
201 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
202 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
203 ; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
204 ; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
205 ; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
206 ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
207 ; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
208 ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
209 ; CHECK-NEXT: ret i32 [[TMP11]]
; Scalarize both operands.
211 %v0.0 = extractelement <4 x i32> %v0, i32 0
212 %v0.1 = extractelement <4 x i32> %v0, i32 1
213 %v0.2 = extractelement <4 x i32> %v0, i32 2
214 %v0.3 = extractelement <4 x i32> %v0, i32 3
215 %v1.0 = extractelement <4 x i32> %v1, i32 0
216 %v1.1 = extractelement <4 x i32> %v1, i32 1
217 %v1.2 = extractelement <4 x i32> %v1, i32 2
218 %v1.3 = extractelement <4 x i32> %v1, i32 3
; Lane-wise add (%tmp0.*) and sub (%tmp1.*).
219 %tmp0.0 = add i32 %v0.0, %v1.0
220 %tmp0.1 = add i32 %v0.1, %v1.1
221 %tmp0.2 = add i32 %v0.2, %v1.2
222 %tmp0.3 = add i32 %v0.3, %v1.3
223 %tmp1.0 = sub i32 %v0.0, %v1.0
224 %tmp1.1 = sub i32 %v0.1, %v1.1
225 %tmp1.2 = sub i32 %v0.2, %v1.2
226 %tmp1.3 = sub i32 %v0.3, %v1.3
; Pairwise sums of adjacent lanes, alternating between add and sub results.
227 %tmp2.0 = add i32 %tmp0.0, %tmp0.1
228 %tmp2.1 = add i32 %tmp1.0, %tmp1.1
229 %tmp2.2 = add i32 %tmp0.2, %tmp0.3
230 %tmp2.3 = add i32 %tmp1.2, %tmp1.3
; Shift each sum right by 15 and keep bits 0 and 16 (mask 65537 = 0x10001).
231 %tmp3.0 = lshr i32 %tmp2.0, 15
232 %tmp3.1 = lshr i32 %tmp2.1, 15
233 %tmp3.2 = lshr i32 %tmp2.2, 15
234 %tmp3.3 = lshr i32 %tmp2.3, 15
235 %tmp4.0 = and i32 %tmp3.0, 65537
236 %tmp4.1 = and i32 %tmp3.1, 65537
237 %tmp4.2 = and i32 %tmp3.2, 65537
238 %tmp4.3 = and i32 %tmp3.3, 65537
; Multiply the masked bits by 65535 (0xFFFF).
239 %tmp5.0 = mul nuw i32 %tmp4.0, 65535
240 %tmp5.1 = mul nuw i32 %tmp4.1, 65535
241 %tmp5.2 = mul nuw i32 %tmp4.2, 65535
242 %tmp5.3 = mul nuw i32 %tmp4.3, 65535
; Add the product to the original sum, then xor with the same product.
243 %tmp6.0 = add i32 %tmp5.0, %tmp2.0
244 %tmp6.1 = add i32 %tmp5.1, %tmp2.1
245 %tmp6.2 = add i32 %tmp5.2, %tmp2.2
246 %tmp6.3 = add i32 %tmp5.3, %tmp2.3
247 %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
248 %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
249 %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
250 %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
; Linear add chain over the four lanes; this is the reduction root.
251 %reduce.0 = add i32 %tmp7.1, %tmp7.0
252 %reduce.1 = add i32 %reduce.0, %tmp7.2
253 %reduce.2 = add i32 %reduce.1, %tmp7.3