1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LE %s
3 ; RUN: llc < %s -mtriple aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
; Truncates %a and %b from <2 x i64> to <2 x i32>, bitcasts each result to
; <4 x i16>, then shuffles the pair with mask <0, 2, 4, 6>: elements 0 and 2
; of %a2 followed by elements 0 and 2 of %b2.
5 define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) {
6 ; CHECK-LE-LABEL: test_combine_v4i16_v2i64:
8 ; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
9 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
12 ; CHECK-BE-LABEL: test_combine_v4i16_v2i64:
14 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
15 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
16 ; CHECK-BE-NEXT: xtn v0.2s, v0.2d
17 ; CHECK-BE-NEXT: xtn v1.2s, v1.2d
18 ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
19 ; CHECK-BE-NEXT: rev32 v1.4h, v1.4h
20 ; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
21 ; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
23 %a1 = trunc <2 x i64> %a to <2 x i32>
24 %b1 = trunc <2 x i64> %b to <2 x i32>
26 %a2 = bitcast <2 x i32> %a1 to <4 x i16>
27 %b2 = bitcast <2 x i32> %b1 to <4 x i16>
29 %ab = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Truncates %a and %b from <4 x i32> to <4 x i16> (no bitcast is involved),
; then shuffles the pair with mask <0, 2, 4, 6>: elements 0 and 2 of %a1
; followed by elements 0 and 2 of %b1.
33 define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) {
34 ; CHECK-LE-LABEL: test_combine_v4i16_v4i32:
36 ; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
37 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
40 ; CHECK-BE-LABEL: test_combine_v4i16_v4i32:
42 ; CHECK-BE-NEXT: rev64 v1.4s, v1.4s
43 ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
44 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
45 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
46 ; CHECK-BE-NEXT: xtn v0.4h, v0.4s
47 ; CHECK-BE-NEXT: xtn v1.4h, v1.4s
48 ; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
49 ; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
51 %a1 = trunc <4 x i32> %a to <4 x i16>
52 %b1 = trunc <4 x i32> %b to <4 x i16>
54 %ab = shufflevector <4 x i16> %a1, <4 x i16> %b1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Truncates %a and %b from <8 x i16> to <8 x i8>, bitcasts each result to
; <4 x i16>, then shuffles the pair with mask <0, 2, 4, 6>: elements 0 and 2
; of %a2 followed by elements 0 and 2 of %b2.
58 define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
59 ; CHECK-LE-LABEL: test_combine_v4i16_v8i16:
61 ; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
62 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
65 ; CHECK-BE-LABEL: test_combine_v4i16_v8i16:
67 ; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
68 ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
69 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
70 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
71 ; CHECK-BE-NEXT: xtn v0.8b, v0.8h
72 ; CHECK-BE-NEXT: xtn v1.8b, v1.8h
73 ; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
74 ; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
75 ; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
76 ; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
78 %a1 = trunc <8 x i16> %a to <8 x i8>
79 %b1 = trunc <8 x i16> %b to <8 x i8>
81 %a2 = bitcast <8 x i8> %a1 to <4 x i16>
82 %b2 = bitcast <8 x i8> %b1 to <4 x i16>
84 %ab = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Truncates %a and %b from <2 x i64> to <2 x i32>, bitcasts each result to
; <8 x i8>, then shuffles the pair with mask <0, 2, ..., 14>: the even-indexed
; elements of %a2 followed by the even-indexed elements of %b2.
89 define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) {
90 ; CHECK-LE-LABEL: test_combine_v8i8_v2i64:
92 ; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
93 ; CHECK-LE-NEXT: xtn v0.8b, v0.8h
96 ; CHECK-BE-LABEL: test_combine_v8i8_v2i64:
98 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
99 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
100 ; CHECK-BE-NEXT: xtn v0.2s, v0.2d
101 ; CHECK-BE-NEXT: xtn v1.2s, v1.2d
102 ; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
103 ; CHECK-BE-NEXT: rev32 v1.8b, v1.8b
104 ; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
105 ; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
107 %a1 = trunc <2 x i64> %a to <2 x i32>
108 %b1 = trunc <2 x i64> %b to <2 x i32>
110 %a2 = bitcast <2 x i32> %a1 to <8 x i8>
111 %b2 = bitcast <2 x i32> %b1 to <8 x i8>
113 %ab = shufflevector <8 x i8> %a2, <8 x i8> %b2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; Truncates %a and %b from <4 x i32> to <4 x i16>, bitcasts each result to
; <8 x i8>, then shuffles the pair with mask <0, 2, ..., 14>: the even-indexed
; elements of %a2 followed by the even-indexed elements of %b2.
117 define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) {
118 ; CHECK-LE-LABEL: test_combine_v8i8_v4i32:
119 ; CHECK-LE: // %bb.0:
120 ; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
121 ; CHECK-LE-NEXT: xtn v0.8b, v0.8h
124 ; CHECK-BE-LABEL: test_combine_v8i8_v4i32:
125 ; CHECK-BE: // %bb.0:
126 ; CHECK-BE-NEXT: rev64 v1.4s, v1.4s
127 ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
128 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
129 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
130 ; CHECK-BE-NEXT: xtn v0.4h, v0.4s
131 ; CHECK-BE-NEXT: xtn v1.4h, v1.4s
132 ; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
133 ; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
134 ; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
135 ; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
137 %a1 = trunc <4 x i32> %a to <4 x i16>
138 %b1 = trunc <4 x i32> %b to <4 x i16>
140 %a2 = bitcast <4 x i16> %a1 to <8 x i8>
141 %b2 = bitcast <4 x i16> %b1 to <8 x i8>
143 %ab = shufflevector <8 x i8> %a2, <8 x i8> %b2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; Truncates %a and %b from <8 x i16> to <8 x i8> (no bitcast is involved),
; then shuffles the pair with mask <0, 2, ..., 14>: the even-indexed elements
; of %a1 followed by the even-indexed elements of %b1.
147 define <8 x i8> @test_combine_v8i8_v8i16(<8 x i16> %a, <8 x i16> %b) {
148 ; CHECK-LE-LABEL: test_combine_v8i8_v8i16:
149 ; CHECK-LE: // %bb.0:
150 ; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
151 ; CHECK-LE-NEXT: xtn v0.8b, v0.8h
154 ; CHECK-BE-LABEL: test_combine_v8i8_v8i16:
155 ; CHECK-BE: // %bb.0:
156 ; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
157 ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
158 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
159 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
160 ; CHECK-BE-NEXT: xtn v0.8b, v0.8h
161 ; CHECK-BE-NEXT: xtn v1.8b, v1.8h
162 ; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
163 ; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
165 %a1 = trunc <8 x i16> %a to <8 x i8>
166 %b1 = trunc <8 x i16> %b to <8 x i8>
168 %ab = shufflevector <8 x i8> %a1, <8 x i8> %b1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; Truncates %a and %b from <2 x i64> to <2 x i32> (no bitcast is involved),
; then shuffles the pair with mask <0, 2>: element 0 of %a1 followed by
; element 0 of %b1.
172 define <2 x i32> @test_combine_v2i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
173 ; CHECK-LE-LABEL: test_combine_v2i32_v2i64:
174 ; CHECK-LE: // %bb.0:
175 ; CHECK-LE-NEXT: xtn v0.2s, v0.2d
176 ; CHECK-LE-NEXT: xtn v1.2s, v1.2d
177 ; CHECK-LE-NEXT: zip1 v0.2s, v0.2s, v1.2s
180 ; CHECK-BE-LABEL: test_combine_v2i32_v2i64:
181 ; CHECK-BE: // %bb.0:
182 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
183 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
184 ; CHECK-BE-NEXT: xtn v0.2s, v0.2d
185 ; CHECK-BE-NEXT: xtn v1.2s, v1.2d
186 ; CHECK-BE-NEXT: zip1 v0.2s, v0.2s, v1.2s
187 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
189 %a1 = trunc <2 x i64> %a to <2 x i32>
190 %b1 = trunc <2 x i64> %b to <2 x i32>
192 %ab = shufflevector <2 x i32> %a1, <2 x i32> %b1, <2 x i32> <i32 0, i32 2>
; Truncates %a and %b from <4 x i32> to <4 x i16>, bitcasts each result to
; <2 x i32>, then shuffles the pair with mask <0, 2>: element 0 of %a2
; followed by element 0 of %b2.
196 define <2 x i32> @test_combine_v2i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
197 ; CHECK-LE-LABEL: test_combine_v2i32_v4i32:
198 ; CHECK-LE: // %bb.0:
199 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
200 ; CHECK-LE-NEXT: xtn v1.4h, v1.4s
201 ; CHECK-LE-NEXT: zip1 v0.2s, v0.2s, v1.2s
204 ; CHECK-BE-LABEL: test_combine_v2i32_v4i32:
205 ; CHECK-BE: // %bb.0:
206 ; CHECK-BE-NEXT: rev64 v1.4s, v1.4s
207 ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
208 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
209 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
210 ; CHECK-BE-NEXT: xtn v0.4h, v0.4s
211 ; CHECK-BE-NEXT: xtn v1.4h, v1.4s
212 ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
213 ; CHECK-BE-NEXT: rev32 v1.4h, v1.4h
214 ; CHECK-BE-NEXT: zip1 v0.2s, v0.2s, v1.2s
215 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
217 %a1 = trunc <4 x i32> %a to <4 x i16>
218 %b1 = trunc <4 x i32> %b to <4 x i16>
220 %a2 = bitcast <4 x i16> %a1 to <2 x i32>
221 %b2 = bitcast <4 x i16> %b1 to <2 x i32>
223 %ab = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
; Truncates %a and %b from <8 x i16> to <8 x i8>, bitcasts each result to
; <2 x i32>, then shuffles the pair with mask <0, 2>: element 0 of %a2
; followed by element 0 of %b2.
227 define <2 x i32> @test_combine_v2i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
228 ; CHECK-LE-LABEL: test_combine_v2i32_v8i16:
229 ; CHECK-LE: // %bb.0:
230 ; CHECK-LE-NEXT: xtn v0.8b, v0.8h
231 ; CHECK-LE-NEXT: xtn v1.8b, v1.8h
232 ; CHECK-LE-NEXT: zip1 v0.2s, v0.2s, v1.2s
235 ; CHECK-BE-LABEL: test_combine_v2i32_v8i16:
236 ; CHECK-BE: // %bb.0:
237 ; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
238 ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
239 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
240 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
241 ; CHECK-BE-NEXT: xtn v0.8b, v0.8h
242 ; CHECK-BE-NEXT: xtn v1.8b, v1.8h
243 ; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
244 ; CHECK-BE-NEXT: rev32 v1.8b, v1.8b
245 ; CHECK-BE-NEXT: zip1 v0.2s, v0.2s, v1.2s
246 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
248 %a1 = trunc <8 x i16> %a to <8 x i8>
249 %b1 = trunc <8 x i16> %b to <8 x i8>
251 %a2 = bitcast <8 x i8> %a1 to <2 x i32>
252 %b2 = bitcast <8 x i8> %b1 to <2 x i32>
254 %ab = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
; Truncates a single <4 x i64> input directly to <4 x i8> and passes the
; result to the llvm.vector.reduce.add.v4i8 intrinsic, producing the i8 value
; %res.
258 define i8 @trunc_v4i64_v4i8(<4 x i64> %input) {
259 ; CHECK-LE-LABEL: trunc_v4i64_v4i8:
260 ; CHECK-LE: // %bb.0:
261 ; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
262 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
263 ; CHECK-LE-NEXT: addv h0, v0.4h
264 ; CHECK-LE-NEXT: fmov w0, s0
267 ; CHECK-BE-LABEL: trunc_v4i64_v4i8:
268 ; CHECK-BE: // %bb.0:
269 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
270 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
271 ; CHECK-BE-NEXT: xtn v1.2s, v1.2d
272 ; CHECK-BE-NEXT: xtn v0.2s, v0.2d
273 ; CHECK-BE-NEXT: rev32 v1.4h, v1.4h
274 ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
275 ; CHECK-BE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
276 ; CHECK-BE-NEXT: addv h0, v0.4h
277 ; CHECK-BE-NEXT: fmov w0, s0
279 %var = trunc <4 x i64> %input to <4 x i8>
280 ; llvm.vector.reduce.add.v4i8 is needed to reproduce the codegen (see `trunc_v4i64_v4i8_ret` below as a comparison)
281 %res = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %var)
; Performs the same <4 x i64> to <4 x i8> truncation as @trunc_v4i64_v4i8 but
; without the vector.reduce.add call; the function's declared return type is
; the <4 x i8> vector itself.
285 define <4 x i8> @trunc_v4i64_v4i8_ret(<4 x i64> %input) {
286 ; CHECK-LE-LABEL: trunc_v4i64_v4i8_ret:
287 ; CHECK-LE: // %bb.0:
288 ; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
289 ; CHECK-LE-NEXT: xtn v0.4h, v0.4s
292 ; CHECK-BE-LABEL: trunc_v4i64_v4i8_ret:
293 ; CHECK-BE: // %bb.0:
294 ; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
295 ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
296 ; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
297 ; CHECK-BE-NEXT: xtn v0.4h, v0.4s
298 ; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
300 %var = trunc <4 x i64> %input to <4 x i8>
; Declaration of the add-reduction intrinsic called by @trunc_v4i64_v4i8.
304 declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)