1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3 ; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
5 ; CHECK-GI: warning: Instruction selection used fallback path for saddlp1d
6 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uaddlp1d
; addhn8b: calls llvm.aarch64.neon.addhn.v8i8 on two <8 x i16> vectors loaded
; from %A and %B. Expected output is two q-register loads feeding one
; "addhn v0.8b, v0.8h, v1.8h".
8 define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
9 ; CHECK-LABEL: addhn8b:
11 ; CHECK-NEXT: ldr q0, [x0]
12 ; CHECK-NEXT: ldr q1, [x1]
13 ; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h
15 %tmp1 = load <8 x i16>, ptr %A
16 %tmp2 = load <8 x i16>, ptr %B
17 %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
; addhn4h: calls llvm.aarch64.neon.addhn.v4i16 on two <4 x i32> vectors loaded
; from %A and %B. Expected output is two q-register loads feeding one
; "addhn v0.4h, v0.4s, v1.4s".
21 define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind {
22 ; CHECK-LABEL: addhn4h:
24 ; CHECK-NEXT: ldr q0, [x0]
25 ; CHECK-NEXT: ldr q1, [x1]
26 ; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s
28 %tmp1 = load <4 x i32>, ptr %A
29 %tmp2 = load <4 x i32>, ptr %B
30 %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
; addhn2s: calls llvm.aarch64.neon.addhn.v2i32 on two <2 x i64> vectors loaded
; from %A and %B. Expected output is two q-register loads feeding one
; "addhn v0.2s, v0.2d, v1.2d".
34 define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind {
35 ; CHECK-LABEL: addhn2s:
37 ; CHECK-NEXT: ldr q0, [x0]
38 ; CHECK-NEXT: ldr q1, [x1]
39 ; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d
41 %tmp1 = load <2 x i64>, ptr %A
42 %tmp2 = load <2 x i64>, ptr %B
43 %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
; addhn2_16b: two identical addhn.v8i8 calls on %a/%b are concatenated by a
; shufflevector into a <16 x i8>. Expected output writes the low half with
; addhn and the high half with addhn2 into v2, then moves v2 to v0.
47 define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
48 ; CHECK-LABEL: addhn2_16b:
50 ; CHECK-NEXT: addhn v2.8b, v0.8h, v1.8h
51 ; CHECK-NEXT: addhn2 v2.16b, v0.8h, v1.8h
52 ; CHECK-NEXT: mov v0.16b, v2.16b
54 %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
55 %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
56 %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; addhn2_8h: two identical addhn.v4i16 calls on %a/%b are concatenated by a
; shufflevector into an <8 x i16>. Expected output writes the low half with
; addhn and the high half with addhn2 into v2, then moves v2 to v0.
60 define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
61 ; CHECK-LABEL: addhn2_8h:
63 ; CHECK-NEXT: addhn v2.4h, v0.4s, v1.4s
64 ; CHECK-NEXT: addhn2 v2.8h, v0.4s, v1.4s
65 ; CHECK-NEXT: mov v0.16b, v2.16b
67 %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
68 %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
69 %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; addhn2_4s: two identical addhn.v2i32 calls on %a/%b are concatenated by a
; shufflevector into a <4 x i32>. Expected output writes the low half with
; addhn and the high half with addhn2 into v2, then moves v2 to v0.
73 define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
74 ; CHECK-LABEL: addhn2_4s:
76 ; CHECK-NEXT: addhn v2.2s, v0.2d, v1.2d
77 ; CHECK-NEXT: addhn2 v2.4s, v0.2d, v1.2d
78 ; CHECK-NEXT: mov v0.16b, v2.16b
80 %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
81 %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
82 %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
86 declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
87 declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
88 declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
; raddhn8b: calls llvm.aarch64.neon.raddhn.v8i8 on two <8 x i16> vectors loaded
; from %A and %B. Expected output is two q-register loads feeding one
; "raddhn v0.8b, v0.8h, v1.8h".
91 define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind {
92 ; CHECK-LABEL: raddhn8b:
94 ; CHECK-NEXT: ldr q0, [x0]
95 ; CHECK-NEXT: ldr q1, [x1]
96 ; CHECK-NEXT: raddhn v0.8b, v0.8h, v1.8h
98 %tmp1 = load <8 x i16>, ptr %A
99 %tmp2 = load <8 x i16>, ptr %B
100 %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
; raddhn4h: calls llvm.aarch64.neon.raddhn.v4i16 on two <4 x i32> vectors
; loaded from %A and %B. Expected output is two q-register loads feeding one
; "raddhn v0.4h, v0.4s, v1.4s".
104 define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind {
105 ; CHECK-LABEL: raddhn4h:
107 ; CHECK-NEXT: ldr q0, [x0]
108 ; CHECK-NEXT: ldr q1, [x1]
109 ; CHECK-NEXT: raddhn v0.4h, v0.4s, v1.4s
111 %tmp1 = load <4 x i32>, ptr %A
112 %tmp2 = load <4 x i32>, ptr %B
113 %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
; raddhn2s: calls llvm.aarch64.neon.raddhn.v2i32 on two <2 x i64> vectors
; loaded from %A and %B. Expected output is two q-register loads feeding one
; "raddhn v0.2s, v0.2d, v1.2d".
117 define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind {
118 ; CHECK-LABEL: raddhn2s:
120 ; CHECK-NEXT: ldr q0, [x0]
121 ; CHECK-NEXT: ldr q1, [x1]
122 ; CHECK-NEXT: raddhn v0.2s, v0.2d, v1.2d
124 %tmp1 = load <2 x i64>, ptr %A
125 %tmp2 = load <2 x i64>, ptr %B
126 %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
; raddhn2_16b: two identical raddhn.v8i8 calls on %a/%b are concatenated by a
; shufflevector into a <16 x i8>. Expected output writes the low half with
; raddhn and the high half with raddhn2 into v2, then moves v2 to v0.
130 define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
131 ; CHECK-LABEL: raddhn2_16b:
133 ; CHECK-NEXT: raddhn v2.8b, v0.8h, v1.8h
134 ; CHECK-NEXT: raddhn2 v2.16b, v0.8h, v1.8h
135 ; CHECK-NEXT: mov v0.16b, v2.16b
137 %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
138 %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
139 %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; raddhn2_8h: two identical raddhn.v4i16 calls on %a/%b are concatenated by a
; shufflevector into an <8 x i16>. Expected output writes the low half with
; raddhn and the high half with raddhn2 into v2, then moves v2 to v0.
143 define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
144 ; CHECK-LABEL: raddhn2_8h:
146 ; CHECK-NEXT: raddhn v2.4h, v0.4s, v1.4s
147 ; CHECK-NEXT: raddhn2 v2.8h, v0.4s, v1.4s
148 ; CHECK-NEXT: mov v0.16b, v2.16b
150 %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
151 %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
152 %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; raddhn2_4s: two identical raddhn.v2i32 calls on %a/%b are concatenated by a
; shufflevector into a <4 x i32>. Expected output writes the low half with
; raddhn and the high half with raddhn2 into v2, then moves v2 to v0.
156 define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
157 ; CHECK-LABEL: raddhn2_4s:
159 ; CHECK-NEXT: raddhn v2.2s, v0.2d, v1.2d
160 ; CHECK-NEXT: raddhn2 v2.4s, v0.2d, v1.2d
161 ; CHECK-NEXT: mov v0.16b, v2.16b
163 %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
164 %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
165 %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
169 declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
170 declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
171 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
; saddl8h: sign-extends two loaded <8 x i8> vectors to <8 x i16> and adds them.
; Expected output is two d-register loads and a single saddl (no separate
; extend instructions).
173 define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind {
174 ; CHECK-LABEL: saddl8h:
176 ; CHECK-NEXT: ldr d0, [x0]
177 ; CHECK-NEXT: ldr d1, [x1]
178 ; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b
180 %tmp1 = load <8 x i8>, ptr %A
181 %tmp2 = load <8 x i8>, ptr %B
182 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
183 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
184 %tmp5 = add <8 x i16> %tmp3, %tmp4
; saddl4s: sign-extends two loaded <4 x i16> vectors to <4 x i32> and adds
; them. Expected output is two d-register loads and a single saddl.
188 define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind {
189 ; CHECK-LABEL: saddl4s:
191 ; CHECK-NEXT: ldr d0, [x0]
192 ; CHECK-NEXT: ldr d1, [x1]
193 ; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
195 %tmp1 = load <4 x i16>, ptr %A
196 %tmp2 = load <4 x i16>, ptr %B
197 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
198 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
199 %tmp5 = add <4 x i32> %tmp3, %tmp4
; saddl2d: sign-extends two loaded <2 x i32> vectors to <2 x i64> and adds
; them. Expected output is two d-register loads and a single saddl.
203 define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind {
204 ; CHECK-LABEL: saddl2d:
206 ; CHECK-NEXT: ldr d0, [x0]
207 ; CHECK-NEXT: ldr d1, [x1]
208 ; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
210 %tmp1 = load <2 x i32>, ptr %A
211 %tmp2 = load <2 x i32>, ptr %B
212 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
213 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
214 %tmp5 = add <2 x i64> %tmp3, %tmp4
; saddl2_8h: extracts the upper 64 bits of each <16 x i8> argument (bitcast to
; <2 x i64>, select lane 1, bitcast to <8 x i8>), sign-extends both and adds.
; Expected output is a single saddl2 operating on the full 128-bit registers.
218 define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
219 ; CHECK-LABEL: saddl2_8h:
221 ; CHECK-NEXT: saddl2 v0.8h, v0.16b, v1.16b
223 %tmp = bitcast <16 x i8> %a to <2 x i64>
224 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
225 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
226 %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
227 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
228 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
229 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
230 %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
231 %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
; saddl2_4s: extracts the upper 64 bits of each <8 x i16> argument (bitcast to
; <2 x i64>, select lane 1, bitcast to <4 x i16>), sign-extends both and adds.
; Expected output is a single saddl2 operating on the full 128-bit registers.
235 define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
236 ; CHECK-LABEL: saddl2_4s:
238 ; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h
240 %tmp = bitcast <8 x i16> %a to <2 x i64>
241 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
242 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
243 %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
244 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
245 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
246 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
247 %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
248 %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
; saddl2_2d: extracts the upper 64 bits of each <4 x i32> argument (bitcast to
; <2 x i64>, select lane 1, bitcast to <2 x i32>), sign-extends both and adds.
; Expected output is a single saddl2 operating on the full 128-bit registers.
252 define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
253 ; CHECK-LABEL: saddl2_2d:
255 ; CHECK-NEXT: saddl2 v0.2d, v0.4s, v1.4s
257 %tmp = bitcast <4 x i32> %a to <2 x i64>
258 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
259 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
260 %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
261 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
262 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
263 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
264 %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
265 %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddl8h: zero-extends two loaded <8 x i8> vectors to <8 x i16> and adds them.
; Expected output is two d-register loads and a single uaddl.
269 define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind {
270 ; CHECK-LABEL: uaddl8h:
272 ; CHECK-NEXT: ldr d0, [x0]
273 ; CHECK-NEXT: ldr d1, [x1]
274 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
276 %tmp1 = load <8 x i8>, ptr %A
277 %tmp2 = load <8 x i8>, ptr %B
278 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
279 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
280 %tmp5 = add <8 x i16> %tmp3, %tmp4
; uaddl4s: zero-extends two loaded <4 x i16> vectors to <4 x i32> and adds
; them. Expected output is two d-register loads and a single uaddl.
284 define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind {
285 ; CHECK-LABEL: uaddl4s:
287 ; CHECK-NEXT: ldr d0, [x0]
288 ; CHECK-NEXT: ldr d1, [x1]
289 ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
291 %tmp1 = load <4 x i16>, ptr %A
292 %tmp2 = load <4 x i16>, ptr %B
293 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
294 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
295 %tmp5 = add <4 x i32> %tmp3, %tmp4
; uaddl2d: zero-extends two loaded <2 x i32> vectors to <2 x i64> and adds
; them. Expected output is two d-register loads and a single uaddl.
299 define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind {
300 ; CHECK-LABEL: uaddl2d:
302 ; CHECK-NEXT: ldr d0, [x0]
303 ; CHECK-NEXT: ldr d1, [x1]
304 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
306 %tmp1 = load <2 x i32>, ptr %A
307 %tmp2 = load <2 x i32>, ptr %B
308 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
309 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
310 %tmp5 = add <2 x i64> %tmp3, %tmp4
; uaddl2_8h: extracts the upper 64 bits of each <16 x i8> argument (bitcast to
; <2 x i64>, select lane 1, bitcast to <8 x i8>), zero-extends both and adds.
; Expected output is a single uaddl2 operating on the full 128-bit registers.
315 define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
316 ; CHECK-LABEL: uaddl2_8h:
318 ; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b
320 %tmp = bitcast <16 x i8> %a to <2 x i64>
321 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
322 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
323 %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
324 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
325 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
326 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
327 %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
328 %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddl2_4s: extracts the upper 64 bits of each <8 x i16> argument (bitcast to
; <2 x i64>, select lane 1, bitcast to <4 x i16>), zero-extends both and adds.
; Expected output is a single uaddl2 operating on the full 128-bit registers.
332 define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
333 ; CHECK-LABEL: uaddl2_4s:
335 ; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h
337 %tmp = bitcast <8 x i16> %a to <2 x i64>
338 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
339 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
340 %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
341 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
342 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
343 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
344 %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
345 %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddl2_2d: extracts the upper 64 bits of each <4 x i32> argument (bitcast to
; <2 x i64>, select lane 1, bitcast to <2 x i32>), zero-extends both and adds.
; Expected output is a single uaddl2 operating on the full 128-bit registers.
349 define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
350 ; CHECK-LABEL: uaddl2_2d:
352 ; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s
354 %tmp = bitcast <4 x i32> %a to <2 x i64>
355 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
356 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
357 %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
358 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
359 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
360 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
361 %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
362 %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddw8h: adds a loaded <8 x i16> to the zero-extension of a loaded <8 x i8>.
; Expected output is a q load, a d load and a single uaddw.
366 define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind {
367 ; CHECK-LABEL: uaddw8h:
369 ; CHECK-NEXT: ldr q0, [x0]
370 ; CHECK-NEXT: ldr d1, [x1]
371 ; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b
373 %tmp1 = load <8 x i16>, ptr %A
374 %tmp2 = load <8 x i8>, ptr %B
375 %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
376 %tmp4 = add <8 x i16> %tmp1, %tmp3
; uaddw4s: adds a loaded <4 x i32> to the zero-extension of a loaded <4 x i16>.
; Expected output is a q load, a d load and a single uaddw.
380 define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind {
381 ; CHECK-LABEL: uaddw4s:
383 ; CHECK-NEXT: ldr q0, [x0]
384 ; CHECK-NEXT: ldr d1, [x1]
385 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
387 %tmp1 = load <4 x i32>, ptr %A
388 %tmp2 = load <4 x i16>, ptr %B
389 %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
390 %tmp4 = add <4 x i32> %tmp1, %tmp3
; uaddw2d: adds a loaded <2 x i64> to the zero-extension of a loaded <2 x i32>.
; Expected output is a q load, a d load and a single uaddw.
394 define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind {
395 ; CHECK-LABEL: uaddw2d:
397 ; CHECK-NEXT: ldr q0, [x0]
398 ; CHECK-NEXT: ldr d1, [x1]
399 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s
401 %tmp1 = load <2 x i64>, ptr %A
402 %tmp2 = load <2 x i32>, ptr %B
403 %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
404 %tmp4 = add <2 x i64> %tmp1, %tmp3
; uaddw2_8h: adds a loaded <8 x i16> to the zero-extension of lanes 8-15 of a
; loaded <16 x i8>. The two RUN configurations expect different code: the
; default run loads only the upper 8 bytes (offset #8) and uses uaddw, while
; the -global-isel run loads the full vector and uses uaddw2.
408 define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind {
409 ; CHECK-SD-LABEL: uaddw2_8h:
410 ; CHECK-SD: // %bb.0:
411 ; CHECK-SD-NEXT: ldr q0, [x0]
412 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
413 ; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b
416 ; CHECK-GI-LABEL: uaddw2_8h:
417 ; CHECK-GI: // %bb.0:
418 ; CHECK-GI-NEXT: ldr q0, [x0]
419 ; CHECK-GI-NEXT: ldr q1, [x1]
420 ; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b
422 %tmp1 = load <8 x i16>, ptr %A
424 %tmp2 = load <16 x i8>, ptr %B
425 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
426 %ext2 = zext <8 x i8> %high2 to <8 x i16>
428 %res = add <8 x i16> %tmp1, %ext2
; uaddw2_4s: adds a loaded <4 x i32> to the zero-extension of lanes 4-7 of a
; loaded <8 x i16>. The two RUN configurations expect different code: the
; default run loads only the upper 8 bytes (offset #8) and uses uaddw, while
; the -global-isel run loads the full vector and uses uaddw2.
432 define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind {
433 ; CHECK-SD-LABEL: uaddw2_4s:
434 ; CHECK-SD: // %bb.0:
435 ; CHECK-SD-NEXT: ldr q0, [x0]
436 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
437 ; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h
440 ; CHECK-GI-LABEL: uaddw2_4s:
441 ; CHECK-GI: // %bb.0:
442 ; CHECK-GI-NEXT: ldr q0, [x0]
443 ; CHECK-GI-NEXT: ldr q1, [x1]
444 ; CHECK-GI-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
446 %tmp1 = load <4 x i32>, ptr %A
448 %tmp2 = load <8 x i16>, ptr %B
449 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
450 %ext2 = zext <4 x i16> %high2 to <4 x i32>
452 %res = add <4 x i32> %tmp1, %ext2
; uaddw2_2d: adds a loaded <2 x i64> to the zero-extension of lanes 2-3 of a
; loaded <4 x i32>. The two RUN configurations expect different code: the
; default run loads only the upper 8 bytes (offset #8) and uses uaddw, while
; the -global-isel run loads the full vector and uses uaddw2.
456 define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind {
457 ; CHECK-SD-LABEL: uaddw2_2d:
458 ; CHECK-SD: // %bb.0:
459 ; CHECK-SD-NEXT: ldr q0, [x0]
460 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
461 ; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s
464 ; CHECK-GI-LABEL: uaddw2_2d:
465 ; CHECK-GI: // %bb.0:
466 ; CHECK-GI-NEXT: ldr q0, [x0]
467 ; CHECK-GI-NEXT: ldr q1, [x1]
468 ; CHECK-GI-NEXT: uaddw2 v0.2d, v0.2d, v1.4s
470 %tmp1 = load <2 x i64>, ptr %A
472 %tmp2 = load <4 x i32>, ptr %B
473 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
474 %ext2 = zext <2 x i32> %high2 to <2 x i64>
476 %res = add <2 x i64> %tmp1, %ext2
; saddw8h: adds a loaded <8 x i16> to the sign-extension of a loaded <8 x i8>.
; Expected output is a q load, a d load and a single saddw.
480 define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind {
481 ; CHECK-LABEL: saddw8h:
483 ; CHECK-NEXT: ldr q0, [x0]
484 ; CHECK-NEXT: ldr d1, [x1]
485 ; CHECK-NEXT: saddw v0.8h, v0.8h, v1.8b
487 %tmp1 = load <8 x i16>, ptr %A
488 %tmp2 = load <8 x i8>, ptr %B
489 %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
490 %tmp4 = add <8 x i16> %tmp1, %tmp3
; saddw4s: adds a loaded <4 x i32> to the sign-extension of a loaded <4 x i16>.
; Expected output is a q load, a d load and a single saddw.
494 define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind {
495 ; CHECK-LABEL: saddw4s:
497 ; CHECK-NEXT: ldr q0, [x0]
498 ; CHECK-NEXT: ldr d1, [x1]
499 ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
501 %tmp1 = load <4 x i32>, ptr %A
502 %tmp2 = load <4 x i16>, ptr %B
503 %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
504 %tmp4 = add <4 x i32> %tmp1, %tmp3
; saddw2d: adds a loaded <2 x i64> to the sign-extension of a loaded <2 x i32>.
; Expected output is a q load, a d load and a single saddw.
508 define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind {
509 ; CHECK-LABEL: saddw2d:
511 ; CHECK-NEXT: ldr q0, [x0]
512 ; CHECK-NEXT: ldr d1, [x1]
513 ; CHECK-NEXT: saddw v0.2d, v0.2d, v1.2s
515 %tmp1 = load <2 x i64>, ptr %A
516 %tmp2 = load <2 x i32>, ptr %B
517 %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
518 %tmp4 = add <2 x i64> %tmp1, %tmp3
; saddw2_8h: adds a loaded <8 x i16> to the sign-extension of lanes 8-15 of a
; loaded <16 x i8>. The two RUN configurations expect different code: the
; default run loads only the upper 8 bytes (offset #8) and uses saddw, while
; the -global-isel run loads the full vector and uses saddw2.
522 define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind {
523 ; CHECK-SD-LABEL: saddw2_8h:
524 ; CHECK-SD: // %bb.0:
525 ; CHECK-SD-NEXT: ldr q0, [x0]
526 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
527 ; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v1.8b
530 ; CHECK-GI-LABEL: saddw2_8h:
531 ; CHECK-GI: // %bb.0:
532 ; CHECK-GI-NEXT: ldr q0, [x0]
533 ; CHECK-GI-NEXT: ldr q1, [x1]
534 ; CHECK-GI-NEXT: saddw2 v0.8h, v0.8h, v1.16b
536 %tmp1 = load <8 x i16>, ptr %A
538 %tmp2 = load <16 x i8>, ptr %B
539 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
540 %ext2 = sext <8 x i8> %high2 to <8 x i16>
542 %res = add <8 x i16> %tmp1, %ext2
; saddw2_4s: adds a loaded <4 x i32> to the sign-extension of lanes 4-7 of a
; loaded <8 x i16>. The two RUN configurations expect different code: the
; default run loads only the upper 8 bytes (offset #8) and uses saddw, while
; the -global-isel run loads the full vector and uses saddw2.
546 define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind {
547 ; CHECK-SD-LABEL: saddw2_4s:
548 ; CHECK-SD: // %bb.0:
549 ; CHECK-SD-NEXT: ldr q0, [x0]
550 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
551 ; CHECK-SD-NEXT: saddw v0.4s, v0.4s, v1.4h
554 ; CHECK-GI-LABEL: saddw2_4s:
555 ; CHECK-GI: // %bb.0:
556 ; CHECK-GI-NEXT: ldr q0, [x0]
557 ; CHECK-GI-NEXT: ldr q1, [x1]
558 ; CHECK-GI-NEXT: saddw2 v0.4s, v0.4s, v1.8h
560 %tmp1 = load <4 x i32>, ptr %A
562 %tmp2 = load <8 x i16>, ptr %B
563 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
564 %ext2 = sext <4 x i16> %high2 to <4 x i32>
566 %res = add <4 x i32> %tmp1, %ext2
; saddw2_2d: adds a loaded <2 x i64> to the sign-extension of lanes 2-3 of a
; loaded <4 x i32>. The two RUN configurations expect different code: the
; default run loads only the upper 8 bytes (offset #8) and uses saddw, while
; the -global-isel run loads the full vector and uses saddw2.
570 define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind {
571 ; CHECK-SD-LABEL: saddw2_2d:
572 ; CHECK-SD: // %bb.0:
573 ; CHECK-SD-NEXT: ldr q0, [x0]
574 ; CHECK-SD-NEXT: ldr d1, [x1, #8]
575 ; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v1.2s
578 ; CHECK-GI-LABEL: saddw2_2d:
579 ; CHECK-GI: // %bb.0:
580 ; CHECK-GI-NEXT: ldr q0, [x0]
581 ; CHECK-GI-NEXT: ldr q1, [x1]
582 ; CHECK-GI-NEXT: saddw2 v0.2d, v0.2d, v1.4s
584 %tmp1 = load <2 x i64>, ptr %A
586 %tmp2 = load <4 x i32>, ptr %B
587 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
588 %ext2 = sext <2 x i32> %high2 to <2 x i64>
590 %res = add <2 x i64> %tmp1, %ext2
; saddlp4h: calls llvm.aarch64.neon.saddlp on a loaded <8 x i8>, producing
; <4 x i16>. Expected output is a d load and "saddlp v0.4h, v0.8b".
594 define <4 x i16> @saddlp4h(ptr %A) nounwind {
595 ; CHECK-LABEL: saddlp4h:
597 ; CHECK-NEXT: ldr d0, [x0]
598 ; CHECK-NEXT: saddlp v0.4h, v0.8b
600 %tmp1 = load <8 x i8>, ptr %A
601 %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
; saddlp2s: calls llvm.aarch64.neon.saddlp on a loaded <4 x i16>, producing
; <2 x i32>. Expected output is a d load and "saddlp v0.2s, v0.4h".
605 define <2 x i32> @saddlp2s(ptr %A) nounwind {
606 ; CHECK-LABEL: saddlp2s:
608 ; CHECK-NEXT: ldr d0, [x0]
609 ; CHECK-NEXT: saddlp v0.2s, v0.4h
611 %tmp1 = load <4 x i16>, ptr %A
612 %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
; saddlp1d: calls llvm.aarch64.neon.saddlp on a loaded <2 x i32>, producing
; <1 x i64>. Expected output is a d load and "saddlp v0.1d, v0.2s". The
; -global-isel run is expected to report a fallback warning for this function
; (see the warning checks at the top of the file).
616 define <1 x i64> @saddlp1d(ptr %A) nounwind {
617 ; CHECK-LABEL: saddlp1d:
619 ; CHECK-NEXT: ldr d0, [x0]
620 ; CHECK-NEXT: saddlp v0.1d, v0.2s
622 %tmp1 = load <2 x i32>, ptr %A
623 %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
; saddlp8h: calls llvm.aarch64.neon.saddlp on a loaded <16 x i8>, producing
; <8 x i16>. Expected output is a q load and "saddlp v0.8h, v0.16b".
627 define <8 x i16> @saddlp8h(ptr %A) nounwind {
628 ; CHECK-LABEL: saddlp8h:
630 ; CHECK-NEXT: ldr q0, [x0]
631 ; CHECK-NEXT: saddlp v0.8h, v0.16b
633 %tmp1 = load <16 x i8>, ptr %A
634 %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
; saddlp4s: calls llvm.aarch64.neon.saddlp on a loaded <8 x i16>, producing
; <4 x i32>. Expected output is a q load and "saddlp v0.4s, v0.8h".
638 define <4 x i32> @saddlp4s(ptr %A) nounwind {
639 ; CHECK-LABEL: saddlp4s:
641 ; CHECK-NEXT: ldr q0, [x0]
642 ; CHECK-NEXT: saddlp v0.4s, v0.8h
644 %tmp1 = load <8 x i16>, ptr %A
645 %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
; saddlp2d: calls llvm.aarch64.neon.saddlp on a loaded <4 x i32>, producing
; <2 x i64>. Expected output is a q load and "saddlp v0.2d, v0.4s".
649 define <2 x i64> @saddlp2d(ptr %A) nounwind {
650 ; CHECK-LABEL: saddlp2d:
652 ; CHECK-NEXT: ldr q0, [x0]
653 ; CHECK-NEXT: saddlp v0.2d, v0.4s
655 %tmp1 = load <4 x i32>, ptr %A
656 %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
660 declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
661 declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
662 declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
664 declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
665 declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
666 declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
; uaddlp4h: calls llvm.aarch64.neon.uaddlp on a loaded <8 x i8>, producing
; <4 x i16>. Expected output is a d load and "uaddlp v0.4h, v0.8b".
668 define <4 x i16> @uaddlp4h(ptr %A) nounwind {
669 ; CHECK-LABEL: uaddlp4h:
671 ; CHECK-NEXT: ldr d0, [x0]
672 ; CHECK-NEXT: uaddlp v0.4h, v0.8b
674 %tmp1 = load <8 x i8>, ptr %A
675 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
; uaddlp2s: calls llvm.aarch64.neon.uaddlp on a loaded <4 x i16>, producing
; <2 x i32>. Expected output is a d load and "uaddlp v0.2s, v0.4h".
679 define <2 x i32> @uaddlp2s(ptr %A) nounwind {
680 ; CHECK-LABEL: uaddlp2s:
682 ; CHECK-NEXT: ldr d0, [x0]
683 ; CHECK-NEXT: uaddlp v0.2s, v0.4h
685 %tmp1 = load <4 x i16>, ptr %A
686 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
; uaddlp1d: calls llvm.aarch64.neon.uaddlp on a loaded <2 x i32>, producing
; <1 x i64>. Expected output is a d load and "uaddlp v0.1d, v0.2s". The
; -global-isel run is expected to report a fallback warning for this function
; (see the warning checks at the top of the file).
690 define <1 x i64> @uaddlp1d(ptr %A) nounwind {
691 ; CHECK-LABEL: uaddlp1d:
693 ; CHECK-NEXT: ldr d0, [x0]
694 ; CHECK-NEXT: uaddlp v0.1d, v0.2s
696 %tmp1 = load <2 x i32>, ptr %A
697 %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
; uaddlp8h: calls llvm.aarch64.neon.uaddlp on a loaded <16 x i8>, producing
; <8 x i16>. Expected output is a q load and "uaddlp v0.8h, v0.16b".
701 define <8 x i16> @uaddlp8h(ptr %A) nounwind {
702 ; CHECK-LABEL: uaddlp8h:
704 ; CHECK-NEXT: ldr q0, [x0]
705 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
707 %tmp1 = load <16 x i8>, ptr %A
708 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
; uaddlp4s: calls llvm.aarch64.neon.uaddlp on a loaded <8 x i16>, producing
; <4 x i32>. Expected output is a q load and "uaddlp v0.4s, v0.8h".
712 define <4 x i32> @uaddlp4s(ptr %A) nounwind {
713 ; CHECK-LABEL: uaddlp4s:
715 ; CHECK-NEXT: ldr q0, [x0]
716 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
718 %tmp1 = load <8 x i16>, ptr %A
719 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
; uaddlp2d: calls llvm.aarch64.neon.uaddlp on a loaded <4 x i32>, producing
; <2 x i64>. Expected output is a q load and "uaddlp v0.2d, v0.4s".
723 define <2 x i64> @uaddlp2d(ptr %A) nounwind {
724 ; CHECK-LABEL: uaddlp2d:
726 ; CHECK-NEXT: ldr q0, [x0]
727 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
729 %tmp1 = load <4 x i32>, ptr %A
730 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
734 declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
735 declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
736 declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
738 declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
739 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
740 declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
; sadalp4h: the saddlp of the <8 x i8> at %A is added to the <4 x i16> loaded
; from %B. Expected output folds the add into sadalp, with the %B value as the
; accumulator register (v0).
742 define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind {
743 ; CHECK-LABEL: sadalp4h:
745 ; CHECK-NEXT: ldr d1, [x0]
746 ; CHECK-NEXT: ldr d0, [x1]
747 ; CHECK-NEXT: sadalp v0.4h, v1.8b
749 %tmp1 = load <8 x i8>, ptr %A
750 %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
751 %tmp4 = load <4 x i16>, ptr %B
752 %tmp5 = add <4 x i16> %tmp3, %tmp4
; sadalp2s: the saddlp of the <4 x i16> at %A is added to the <2 x i32> loaded
; from %B. Expected output folds the add into sadalp, with the %B value as the
; accumulator register (v0).
756 define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind {
757 ; CHECK-LABEL: sadalp2s:
759 ; CHECK-NEXT: ldr d1, [x0]
760 ; CHECK-NEXT: ldr d0, [x1]
761 ; CHECK-NEXT: sadalp v0.2s, v1.4h
763 %tmp1 = load <4 x i16>, ptr %A
764 %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
765 %tmp4 = load <2 x i32>, ptr %B
766 %tmp5 = add <2 x i32> %tmp3, %tmp4
; sadalp8h: the saddlp of the <16 x i8> at %A is added to the <8 x i16> loaded
; from %B. Expected output folds the add into sadalp, with the %B value as the
; accumulator register (v0).
770 define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind {
771 ; CHECK-LABEL: sadalp8h:
773 ; CHECK-NEXT: ldr q1, [x0]
774 ; CHECK-NEXT: ldr q0, [x1]
775 ; CHECK-NEXT: sadalp v0.8h, v1.16b
777 %tmp1 = load <16 x i8>, ptr %A
778 %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
779 %tmp4 = load <8 x i16>, ptr %B
780 %tmp5 = add <8 x i16> %tmp3, %tmp4
; sadalp4s: the saddlp of the <8 x i16> at %A is added to the <4 x i32> loaded
; from %B. Expected output folds the add into sadalp, with the %B value as the
; accumulator register (v0).
784 define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind {
785 ; CHECK-LABEL: sadalp4s:
787 ; CHECK-NEXT: ldr q1, [x0]
788 ; CHECK-NEXT: ldr q0, [x1]
789 ; CHECK-NEXT: sadalp v0.4s, v1.8h
791 %tmp1 = load <8 x i16>, ptr %A
792 %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
793 %tmp4 = load <4 x i32>, ptr %B
794 %tmp5 = add <4 x i32> %tmp3, %tmp4
; sadalp2d: the saddlp of the <4 x i32> at %A is added to the <2 x i64> loaded
; from %B. Expected output folds the add into sadalp, with the %B value as the
; accumulator register (v0).
798 define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind {
799 ; CHECK-LABEL: sadalp2d:
801 ; CHECK-NEXT: ldr q1, [x0]
802 ; CHECK-NEXT: ldr q0, [x1]
803 ; CHECK-NEXT: sadalp v0.2d, v1.4s
805 %tmp1 = load <4 x i32>, ptr %A
806 %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
807 %tmp4 = load <2 x i64>, ptr %B
808 %tmp5 = add <2 x i64> %tmp3, %tmp4
; uadalp4h: the uaddlp of the <8 x i8> at %A is added to the <4 x i16> loaded
; from %B. Expected output folds the add into uadalp, with the %B value as the
; accumulator register (v0).
812 define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind {
813 ; CHECK-LABEL: uadalp4h:
815 ; CHECK-NEXT: ldr d1, [x0]
816 ; CHECK-NEXT: ldr d0, [x1]
817 ; CHECK-NEXT: uadalp v0.4h, v1.8b
819 %tmp1 = load <8 x i8>, ptr %A
820 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
821 %tmp4 = load <4 x i16>, ptr %B
822 %tmp5 = add <4 x i16> %tmp3, %tmp4
; uadalp2s: the uaddlp of the <4 x i16> at %A is added to the <2 x i32> loaded
; from %B. Expected output folds the add into uadalp, with the %B value as the
; accumulator register (v0).
826 define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind {
827 ; CHECK-LABEL: uadalp2s:
829 ; CHECK-NEXT: ldr d1, [x0]
830 ; CHECK-NEXT: ldr d0, [x1]
831 ; CHECK-NEXT: uadalp v0.2s, v1.4h
833 %tmp1 = load <4 x i16>, ptr %A
834 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
835 %tmp4 = load <2 x i32>, ptr %B
836 %tmp5 = add <2 x i32> %tmp3, %tmp4
; uadalp8h: the uaddlp of the <16 x i8> at %A is added to the <8 x i16> loaded
; from %B. Expected output folds the add into uadalp, with the %B value as the
; accumulator register (v0).
840 define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind {
841 ; CHECK-LABEL: uadalp8h:
843 ; CHECK-NEXT: ldr q1, [x0]
844 ; CHECK-NEXT: ldr q0, [x1]
845 ; CHECK-NEXT: uadalp v0.8h, v1.16b
847 %tmp1 = load <16 x i8>, ptr %A
848 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
849 %tmp4 = load <8 x i16>, ptr %B
850 %tmp5 = add <8 x i16> %tmp3, %tmp4
; uadalp4s: the uaddlp of the <8 x i16> at %A is added to the <4 x i32> loaded
; from %B. Expected output folds the add into uadalp, with the %B value as the
; accumulator register (v0).
854 define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind {
855 ; CHECK-LABEL: uadalp4s:
857 ; CHECK-NEXT: ldr q1, [x0]
858 ; CHECK-NEXT: ldr q0, [x1]
859 ; CHECK-NEXT: uadalp v0.4s, v1.8h
861 %tmp1 = load <8 x i16>, ptr %A
862 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
863 %tmp4 = load <4 x i32>, ptr %B
864 %tmp5 = add <4 x i32> %tmp3, %tmp4
; uadalp2d: the uaddlp of the <4 x i32> at %A is added to the <2 x i64> loaded
; from %B. Expected output folds the add into uadalp, with the %B value as the
; accumulator register (v0).
868 define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind {
869 ; CHECK-LABEL: uadalp2d:
871 ; CHECK-NEXT: ldr q1, [x0]
872 ; CHECK-NEXT: ldr q0, [x1]
873 ; CHECK-NEXT: uadalp v0.2d, v1.4s
875 %tmp1 = load <4 x i32>, ptr %A
876 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
877 %tmp4 = load <2 x i64>, ptr %B
878 %tmp5 = add <2 x i64> %tmp3, %tmp4
; addp_8b: calls llvm.aarch64.neon.addp.v8i8 on two loaded <8 x i8> vectors.
; Expected output is two d loads and "addp v0.8b, v0.8b, v1.8b".
882 define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind {
883 ; CHECK-LABEL: addp_8b:
885 ; CHECK-NEXT: ldr d0, [x0]
886 ; CHECK-NEXT: ldr d1, [x1]
887 ; CHECK-NEXT: addp v0.8b, v0.8b, v1.8b
889 %tmp1 = load <8 x i8>, ptr %A
890 %tmp2 = load <8 x i8>, ptr %B
891 %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
; addp_16b: calls llvm.aarch64.neon.addp.v16i8 on two loaded <16 x i8> vectors.
; Expected output is two q loads and "addp v0.16b, v0.16b, v1.16b".
895 define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind {
896 ; CHECK-LABEL: addp_16b:
898 ; CHECK-NEXT: ldr q0, [x0]
899 ; CHECK-NEXT: ldr q1, [x1]
900 ; CHECK-NEXT: addp v0.16b, v0.16b, v1.16b
902 %tmp1 = load <16 x i8>, ptr %A
903 %tmp2 = load <16 x i8>, ptr %B
904 %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
; addp_4h: calls llvm.aarch64.neon.addp.v4i16 on two loaded <4 x i16> vectors.
; Expected output is two d loads and "addp v0.4h, v0.4h, v1.4h".
908 define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind {
909 ; CHECK-LABEL: addp_4h:
911 ; CHECK-NEXT: ldr d0, [x0]
912 ; CHECK-NEXT: ldr d1, [x1]
913 ; CHECK-NEXT: addp v0.4h, v0.4h, v1.4h
915 %tmp1 = load <4 x i16>, ptr %A
916 %tmp2 = load <4 x i16>, ptr %B
917 %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
; addp_8h: calls llvm.aarch64.neon.addp.v8i16 on two loaded <8 x i16> vectors.
; Expected output is two q loads and "addp v0.8h, v0.8h, v1.8h".
921 define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind {
922 ; CHECK-LABEL: addp_8h:
924 ; CHECK-NEXT: ldr q0, [x0]
925 ; CHECK-NEXT: ldr q1, [x1]
926 ; CHECK-NEXT: addp v0.8h, v0.8h, v1.8h
928 %tmp1 = load <8 x i16>, ptr %A
929 %tmp2 = load <8 x i16>, ptr %B
930 %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
; addp_2s: calls llvm.aarch64.neon.addp.v2i32 on two loaded <2 x i32> vectors.
; Expected output is two d loads and "addp v0.2s, v0.2s, v1.2s".
934 define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind {
935 ; CHECK-LABEL: addp_2s:
937 ; CHECK-NEXT: ldr d0, [x0]
938 ; CHECK-NEXT: ldr d1, [x1]
939 ; CHECK-NEXT: addp v0.2s, v0.2s, v1.2s
941 %tmp1 = load <2 x i32>, ptr %A
942 %tmp2 = load <2 x i32>, ptr %B
943 %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
; addp_4s: calls llvm.aarch64.neon.addp.v4i32 on two loaded <4 x i32> vectors.
; Expected output is two q loads and "addp v0.4s, v0.4s, v1.4s".
947 define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind {
948 ; CHECK-LABEL: addp_4s:
950 ; CHECK-NEXT: ldr q0, [x0]
951 ; CHECK-NEXT: ldr q1, [x1]
952 ; CHECK-NEXT: addp v0.4s, v0.4s, v1.4s
954 %tmp1 = load <4 x i32>, ptr %A
955 %tmp2 = load <4 x i32>, ptr %B
956 %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; addp_2d: calls llvm.aarch64.neon.addp.v2i64 on two loaded <2 x i64> vectors.
; Expected output is two q loads and "addp v0.2d, v0.2d, v1.2d".
960 define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind {
961 ; CHECK-LABEL: addp_2d:
963 ; CHECK-NEXT: ldr q0, [x0]
964 ; CHECK-NEXT: ldr q1, [x1]
965 ; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d
967 %tmp1 = load <2 x i64>, ptr %A
968 %tmp2 = load <2 x i64>, ptr %B
969 %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
973 declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
974 declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
975 declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
976 declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
977 declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
978 declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
979 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
; faddp_2s: calls llvm.aarch64.neon.faddp.v2f32 on two loaded <2 x float>
; vectors and returns the result. Expected output is two d loads and
; "faddp v0.2s, v0.2s, v1.2s".
981 define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind {
982 ; CHECK-LABEL: faddp_2s:
984 ; CHECK-NEXT: ldr d0, [x0]
985 ; CHECK-NEXT: ldr d1, [x1]
986 ; CHECK-NEXT: faddp v0.2s, v0.2s, v1.2s
988 %tmp1 = load <2 x float>, ptr %A
989 %tmp2 = load <2 x float>, ptr %B
990 %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
991 ret <2 x float> %tmp3
; faddp_4s: calls llvm.aarch64.neon.faddp.v4f32 on two loaded <4 x float>
; vectors and returns the result. Expected output is two q loads and
; "faddp v0.4s, v0.4s, v1.4s".
994 define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind {
995 ; CHECK-LABEL: faddp_4s:
997 ; CHECK-NEXT: ldr q0, [x0]
998 ; CHECK-NEXT: ldr q1, [x1]
999 ; CHECK-NEXT: faddp v0.4s, v0.4s, v1.4s
1001 %tmp1 = load <4 x float>, ptr %A
1002 %tmp2 = load <4 x float>, ptr %B
1003 %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
1004 ret <4 x float> %tmp3
; faddp_2d: calls llvm.aarch64.neon.faddp.v2f64 on two loaded <2 x double>
; vectors and returns the result. Expected output is two q loads and
; "faddp v0.2d, v0.2d, v1.2d".
1007 define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind {
1008 ; CHECK-LABEL: faddp_2d:
1010 ; CHECK-NEXT: ldr q0, [x0]
1011 ; CHECK-NEXT: ldr q1, [x1]
1012 ; CHECK-NEXT: faddp v0.2d, v0.2d, v1.2d
1014 %tmp1 = load <2 x double>, ptr %A
1015 %tmp2 = load <2 x double>, ptr %B
1016 %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
1017 ret <2 x double> %tmp3
1020 declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone
1021 declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone
1022 declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
; uaddl_duprhs: splats the scalar %rhs into a <2 x i32>, takes lanes 0-1 of
; %lhs, zero-extends both to <2 x i64> and adds. Expected output is a dup of
; w0 followed by a single uaddl.
; Note: the value is named %lhs.high, but its shuffle mask <0, 1> selects the
; low two lanes of %lhs.
1024 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
1025 ; CHECK-LABEL: uaddl_duprhs:
1027 ; CHECK-NEXT: dup v1.2s, w0
1028 ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
1030 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1031 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1033 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1035 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
1036 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
1038 %res = add <2 x i64> %lhs.ext, %rhs.ext
; uaddl2_duprhs: zero-extends lanes 2-3 of %lhs and a two-lane splat of the
; scalar %rhs to <2 x i64>, then adds them.
; The SelectionDAG expectations are a 4-lane dup plus uaddl2; the GlobalISel
; expectations are a 2-lane dup, ushll #0 and uaddw2.
1042 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
1043 ; CHECK-SD-LABEL: uaddl2_duprhs:
1044 ; CHECK-SD: // %bb.0:
1045 ; CHECK-SD-NEXT: dup v1.4s, w0
1046 ; CHECK-SD-NEXT: uaddl2 v0.2d, v0.4s, v1.4s
1047 ; CHECK-SD-NEXT: ret
1049 ; CHECK-GI-LABEL: uaddl2_duprhs:
1050 ; CHECK-GI: // %bb.0:
1051 ; CHECK-GI-NEXT: dup v1.2s, w0
1052 ; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
1053 ; CHECK-GI-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
1054 ; CHECK-GI-NEXT: ret
1055 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1056 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1058 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1060 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
1061 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
1063 %res = add <2 x i64> %lhs.ext, %rhs.ext
; saddl_duplhs: sign-extends a two-lane splat of the scalar %lhs and lanes 0-1
; of %rhs to <2 x i64>, then adds them. The expected output is a dup of w0
; followed by saddl with the splat as the first source operand.
; Despite its name, %rhs.high selects lanes 0 and 1 (the low half) of %rhs.
1067 define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
1068 ; CHECK-LABEL: saddl_duplhs:
1070 ; CHECK-NEXT: dup v1.2s, w0
1071 ; CHECK-NEXT: saddl v0.2d, v1.2s, v0.2s
1073 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
1074 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
1076 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1078 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
1079 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
1081 %res = add <2 x i64> %lhs.ext, %rhs.ext
; saddl2_duplhs: sign-extends a two-lane splat of the scalar %lhs and lanes 2-3
; of %rhs to <2 x i64>, then adds them.
; The SelectionDAG expectations are a 4-lane dup plus saddl2; the GlobalISel
; expectations are a 2-lane dup, sshll #0 and saddw2.
1085 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
1086 ; CHECK-SD-LABEL: saddl2_duplhs:
1087 ; CHECK-SD: // %bb.0:
1088 ; CHECK-SD-NEXT: dup v1.4s, w0
1089 ; CHECK-SD-NEXT: saddl2 v0.2d, v1.4s, v0.4s
1090 ; CHECK-SD-NEXT: ret
1092 ; CHECK-GI-LABEL: saddl2_duplhs:
1093 ; CHECK-GI: // %bb.0:
1094 ; CHECK-GI-NEXT: dup v1.2s, w0
1095 ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
1096 ; CHECK-GI-NEXT: saddw2 v0.2d, v1.2d, v0.4s
1097 ; CHECK-GI-NEXT: ret
1098 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
1099 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
1101 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1103 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
1104 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
1106 %res = add <2 x i64> %lhs.ext, %rhs.ext
; usubl_duprhs: zero-extends lanes 0-1 of %lhs and a two-lane splat of the
; scalar %rhs to <2 x i64>, then subtracts the splat from the vector. The
; expected output is a dup of w0 followed by usubl.
; Despite its name, %lhs.high selects lanes 0 and 1 (the low half) of %lhs.
1110 define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
1111 ; CHECK-LABEL: usubl_duprhs:
1113 ; CHECK-NEXT: dup v1.2s, w0
1114 ; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s
1116 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1117 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1119 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1121 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
1122 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
1124 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; usubl2_duprhs: zero-extends lanes 2-3 of %lhs and a two-lane splat of the
; scalar %rhs to <2 x i64>, then subtracts the splat from the vector.
; The SelectionDAG expectations are a 4-lane dup plus usubl2; the GlobalISel
; expectations are a 2-lane dup, a move of the upper 64 bits of v0 into d0,
; and usubl.
1128 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
1129 ; CHECK-SD-LABEL: usubl2_duprhs:
1130 ; CHECK-SD: // %bb.0:
1131 ; CHECK-SD-NEXT: dup v1.4s, w0
1132 ; CHECK-SD-NEXT: usubl2 v0.2d, v0.4s, v1.4s
1133 ; CHECK-SD-NEXT: ret
1135 ; CHECK-GI-LABEL: usubl2_duprhs:
1136 ; CHECK-GI: // %bb.0:
1137 ; CHECK-GI-NEXT: dup v1.2s, w0
1138 ; CHECK-GI-NEXT: mov d0, v0.d[1]
1139 ; CHECK-GI-NEXT: usubl v0.2d, v0.2s, v1.2s
1140 ; CHECK-GI-NEXT: ret
1141 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
1142 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
1144 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1146 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
1147 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
1149 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; ssubl_duplhs: sign-extends a two-lane splat of the scalar %lhs and lanes 0-1
; of %rhs to <2 x i64>, then subtracts the vector from the splat. The expected
; output is a dup of w0 followed by ssubl with the splat as the first source.
; Despite its name, %rhs.high selects lanes 0 and 1 (the low half) of %rhs.
1153 define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
1154 ; CHECK-LABEL: ssubl_duplhs:
1156 ; CHECK-NEXT: dup v1.2s, w0
1157 ; CHECK-NEXT: ssubl v0.2d, v1.2s, v0.2s
1159 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
1160 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
1162 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1164 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
1165 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
1167 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; ssubl2_duplhs: sign-extends a two-lane splat of the scalar %lhs and lanes 2-3
; of %rhs to <2 x i64>, then subtracts the vector from the splat.
; The SelectionDAG expectations are a 4-lane dup plus ssubl2; the GlobalISel
; expectations are a 2-lane dup, sshll #0 and ssubw2.
1171 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
1172 ; CHECK-SD-LABEL: ssubl2_duplhs:
1173 ; CHECK-SD: // %bb.0:
1174 ; CHECK-SD-NEXT: dup v1.4s, w0
1175 ; CHECK-SD-NEXT: ssubl2 v0.2d, v1.4s, v0.4s
1176 ; CHECK-SD-NEXT: ret
1178 ; CHECK-GI-LABEL: ssubl2_duplhs:
1179 ; CHECK-GI: // %bb.0:
1180 ; CHECK-GI-NEXT: dup v1.2s, w0
1181 ; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
1182 ; CHECK-GI-NEXT: ssubw2 v0.2d, v1.2d, v0.4s
1183 ; CHECK-GI-NEXT: ret
1184 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
1185 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
1187 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1189 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
1190 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
1192 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; addhn8b_natural: open-coded add-high-narrow without the intrinsic. Adds two
; loaded <8 x i16> vectors, shifts each lane right by 8 and truncates to i8.
; The SelectionDAG expectations are a single addhn; the GlobalISel
; expectations are a separate add followed by shrn #8.
1196 define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind {
1197 ; CHECK-SD-LABEL: addhn8b_natural:
1198 ; CHECK-SD: // %bb.0:
1199 ; CHECK-SD-NEXT: ldr q0, [x0]
1200 ; CHECK-SD-NEXT: ldr q1, [x1]
1201 ; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h
1202 ; CHECK-SD-NEXT: ret
1204 ; CHECK-GI-LABEL: addhn8b_natural:
1205 ; CHECK-GI: // %bb.0:
1206 ; CHECK-GI-NEXT: ldr q0, [x0]
1207 ; CHECK-GI-NEXT: ldr q1, [x1]
1208 ; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
1209 ; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8
1210 ; CHECK-GI-NEXT: ret
1211 %tmp1 = load <8 x i16>, ptr %A
1212 %tmp2 = load <8 x i16>, ptr %B
1213 %sum = add <8 x i16> %tmp1, %tmp2
1214 %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1215 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
1216 ret <8 x i8> %narrowed
; addhn4h_natural: open-coded add-high-narrow without the intrinsic. Adds two
; loaded <4 x i32> vectors, shifts each lane right by 16 and truncates to i16.
; The SelectionDAG expectations are a single addhn; the GlobalISel
; expectations are a separate add followed by shrn #16.
1219 define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind {
1220 ; CHECK-SD-LABEL: addhn4h_natural:
1221 ; CHECK-SD: // %bb.0:
1222 ; CHECK-SD-NEXT: ldr q0, [x0]
1223 ; CHECK-SD-NEXT: ldr q1, [x1]
1224 ; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s
1225 ; CHECK-SD-NEXT: ret
1227 ; CHECK-GI-LABEL: addhn4h_natural:
1228 ; CHECK-GI: // %bb.0:
1229 ; CHECK-GI-NEXT: ldr q0, [x0]
1230 ; CHECK-GI-NEXT: ldr q1, [x1]
1231 ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
1232 ; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16
1233 ; CHECK-GI-NEXT: ret
1234 %tmp1 = load <4 x i32>, ptr %A
1235 %tmp2 = load <4 x i32>, ptr %B
1236 %sum = add <4 x i32> %tmp1, %tmp2
1237 %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
1238 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
1239 ret <4 x i16> %narrowed
; addhn2s_natural: open-coded add-high-narrow without the intrinsic. Adds two
; loaded <2 x i64> vectors, shifts each lane right by 32 and truncates to i32.
; The SelectionDAG expectations are a single addhn; the GlobalISel
; expectations are a separate add followed by shrn #32.
1242 define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind {
1243 ; CHECK-SD-LABEL: addhn2s_natural:
1244 ; CHECK-SD: // %bb.0:
1245 ; CHECK-SD-NEXT: ldr q0, [x0]
1246 ; CHECK-SD-NEXT: ldr q1, [x1]
1247 ; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d
1248 ; CHECK-SD-NEXT: ret
1250 ; CHECK-GI-LABEL: addhn2s_natural:
1251 ; CHECK-GI: // %bb.0:
1252 ; CHECK-GI-NEXT: ldr q0, [x0]
1253 ; CHECK-GI-NEXT: ldr q1, [x1]
1254 ; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
1255 ; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32
1256 ; CHECK-GI-NEXT: ret
1257 %tmp1 = load <2 x i64>, ptr %A
1258 %tmp2 = load <2 x i64>, ptr %B
1259 %sum = add <2 x i64> %tmp1, %tmp2
1260 %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
1261 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
1262 ret <2 x i32> %narrowed
; addhn2_16b_natural: open-coded add-high-narrow of two loaded <8 x i16>
; vectors (add, lshr by 8, trunc to i8), concatenated after %low by a
; shufflevector with indices 0..15, so the narrowed lanes form the upper half.
; The SelectionDAG expectations are a single addhn2 into v0; the GlobalISel
; expectations are a separate add followed by shrn2 #8.
1265 define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
1266 ; CHECK-SD-LABEL: addhn2_16b_natural:
1267 ; CHECK-SD: // %bb.0:
1268 ; CHECK-SD-NEXT: ldr q1, [x0]
1269 ; CHECK-SD-NEXT: ldr q2, [x1]
1270 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1271 ; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h
1272 ; CHECK-SD-NEXT: ret
1274 ; CHECK-GI-LABEL: addhn2_16b_natural:
1275 ; CHECK-GI: // %bb.0:
1276 ; CHECK-GI-NEXT: ldr q1, [x0]
1277 ; CHECK-GI-NEXT: ldr q2, [x1]
1278 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1279 ; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h
1280 ; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8
1281 ; CHECK-GI-NEXT: ret
1282 %tmp1 = load <8 x i16>, ptr %A
1283 %tmp2 = load <8 x i16>, ptr %B
1284 %sum = add <8 x i16> %tmp1, %tmp2
1285 %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1286 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
1287 %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; addhn2_8h_natural: open-coded add-high-narrow of two loaded <4 x i32>
; vectors (add, lshr by 16, trunc to i16), concatenated after %low by a
; shufflevector with indices 0..7, so the narrowed lanes form the upper half.
; The SelectionDAG expectations are a single addhn2 into v0; the GlobalISel
; expectations are a separate add followed by shrn2 #16.
1291 define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
1292 ; CHECK-SD-LABEL: addhn2_8h_natural:
1293 ; CHECK-SD: // %bb.0:
1294 ; CHECK-SD-NEXT: ldr q1, [x0]
1295 ; CHECK-SD-NEXT: ldr q2, [x1]
1296 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1297 ; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s
1298 ; CHECK-SD-NEXT: ret
1300 ; CHECK-GI-LABEL: addhn2_8h_natural:
1301 ; CHECK-GI: // %bb.0:
1302 ; CHECK-GI-NEXT: ldr q1, [x0]
1303 ; CHECK-GI-NEXT: ldr q2, [x1]
1304 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1305 ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
1306 ; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16
1307 ; CHECK-GI-NEXT: ret
1308 %tmp1 = load <4 x i32>, ptr %A
1309 %tmp2 = load <4 x i32>, ptr %B
1310 %sum = add <4 x i32> %tmp1, %tmp2
1311 %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
1312 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
1313 %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; addhn2_4s_natural: open-coded add-high-narrow of two loaded <2 x i64>
; vectors (add, lshr by 32, trunc to i32), concatenated after %low by a
; shufflevector with indices 0..3, so the narrowed lanes form the upper half.
; The SelectionDAG expectations are a single addhn2 into v0; the GlobalISel
; expectations are a separate add followed by shrn2 #32.
1317 define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
1318 ; CHECK-SD-LABEL: addhn2_4s_natural:
1319 ; CHECK-SD: // %bb.0:
1320 ; CHECK-SD-NEXT: ldr q1, [x0]
1321 ; CHECK-SD-NEXT: ldr q2, [x1]
1322 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1323 ; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d
1324 ; CHECK-SD-NEXT: ret
1326 ; CHECK-GI-LABEL: addhn2_4s_natural:
1327 ; CHECK-GI: // %bb.0:
1328 ; CHECK-GI-NEXT: ldr q1, [x0]
1329 ; CHECK-GI-NEXT: ldr q2, [x1]
1330 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1331 ; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d
1332 ; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32
1333 ; CHECK-GI-NEXT: ret
1334 %tmp1 = load <2 x i64>, ptr %A
1335 %tmp2 = load <2 x i64>, ptr %B
1336 %sum = add <2 x i64> %tmp1, %tmp2
1337 %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
1338 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
1339 %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; addhn_addhn2_4s: concatenates two open-coded add-high-narrow results
; (%narrowed1 in lanes 0-1, %narrowed2 in lanes 2-3) into a <4 x i32>.
; The SelectionDAG expectations are addhn followed by addhn2 on the same
; operands; the GlobalISel expectations are one add followed by shrn and shrn2.
; NOTE(review): %high_bits is computed from %sum1, not %sum2, so %sum2 and the
; loads from %C and %D are unused, and the expected output loads only from x0
; and x1. If %sum2 was the intended operand, change it and regenerate the
; assertions with utils/update_llc_test_checks.py.
; The name %low_bits is also a misnomer: it is the same lshr-by-32 of %sum1 as
; %high_bits.
1343 define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind {
1344 ; CHECK-SD-LABEL: addhn_addhn2_4s:
1345 ; CHECK-SD: // %bb.0:
1346 ; CHECK-SD-NEXT: ldr q1, [x0]
1347 ; CHECK-SD-NEXT: ldr q2, [x1]
1348 ; CHECK-SD-NEXT: addhn v0.2s, v1.2d, v2.2d
1349 ; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d
1350 ; CHECK-SD-NEXT: ret
1352 ; CHECK-GI-LABEL: addhn_addhn2_4s:
1353 ; CHECK-GI: // %bb.0:
1354 ; CHECK-GI-NEXT: ldr q0, [x0]
1355 ; CHECK-GI-NEXT: ldr q1, [x1]
1356 ; CHECK-GI-NEXT: add v1.2d, v0.2d, v1.2d
1357 ; CHECK-GI-NEXT: shrn v0.2s, v1.2d, #32
1358 ; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32
1359 ; CHECK-GI-NEXT: ret
1360 %tmp1 = load <2 x i64>, ptr %A
1361 %tmp2 = load <2 x i64>, ptr %B
1362 %sum1 = add <2 x i64> %tmp1, %tmp2
1363 %low_bits = lshr <2 x i64> %sum1, <i64 32, i64 32>
1364 %narrowed1 = trunc <2 x i64> %low_bits to <2 x i32>
1365 %tmp3 = load <2 x i64>, ptr %C
1366 %tmp4 = load <2 x i64>, ptr %D
1367 %sum2 = add <2 x i64> %tmp3, %tmp4
1368 %high_bits = lshr <2 x i64> %sum1, <i64 32, i64 32>
1369 %narrowed2 = trunc <2 x i64> %high_bits to <2 x i32>
1370 %res = shufflevector <2 x i32> %narrowed1, <2 x i32> %narrowed2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; subhn8b_natural: open-coded subtract-high-narrow without the intrinsic.
; Subtracts two loaded <8 x i16> vectors, shifts each lane right by 8 and
; truncates to i8.
; The SelectionDAG expectations are a single subhn; the GlobalISel
; expectations are a separate sub followed by shrn #8.
1374 define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind {
1375 ; CHECK-SD-LABEL: subhn8b_natural:
1376 ; CHECK-SD: // %bb.0:
1377 ; CHECK-SD-NEXT: ldr q0, [x0]
1378 ; CHECK-SD-NEXT: ldr q1, [x1]
1379 ; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h
1380 ; CHECK-SD-NEXT: ret
1382 ; CHECK-GI-LABEL: subhn8b_natural:
1383 ; CHECK-GI: // %bb.0:
1384 ; CHECK-GI-NEXT: ldr q0, [x0]
1385 ; CHECK-GI-NEXT: ldr q1, [x1]
1386 ; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h
1387 ; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8
1388 ; CHECK-GI-NEXT: ret
1389 %tmp1 = load <8 x i16>, ptr %A
1390 %tmp2 = load <8 x i16>, ptr %B
1391 %diff = sub <8 x i16> %tmp1, %tmp2
1392 %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1393 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
1394 ret <8 x i8> %narrowed
; subhn4h_natural: open-coded subtract-high-narrow without the intrinsic.
; Subtracts two loaded <4 x i32> vectors, shifts each lane right by 16 and
; truncates to i16.
; The SelectionDAG expectations are a single subhn; the GlobalISel
; expectations are a separate sub followed by shrn #16.
1397 define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind {
1398 ; CHECK-SD-LABEL: subhn4h_natural:
1399 ; CHECK-SD: // %bb.0:
1400 ; CHECK-SD-NEXT: ldr q0, [x0]
1401 ; CHECK-SD-NEXT: ldr q1, [x1]
1402 ; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s
1403 ; CHECK-SD-NEXT: ret
1405 ; CHECK-GI-LABEL: subhn4h_natural:
1406 ; CHECK-GI: // %bb.0:
1407 ; CHECK-GI-NEXT: ldr q0, [x0]
1408 ; CHECK-GI-NEXT: ldr q1, [x1]
1409 ; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s
1410 ; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16
1411 ; CHECK-GI-NEXT: ret
1412 %tmp1 = load <4 x i32>, ptr %A
1413 %tmp2 = load <4 x i32>, ptr %B
1414 %diff = sub <4 x i32> %tmp1, %tmp2
1415 %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
1416 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
1417 ret <4 x i16> %narrowed
; subhn2s_natural: open-coded subtract-high-narrow without the intrinsic.
; Subtracts two loaded <2 x i64> vectors, shifts each lane right by 32 and
; truncates to i32.
; The SelectionDAG expectations are a single subhn; the GlobalISel
; expectations are a separate sub followed by shrn #32.
1420 define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind {
1421 ; CHECK-SD-LABEL: subhn2s_natural:
1422 ; CHECK-SD: // %bb.0:
1423 ; CHECK-SD-NEXT: ldr q0, [x0]
1424 ; CHECK-SD-NEXT: ldr q1, [x1]
1425 ; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d
1426 ; CHECK-SD-NEXT: ret
1428 ; CHECK-GI-LABEL: subhn2s_natural:
1429 ; CHECK-GI: // %bb.0:
1430 ; CHECK-GI-NEXT: ldr q0, [x0]
1431 ; CHECK-GI-NEXT: ldr q1, [x1]
1432 ; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d
1433 ; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32
1434 ; CHECK-GI-NEXT: ret
1435 %tmp1 = load <2 x i64>, ptr %A
1436 %tmp2 = load <2 x i64>, ptr %B
1437 %diff = sub <2 x i64> %tmp1, %tmp2
1438 %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
1439 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
1440 ret <2 x i32> %narrowed
; subhn2_16b_natural: open-coded subtract-high-narrow of two loaded <8 x i16>
; vectors (sub, lshr by 8, trunc to i8), concatenated after %low by a
; shufflevector with indices 0..15, so the narrowed lanes form the upper half.
; The SelectionDAG expectations are a single subhn2 into v0; the GlobalISel
; expectations are a separate sub followed by shrn2 #8.
1443 define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
1444 ; CHECK-SD-LABEL: subhn2_16b_natural:
1445 ; CHECK-SD: // %bb.0:
1446 ; CHECK-SD-NEXT: ldr q1, [x0]
1447 ; CHECK-SD-NEXT: ldr q2, [x1]
1448 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1449 ; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h
1450 ; CHECK-SD-NEXT: ret
1452 ; CHECK-GI-LABEL: subhn2_16b_natural:
1453 ; CHECK-GI: // %bb.0:
1454 ; CHECK-GI-NEXT: ldr q1, [x0]
1455 ; CHECK-GI-NEXT: ldr q2, [x1]
1456 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1457 ; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h
1458 ; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8
1459 ; CHECK-GI-NEXT: ret
1460 %tmp1 = load <8 x i16>, ptr %A
1461 %tmp2 = load <8 x i16>, ptr %B
1462 %diff = sub <8 x i16> %tmp1, %tmp2
1463 %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1464 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
1465 %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; subhn2_8h_natural: open-coded subtract-high-narrow of two loaded <4 x i32>
; vectors (sub, lshr by 16, trunc to i16), concatenated after %low by a
; shufflevector with indices 0..7, so the narrowed lanes form the upper half.
; The SelectionDAG expectations are a single subhn2 into v0; the GlobalISel
; expectations are a separate sub followed by shrn2 #16.
1469 define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
1470 ; CHECK-SD-LABEL: subhn2_8h_natural:
1471 ; CHECK-SD: // %bb.0:
1472 ; CHECK-SD-NEXT: ldr q1, [x0]
1473 ; CHECK-SD-NEXT: ldr q2, [x1]
1474 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1475 ; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s
1476 ; CHECK-SD-NEXT: ret
1478 ; CHECK-GI-LABEL: subhn2_8h_natural:
1479 ; CHECK-GI: // %bb.0:
1480 ; CHECK-GI-NEXT: ldr q1, [x0]
1481 ; CHECK-GI-NEXT: ldr q2, [x1]
1482 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1483 ; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s
1484 ; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16
1485 ; CHECK-GI-NEXT: ret
1486 %tmp1 = load <4 x i32>, ptr %A
1487 %tmp2 = load <4 x i32>, ptr %B
1488 %diff = sub <4 x i32> %tmp1, %tmp2
1489 %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
1490 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
1491 %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; subhn2_4s_natural: open-coded subtract-high-narrow of two loaded <2 x i64>
; vectors (sub, lshr by 32, trunc to i32), concatenated after %low by a
; shufflevector with indices 0..3, so the narrowed lanes form the upper half.
; The SelectionDAG expectations are a single subhn2 into v0; the GlobalISel
; expectations are a separate sub followed by shrn2 #32.
1495 define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
1496 ; CHECK-SD-LABEL: subhn2_4s_natural:
1497 ; CHECK-SD: // %bb.0:
1498 ; CHECK-SD-NEXT: ldr q1, [x0]
1499 ; CHECK-SD-NEXT: ldr q2, [x1]
1500 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1501 ; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d
1502 ; CHECK-SD-NEXT: ret
1504 ; CHECK-GI-LABEL: subhn2_4s_natural:
1505 ; CHECK-GI: // %bb.0:
1506 ; CHECK-GI-NEXT: ldr q1, [x0]
1507 ; CHECK-GI-NEXT: ldr q2, [x1]
1508 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1509 ; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d
1510 ; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32
1511 ; CHECK-GI-NEXT: ret
1512 %tmp1 = load <2 x i64>, ptr %A
1513 %tmp2 = load <2 x i64>, ptr %B
1514 %diff = sub <2 x i64> %tmp1, %tmp2
1515 %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
1516 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
1517 %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>