1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; subhn8b: loads two <8 x i16> vectors through %A and %B and passes them to
; the llvm.aarch64.neon.subhn.v8i8 intrinsic, which yields an <8 x i8> value.
3 define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
6 %tmp1 = load <8 x i16>, <8 x i16>* %A
7 %tmp2 = load <8 x i16>, <8 x i16>* %B
8 %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
; subhn4h: loads two <4 x i32> vectors through %A and %B and passes them to
; the llvm.aarch64.neon.subhn.v4i16 intrinsic, which yields a <4 x i16> value.
12 define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
13 ;CHECK-LABEL: subhn4h:
15 %tmp1 = load <4 x i32>, <4 x i32>* %A
16 %tmp2 = load <4 x i32>, <4 x i32>* %B
17 %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
; subhn2s: loads two <2 x i64> vectors through %A and %B and passes them to
; the llvm.aarch64.neon.subhn.v2i32 intrinsic, which yields a <2 x i32> value.
21 define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
22 ;CHECK-LABEL: subhn2s:
24 %tmp1 = load <2 x i64>, <2 x i64>* %A
25 %tmp2 = load <2 x i64>, <2 x i64>* %B
26 %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
; subhn2_16b: calls llvm.aarch64.neon.subhn.v8i8 twice on the same (%a, %b)
; operands and joins the two <8 x i8> results into one <16 x i8> value.
; The check directive below expects a subhn2.16b instruction in the output.
30 define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
31 ;CHECK-LABEL: subhn2_16b:
33 ;CHECK-NEXT: subhn2.16b
34 %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
35 %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
; Mask 0..15 concatenates the first result (lanes 0-7) with the second (lanes 8-15).
36 %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; subhn2_8h: calls llvm.aarch64.neon.subhn.v4i16 twice on the same (%a, %b)
; operands and joins the two <4 x i16> results into one <8 x i16> value.
; The check directive below expects a subhn2.8h instruction in the output.
40 define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
41 ;CHECK-LABEL: subhn2_8h:
43 ;CHECK-NEXT: subhn2.8h
44 %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
45 %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
; Mask 0..7 concatenates the first result (lanes 0-3) with the second (lanes 4-7).
46 %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; subhn2_4s: calls llvm.aarch64.neon.subhn.v2i32 twice on the same (%a, %b)
; operands and joins the two <2 x i32> results into one <4 x i32> value.
; The check directive below expects a subhn2.4s instruction in the output.
50 define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
51 ;CHECK-LABEL: subhn2_4s:
53 ;CHECK-NEXT: subhn2.4s
54 %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
55 %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
; Mask 0..3 concatenates the first result (lanes 0-1) with the second (lanes 2-3).
56 %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Declarations of the subhn intrinsic overloads called by the subhn* tests.
; Each takes two vectors of a wide element type and returns a vector with the
; same lane count and half the element width.
60 declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
61 declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
62 declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
; rsubhn8b: loads two <8 x i16> vectors through %A and %B and passes them to
; the llvm.aarch64.neon.rsubhn.v8i8 intrinsic, which yields an <8 x i8> value.
64 define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
65 ;CHECK-LABEL: rsubhn8b:
67 %tmp1 = load <8 x i16>, <8 x i16>* %A
68 %tmp2 = load <8 x i16>, <8 x i16>* %B
69 %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
; rsubhn4h: loads two <4 x i32> vectors through %A and %B and passes them to
; the llvm.aarch64.neon.rsubhn.v4i16 intrinsic, which yields a <4 x i16> value.
73 define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
74 ;CHECK-LABEL: rsubhn4h:
76 %tmp1 = load <4 x i32>, <4 x i32>* %A
77 %tmp2 = load <4 x i32>, <4 x i32>* %B
78 %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
; rsubhn2s: loads two <2 x i64> vectors through %A and %B and passes them to
; the llvm.aarch64.neon.rsubhn.v2i32 intrinsic, which yields a <2 x i32> value.
82 define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
83 ;CHECK-LABEL: rsubhn2s:
85 %tmp1 = load <2 x i64>, <2 x i64>* %A
86 %tmp2 = load <2 x i64>, <2 x i64>* %B
87 %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
; rsubhn2_16b: calls llvm.aarch64.neon.rsubhn.v8i8 twice on the same (%a, %b)
; operands and joins the two <8 x i8> results into one <16 x i8> value.
; The check directive below expects an rsubhn2.16b instruction in the output.
91 define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
92 ;CHECK-LABEL: rsubhn2_16b:
94 ;CHECK-NEXT: rsubhn2.16b
95 %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
96 %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
; Mask 0..15 concatenates the first result (lanes 0-7) with the second (lanes 8-15).
97 %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; rsubhn2_8h: calls llvm.aarch64.neon.rsubhn.v4i16 twice on the same (%a, %b)
; operands and joins the two <4 x i16> results into one <8 x i16> value.
; The check directive below expects an rsubhn2.8h instruction in the output.
101 define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
102 ;CHECK-LABEL: rsubhn2_8h:
104 ;CHECK-NEXT: rsubhn2.8h
105 %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
106 %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
; Mask 0..7 concatenates the first result (lanes 0-3) with the second (lanes 4-7).
107 %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; rsubhn2_4s: calls llvm.aarch64.neon.rsubhn.v2i32 twice on the same (%a, %b)
; operands and joins the two <2 x i32> results into one <4 x i32> value.
; The check directive below expects an rsubhn2.4s instruction in the output.
111 define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
112 ;CHECK-LABEL: rsubhn2_4s:
114 ;CHECK-NEXT: rsubhn2.4s
115 %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
116 %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
; Mask 0..3 concatenates the first result (lanes 0-1) with the second (lanes 2-3).
117 %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Declarations of the rsubhn intrinsic overloads called by the rsubhn* tests.
; Each takes two vectors of a wide element type and returns a vector with the
; same lane count and half the element width.
121 declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
122 declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
123 declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
; ssubl8h: loads two <8 x i8> vectors, sign-extends both to <8 x i16>, and
; subtracts the second from the first.
125 define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
126 ;CHECK-LABEL: ssubl8h:
128 %tmp1 = load <8 x i8>, <8 x i8>* %A
129 %tmp2 = load <8 x i8>, <8 x i8>* %B
130 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
131 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
132 %tmp5 = sub <8 x i16> %tmp3, %tmp4
; ssubl4s: loads two <4 x i16> vectors, sign-extends both to <4 x i32>, and
; subtracts the second from the first.
136 define <4 x i32> @ssubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
137 ;CHECK-LABEL: ssubl4s:
139 %tmp1 = load <4 x i16>, <4 x i16>* %A
140 %tmp2 = load <4 x i16>, <4 x i16>* %B
141 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
142 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
143 %tmp5 = sub <4 x i32> %tmp3, %tmp4
; ssubl2d: loads two <2 x i32> vectors, sign-extends both to <2 x i64>, and
; subtracts the second from the first.
147 define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
148 ;CHECK-LABEL: ssubl2d:
150 %tmp1 = load <2 x i32>, <2 x i32>* %A
151 %tmp2 = load <2 x i32>, <2 x i32>* %B
152 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
153 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
154 %tmp5 = sub <2 x i64> %tmp3, %tmp4
; ssubl2_8h: for each of two <16 x i8> loads, extracts lanes 8-15 and
; sign-extends them to <8 x i16>, then subtracts the second from the first.
158 define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
159 ;CHECK-LABEL: ssubl2_8h:
161 %tmp1 = load <16 x i8>, <16 x i8>* %A
; Mask 8..15 keeps only the upper eight lanes of %tmp1.
162 %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
163 %ext1 = sext <8 x i8> %high1 to <8 x i16>
165 %tmp2 = load <16 x i8>, <16 x i8>* %B
; Same upper-half extraction for the second operand.
166 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
167 %ext2 = sext <8 x i8> %high2 to <8 x i16>
169 %res = sub <8 x i16> %ext1, %ext2
; ssubl2_4s: for each of two <8 x i16> loads, extracts lanes 4-7 and
; sign-extends them to <4 x i32>, then subtracts the second from the first.
173 define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
174 ;CHECK-LABEL: ssubl2_4s:
176 %tmp1 = load <8 x i16>, <8 x i16>* %A
; Mask 4..7 keeps only the upper four lanes of %tmp1.
177 %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
178 %ext1 = sext <4 x i16> %high1 to <4 x i32>
180 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Same upper-half extraction for the second operand.
181 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
182 %ext2 = sext <4 x i16> %high2 to <4 x i32>
184 %res = sub <4 x i32> %ext1, %ext2
; ssubl2_2d: for each of two <4 x i32> loads, extracts lanes 2-3 and
; sign-extends them to <2 x i64>, then subtracts the second from the first.
188 define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
189 ;CHECK-LABEL: ssubl2_2d:
191 %tmp1 = load <4 x i32>, <4 x i32>* %A
; Mask 2..3 keeps only the upper two lanes of %tmp1.
192 %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
193 %ext1 = sext <2 x i32> %high1 to <2 x i64>
195 %tmp2 = load <4 x i32>, <4 x i32>* %B
; Same upper-half extraction for the second operand.
196 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
197 %ext2 = sext <2 x i32> %high2 to <2 x i64>
199 %res = sub <2 x i64> %ext1, %ext2
; usubl8h: loads two <8 x i8> vectors, zero-extends both to <8 x i16>, and
; subtracts the second from the first.
203 define <8 x i16> @usubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
204 ;CHECK-LABEL: usubl8h:
206 %tmp1 = load <8 x i8>, <8 x i8>* %A
207 %tmp2 = load <8 x i8>, <8 x i8>* %B
208 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
209 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
210 %tmp5 = sub <8 x i16> %tmp3, %tmp4
; usubl4s: loads two <4 x i16> vectors, zero-extends both to <4 x i32>, and
; subtracts the second from the first.
214 define <4 x i32> @usubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
215 ;CHECK-LABEL: usubl4s:
217 %tmp1 = load <4 x i16>, <4 x i16>* %A
218 %tmp2 = load <4 x i16>, <4 x i16>* %B
219 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
220 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
221 %tmp5 = sub <4 x i32> %tmp3, %tmp4
; usubl2d: loads two <2 x i32> vectors, zero-extends both to <2 x i64>, and
; subtracts the second from the first.
225 define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
226 ;CHECK-LABEL: usubl2d:
228 %tmp1 = load <2 x i32>, <2 x i32>* %A
229 %tmp2 = load <2 x i32>, <2 x i32>* %B
230 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
231 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
232 %tmp5 = sub <2 x i64> %tmp3, %tmp4
; usubl2_8h: for each of two <16 x i8> loads, extracts lanes 8-15 and
; zero-extends them to <8 x i16>, then subtracts the second from the first.
236 define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
237 ;CHECK-LABEL: usubl2_8h:
239 %tmp1 = load <16 x i8>, <16 x i8>* %A
; Mask 8..15 keeps only the upper eight lanes of %tmp1.
240 %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
241 %ext1 = zext <8 x i8> %high1 to <8 x i16>
243 %tmp2 = load <16 x i8>, <16 x i8>* %B
; Same upper-half extraction for the second operand.
244 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
245 %ext2 = zext <8 x i8> %high2 to <8 x i16>
247 %res = sub <8 x i16> %ext1, %ext2
; usubl2_4s: for each of two <8 x i16> loads, extracts lanes 4-7 and
; zero-extends them to <4 x i32>, then subtracts the second from the first.
251 define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
252 ;CHECK-LABEL: usubl2_4s:
254 %tmp1 = load <8 x i16>, <8 x i16>* %A
; Mask 4..7 keeps only the upper four lanes of %tmp1.
255 %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
256 %ext1 = zext <4 x i16> %high1 to <4 x i32>
258 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Same upper-half extraction for the second operand.
259 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
260 %ext2 = zext <4 x i16> %high2 to <4 x i32>
262 %res = sub <4 x i32> %ext1, %ext2
; usubl2_2d: for each of two <4 x i32> loads, extracts lanes 2-3 and
; zero-extends them to <2 x i64>, then subtracts the second from the first.
266 define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
267 ;CHECK-LABEL: usubl2_2d:
269 %tmp1 = load <4 x i32>, <4 x i32>* %A
; Mask 2..3 keeps only the upper two lanes of %tmp1.
270 %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
271 %ext1 = zext <2 x i32> %high1 to <2 x i64>
273 %tmp2 = load <4 x i32>, <4 x i32>* %B
; Same upper-half extraction for the second operand.
274 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
275 %ext2 = zext <2 x i32> %high2 to <2 x i64>
277 %res = sub <2 x i64> %ext1, %ext2
; ssubw8h: loads an <8 x i16> vector from %A and an <8 x i8> vector from %B,
; sign-extends the narrow one to <8 x i16>, and subtracts it from the wide one.
281 define <8 x i16> @ssubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
282 ;CHECK-LABEL: ssubw8h:
284 %tmp1 = load <8 x i16>, <8 x i16>* %A
285 %tmp2 = load <8 x i8>, <8 x i8>* %B
286 %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
287 %tmp4 = sub <8 x i16> %tmp1, %tmp3
; ssubw4s: loads a <4 x i32> vector from %A and a <4 x i16> vector from %B,
; sign-extends the narrow one to <4 x i32>, and subtracts it from the wide one.
291 define <4 x i32> @ssubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
292 ;CHECK-LABEL: ssubw4s:
294 %tmp1 = load <4 x i32>, <4 x i32>* %A
295 %tmp2 = load <4 x i16>, <4 x i16>* %B
296 %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
297 %tmp4 = sub <4 x i32> %tmp1, %tmp3
; ssubw2d: loads a <2 x i64> vector from %A and a <2 x i32> vector from %B,
; sign-extends the narrow one to <2 x i64>, and subtracts it from the wide one.
301 define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
302 ;CHECK-LABEL: ssubw2d:
304 %tmp1 = load <2 x i64>, <2 x i64>* %A
305 %tmp2 = load <2 x i32>, <2 x i32>* %B
306 %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
307 %tmp4 = sub <2 x i64> %tmp1, %tmp3
; ssubw2_8h: subtracts the sign-extended upper eight lanes of a <16 x i8> load
; (%B) from an <8 x i16> load (%A).
311 define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
312 ;CHECK-LABEL: ssubw2_8h:
314 %tmp1 = load <8 x i16>, <8 x i16>* %A
316 %tmp2 = load <16 x i8>, <16 x i8>* %B
; Mask 8..15 keeps only the upper eight lanes of %tmp2.
317 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
318 %ext2 = sext <8 x i8> %high2 to <8 x i16>
320 %res = sub <8 x i16> %tmp1, %ext2
; ssubw2_4s: subtracts the sign-extended upper four lanes of an <8 x i16> load
; (%B) from a <4 x i32> load (%A).
324 define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
325 ;CHECK-LABEL: ssubw2_4s:
327 %tmp1 = load <4 x i32>, <4 x i32>* %A
329 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Mask 4..7 keeps only the upper four lanes of %tmp2.
330 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
331 %ext2 = sext <4 x i16> %high2 to <4 x i32>
333 %res = sub <4 x i32> %tmp1, %ext2
; ssubw2_2d: subtracts the sign-extended upper two lanes of a <4 x i32> load
; (%B) from a <2 x i64> load (%A).
337 define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
338 ;CHECK-LABEL: ssubw2_2d:
340 %tmp1 = load <2 x i64>, <2 x i64>* %A
342 %tmp2 = load <4 x i32>, <4 x i32>* %B
; Mask 2..3 keeps only the upper two lanes of %tmp2.
343 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
344 %ext2 = sext <2 x i32> %high2 to <2 x i64>
346 %res = sub <2 x i64> %tmp1, %ext2
; usubw8h: loads an <8 x i16> vector from %A and an <8 x i8> vector from %B,
; zero-extends the narrow one to <8 x i16>, and subtracts it from the wide one.
350 define <8 x i16> @usubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
351 ;CHECK-LABEL: usubw8h:
353 %tmp1 = load <8 x i16>, <8 x i16>* %A
354 %tmp2 = load <8 x i8>, <8 x i8>* %B
355 %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
356 %tmp4 = sub <8 x i16> %tmp1, %tmp3
; usubw4s: loads a <4 x i32> vector from %A and a <4 x i16> vector from %B,
; zero-extends the narrow one to <4 x i32>, and subtracts it from the wide one.
360 define <4 x i32> @usubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
361 ;CHECK-LABEL: usubw4s:
363 %tmp1 = load <4 x i32>, <4 x i32>* %A
364 %tmp2 = load <4 x i16>, <4 x i16>* %B
365 %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
366 %tmp4 = sub <4 x i32> %tmp1, %tmp3
; usubw2d: loads a <2 x i64> vector from %A and a <2 x i32> vector from %B,
; zero-extends the narrow one to <2 x i64>, and subtracts it from the wide one.
370 define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
371 ;CHECK-LABEL: usubw2d:
373 %tmp1 = load <2 x i64>, <2 x i64>* %A
374 %tmp2 = load <2 x i32>, <2 x i32>* %B
375 %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
376 %tmp4 = sub <2 x i64> %tmp1, %tmp3
; usubw2_8h: subtracts the zero-extended upper eight lanes of a <16 x i8> load
; (%B) from an <8 x i16> load (%A).
380 define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
381 ;CHECK-LABEL: usubw2_8h:
383 %tmp1 = load <8 x i16>, <8 x i16>* %A
385 %tmp2 = load <16 x i8>, <16 x i8>* %B
; Mask 8..15 keeps only the upper eight lanes of %tmp2.
386 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
387 %ext2 = zext <8 x i8> %high2 to <8 x i16>
389 %res = sub <8 x i16> %tmp1, %ext2
; usubw2_4s: subtracts the zero-extended upper four lanes of an <8 x i16> load
; (%B) from a <4 x i32> load (%A).
393 define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
394 ;CHECK-LABEL: usubw2_4s:
396 %tmp1 = load <4 x i32>, <4 x i32>* %A
398 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Mask 4..7 keeps only the upper four lanes of %tmp2.
399 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
400 %ext2 = zext <4 x i16> %high2 to <4 x i32>
402 %res = sub <4 x i32> %tmp1, %ext2
; usubw2_2d: subtracts the zero-extended upper two lanes of a <4 x i32> load
; (%B) from a <2 x i64> load (%A).
406 define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
407 ;CHECK-LABEL: usubw2_2d:
409 %tmp1 = load <2 x i64>, <2 x i64>* %A
411 %tmp2 = load <4 x i32>, <4 x i32>* %B
; Mask 2..3 keeps only the upper two lanes of %tmp2.
412 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
413 %ext2 = zext <2 x i32> %high2 to <2 x i64>
415 %res = sub <2 x i64> %tmp1, %ext2