1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
; addhn8b: loads <8 x i16> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addhn.v8i8, which yields an <8 x i8> value (%tmp3).
3 define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
6 %tmp1 = load <8 x i16>, <8 x i16>* %A
7 %tmp2 = load <8 x i16>, <8 x i16>* %B
8 %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
; addhn4h: loads <4 x i32> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addhn.v4i16, which yields a <4 x i16> value (%tmp3).
12 define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
13 ;CHECK-LABEL: addhn4h:
15 %tmp1 = load <4 x i32>, <4 x i32>* %A
16 %tmp2 = load <4 x i32>, <4 x i32>* %B
17 %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
; addhn2s: loads <2 x i64> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addhn.v2i32, which yields a <2 x i32> value (%tmp3).
21 define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
22 ;CHECK-LABEL: addhn2s:
24 %tmp1 = load <2 x i64>, <2 x i64>* %A
25 %tmp2 = load <2 x i64>, <2 x i64>* %B
26 %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
; addhn2_16b: calls @llvm.aarch64.neon.addhn.v8i8 twice on the same (%a, %b)
; operands and concatenates the two <8 x i8> results into one <16 x i8> with a
; shufflevector whose mask is 0..15. The directive below expects an
; `addhn2.16b` instruction on the line directly after the previous match.
30 define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
31 ;CHECK-LABEL: addhn2_16b:
33 ;CHECK-NEXT: addhn2.16b
34 %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
35 %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
36 %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; addhn2_8h: calls @llvm.aarch64.neon.addhn.v4i16 twice on the same (%a, %b)
; operands and concatenates the two <4 x i16> results into one <8 x i16> with a
; shufflevector whose mask is 0..7. The directive below expects an
; `addhn2.8h` instruction on the line directly after the previous match.
40 define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
41 ;CHECK-LABEL: addhn2_8h:
43 ;CHECK-NEXT: addhn2.8h
44 %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
45 %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
46 %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; addhn2_4s: calls @llvm.aarch64.neon.addhn.v2i32 twice on the same (%a, %b)
; operands and concatenates the two <2 x i32> results into one <4 x i32> with a
; shufflevector whose mask is 0..3. The directive below expects an
; `addhn2.4s` instruction on the line directly after the previous match.
50 define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
51 ;CHECK-LABEL: addhn2_4s:
53 ;CHECK-NEXT: addhn2.4s
54 %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
55 %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
56 %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
60 declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
61 declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
62 declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
; raddhn8b: loads <8 x i16> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.raddhn.v8i8, which yields an <8 x i8> value (%tmp3).
65 define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
66 ;CHECK-LABEL: raddhn8b:
68 %tmp1 = load <8 x i16>, <8 x i16>* %A
69 %tmp2 = load <8 x i16>, <8 x i16>* %B
70 %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
; raddhn4h: loads <4 x i32> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.raddhn.v4i16, which yields a <4 x i16> value (%tmp3).
74 define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
75 ;CHECK-LABEL: raddhn4h:
77 %tmp1 = load <4 x i32>, <4 x i32>* %A
78 %tmp2 = load <4 x i32>, <4 x i32>* %B
79 %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
; raddhn2s: loads <2 x i64> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.raddhn.v2i32, which yields a <2 x i32> value (%tmp3).
83 define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
84 ;CHECK-LABEL: raddhn2s:
86 %tmp1 = load <2 x i64>, <2 x i64>* %A
87 %tmp2 = load <2 x i64>, <2 x i64>* %B
88 %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
; raddhn2_16b: calls @llvm.aarch64.neon.raddhn.v8i8 twice on the same (%a, %b)
; operands and concatenates the two <8 x i8> results into one <16 x i8> with a
; shufflevector whose mask is 0..15. The directive below expects a
; `raddhn2.16b` instruction on the line directly after the previous match.
92 define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
93 ;CHECK-LABEL: raddhn2_16b:
95 ;CHECK-NEXT: raddhn2.16b
96 %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
97 %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
98 %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; raddhn2_8h: calls @llvm.aarch64.neon.raddhn.v4i16 twice on the same (%a, %b)
; operands and concatenates the two <4 x i16> results into one <8 x i16> with a
; shufflevector whose mask is 0..7. The directive below expects a
; `raddhn2.8h` instruction on the line directly after the previous match.
102 define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
103 ;CHECK-LABEL: raddhn2_8h:
105 ;CHECK-NEXT: raddhn2.8h
106 %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
107 %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
108 %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; raddhn2_4s: calls @llvm.aarch64.neon.raddhn.v2i32 twice on the same (%a, %b)
; operands and concatenates the two <2 x i32> results into one <4 x i32> with a
; shufflevector whose mask is 0..3. The directive below expects a
; `raddhn2.4s` instruction on the line directly after the previous match.
112 define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
113 ;CHECK-LABEL: raddhn2_4s:
115 ;CHECK-NEXT: raddhn2.4s
116 %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
117 %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
118 %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
122 declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
123 declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
124 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
; saddl8h: loads <8 x i8> vectors from %A and %B, sign-extends both to
; <8 x i16>, and adds the widened values (%tmp5).
126 define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
127 ;CHECK-LABEL: saddl8h:
129 %tmp1 = load <8 x i8>, <8 x i8>* %A
130 %tmp2 = load <8 x i8>, <8 x i8>* %B
131 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
132 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
133 %tmp5 = add <8 x i16> %tmp3, %tmp4
; saddl4s: loads <4 x i16> vectors from %A and %B, sign-extends both to
; <4 x i32>, and adds the widened values (%tmp5).
137 define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
138 ;CHECK-LABEL: saddl4s:
140 %tmp1 = load <4 x i16>, <4 x i16>* %A
141 %tmp2 = load <4 x i16>, <4 x i16>* %B
142 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
143 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
144 %tmp5 = add <4 x i32> %tmp3, %tmp4
; saddl2d: loads <2 x i32> vectors from %A and %B, sign-extends both to
; <2 x i64>, and adds the widened values (%tmp5).
148 define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
149 ;CHECK-LABEL: saddl2d:
151 %tmp1 = load <2 x i32>, <2 x i32>* %A
152 %tmp2 = load <2 x i32>, <2 x i32>* %B
153 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
154 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
155 %tmp5 = add <2 x i64> %tmp3, %tmp4
; saddl2_8h: for each <16 x i8> argument, bitcasts to <2 x i64>, selects
; element 1 as a <1 x i64>, reinterprets it as <8 x i8> and sign-extends it to
; <8 x i16>; the two widened vectors are then added. The directive below
; expects `saddl2.8h v0, v0, v1` on the line directly after the label.
159 define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
160 ; CHECK-LABEL: saddl2_8h:
161 ; CHECK-NEXT: saddl2.8h v0, v0, v1
163 %tmp = bitcast <16 x i8> %a to <2 x i64>
164 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
165 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
166 %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
167 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
168 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
169 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
170 %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
171 %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
; saddl2_4s: for each <8 x i16> argument, bitcasts to <2 x i64>, selects
; element 1 as a <1 x i64>, reinterprets it as <4 x i16> and sign-extends it to
; <4 x i32>; the two widened vectors are then added. The directive below
; expects `saddl2.4s v0, v0, v1` on the line directly after the label.
175 define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
176 ; CHECK-LABEL: saddl2_4s:
177 ; CHECK-NEXT: saddl2.4s v0, v0, v1
179 %tmp = bitcast <8 x i16> %a to <2 x i64>
180 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
181 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
182 %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
183 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
184 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
185 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
186 %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
187 %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
; saddl2_2d: for each <4 x i32> argument, bitcasts to <2 x i64>, selects
; element 1 as a <1 x i64>, reinterprets it as <2 x i32> and sign-extends it to
; <2 x i64>; the two widened vectors are then added. The directive below
; expects `saddl2.2d v0, v0, v1` on the line directly after the label.
191 define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
192 ; CHECK-LABEL: saddl2_2d:
193 ; CHECK-NEXT: saddl2.2d v0, v0, v1
195 %tmp = bitcast <4 x i32> %a to <2 x i64>
196 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
197 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
198 %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
199 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
200 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
201 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
202 %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
203 %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddl8h: loads <8 x i8> vectors from %A and %B, zero-extends both to
; <8 x i16>, and adds the widened values (%tmp5).
207 define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
208 ;CHECK-LABEL: uaddl8h:
210 %tmp1 = load <8 x i8>, <8 x i8>* %A
211 %tmp2 = load <8 x i8>, <8 x i8>* %B
212 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
213 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
214 %tmp5 = add <8 x i16> %tmp3, %tmp4
; uaddl4s: loads <4 x i16> vectors from %A and %B, zero-extends both to
; <4 x i32>, and adds the widened values (%tmp5).
218 define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
219 ;CHECK-LABEL: uaddl4s:
221 %tmp1 = load <4 x i16>, <4 x i16>* %A
222 %tmp2 = load <4 x i16>, <4 x i16>* %B
223 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
224 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
225 %tmp5 = add <4 x i32> %tmp3, %tmp4
; uaddl2d: loads <2 x i32> vectors from %A and %B, zero-extends both to
; <2 x i64>, and adds the widened values (%tmp5).
229 define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
230 ;CHECK-LABEL: uaddl2d:
232 %tmp1 = load <2 x i32>, <2 x i32>* %A
233 %tmp2 = load <2 x i32>, <2 x i32>* %B
234 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
235 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
236 %tmp5 = add <2 x i64> %tmp3, %tmp4
; uaddl2_8h: for each <16 x i8> argument, bitcasts to <2 x i64>, selects
; element 1 as a <1 x i64>, reinterprets it as <8 x i8> and zero-extends it to
; <8 x i16>; the two widened vectors are then added. The directive below
; expects `uaddl2.8h v0, v0, v1` on the line directly after the label.
241 define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
242 ; CHECK-LABEL: uaddl2_8h:
243 ; CHECK-NEXT: uaddl2.8h v0, v0, v1
245 %tmp = bitcast <16 x i8> %a to <2 x i64>
246 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
247 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
248 %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
249 %tmp2 = bitcast <16 x i8> %b to <2 x i64>
250 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
251 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
252 %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
253 %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddl2_4s: for each <8 x i16> argument, bitcasts to <2 x i64>, selects
; element 1 as a <1 x i64>, reinterprets it as <4 x i16> and zero-extends it to
; <4 x i32>; the two widened vectors are then added. The directive below
; expects `uaddl2.4s v0, v0, v1` on the line directly after the label.
257 define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
258 ; CHECK-LABEL: uaddl2_4s:
259 ; CHECK-NEXT: uaddl2.4s v0, v0, v1
261 %tmp = bitcast <8 x i16> %a to <2 x i64>
262 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
263 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
264 %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
265 %tmp2 = bitcast <8 x i16> %b to <2 x i64>
266 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
267 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
268 %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
269 %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddl2_2d: for each <4 x i32> argument, bitcasts to <2 x i64>, selects
; element 1 as a <1 x i64>, reinterprets it as <2 x i32> and zero-extends it to
; <2 x i64>; the two widened vectors are then added. The directive below
; expects `uaddl2.2d v0, v0, v1` on the line directly after the label.
273 define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
274 ; CHECK-LABEL: uaddl2_2d:
275 ; CHECK-NEXT: uaddl2.2d v0, v0, v1
277 %tmp = bitcast <4 x i32> %a to <2 x i64>
278 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
279 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
280 %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
281 %tmp2 = bitcast <4 x i32> %b to <2 x i64>
282 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
283 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
284 %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
285 %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
; uaddw8h: loads an <8 x i16> from %A and an <8 x i8> from %B, zero-extends
; the narrow vector to <8 x i16>, and adds it to the wide one (%tmp4).
289 define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
290 ;CHECK-LABEL: uaddw8h:
292 %tmp1 = load <8 x i16>, <8 x i16>* %A
293 %tmp2 = load <8 x i8>, <8 x i8>* %B
294 %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
295 %tmp4 = add <8 x i16> %tmp1, %tmp3
; uaddw4s: loads a <4 x i32> from %A and a <4 x i16> from %B, zero-extends
; the narrow vector to <4 x i32>, and adds it to the wide one (%tmp4).
299 define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
300 ;CHECK-LABEL: uaddw4s:
302 %tmp1 = load <4 x i32>, <4 x i32>* %A
303 %tmp2 = load <4 x i16>, <4 x i16>* %B
304 %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
305 %tmp4 = add <4 x i32> %tmp1, %tmp3
; uaddw2d: loads a <2 x i64> from %A and a <2 x i32> from %B, zero-extends
; the narrow vector to <2 x i64>, and adds it to the wide one (%tmp4).
309 define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
310 ;CHECK-LABEL: uaddw2d:
312 %tmp1 = load <2 x i64>, <2 x i64>* %A
313 %tmp2 = load <2 x i32>, <2 x i32>* %B
314 %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
315 %tmp4 = add <2 x i64> %tmp1, %tmp3
; uaddw2_8h: loads an <8 x i16> from %A and a <16 x i8> from %B, selects
; elements 8..15 of the latter, zero-extends them to <8 x i16>, and adds them
; to the wide vector (%res).
319 define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
320 ;CHECK-LABEL: uaddw2_8h:
322 %tmp1 = load <8 x i16>, <8 x i16>* %A
324 %tmp2 = load <16 x i8>, <16 x i8>* %B
325 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
326 %ext2 = zext <8 x i8> %high2 to <8 x i16>
328 %res = add <8 x i16> %tmp1, %ext2
; uaddw2_4s: loads a <4 x i32> from %A and an <8 x i16> from %B, selects
; elements 4..7 of the latter, zero-extends them to <4 x i32>, and adds them
; to the wide vector (%res).
332 define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
333 ;CHECK-LABEL: uaddw2_4s:
335 %tmp1 = load <4 x i32>, <4 x i32>* %A
337 %tmp2 = load <8 x i16>, <8 x i16>* %B
338 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
339 %ext2 = zext <4 x i16> %high2 to <4 x i32>
341 %res = add <4 x i32> %tmp1, %ext2
; uaddw2_2d: loads a <2 x i64> from %A and a <4 x i32> from %B, selects
; elements 2..3 of the latter, zero-extends them to <2 x i64>, and adds them
; to the wide vector (%res).
345 define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
346 ;CHECK-LABEL: uaddw2_2d:
348 %tmp1 = load <2 x i64>, <2 x i64>* %A
350 %tmp2 = load <4 x i32>, <4 x i32>* %B
351 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
352 %ext2 = zext <2 x i32> %high2 to <2 x i64>
354 %res = add <2 x i64> %tmp1, %ext2
; saddw8h: loads an <8 x i16> from %A and an <8 x i8> from %B, sign-extends
; the narrow vector to <8 x i16>, and adds it to the wide one (%tmp4).
358 define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
359 ;CHECK-LABEL: saddw8h:
361 %tmp1 = load <8 x i16>, <8 x i16>* %A
362 %tmp2 = load <8 x i8>, <8 x i8>* %B
363 %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
364 %tmp4 = add <8 x i16> %tmp1, %tmp3
; saddw4s: loads a <4 x i32> from %A and a <4 x i16> from %B, sign-extends
; the narrow vector to <4 x i32>, and adds it to the wide one (%tmp4).
368 define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
369 ;CHECK-LABEL: saddw4s:
371 %tmp1 = load <4 x i32>, <4 x i32>* %A
372 %tmp2 = load <4 x i16>, <4 x i16>* %B
373 %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
374 %tmp4 = add <4 x i32> %tmp1, %tmp3
; saddw2d: loads a <2 x i64> from %A and a <2 x i32> from %B, sign-extends
; the narrow vector to <2 x i64>, and adds it to the wide one (%tmp4).
378 define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
379 ;CHECK-LABEL: saddw2d:
381 %tmp1 = load <2 x i64>, <2 x i64>* %A
382 %tmp2 = load <2 x i32>, <2 x i32>* %B
383 %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
384 %tmp4 = add <2 x i64> %tmp1, %tmp3
; saddw2_8h: loads an <8 x i16> from %A and a <16 x i8> from %B, selects
; elements 8..15 of the latter, sign-extends them to <8 x i16>, and adds them
; to the wide vector (%res).
388 define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
389 ;CHECK-LABEL: saddw2_8h:
391 %tmp1 = load <8 x i16>, <8 x i16>* %A
393 %tmp2 = load <16 x i8>, <16 x i8>* %B
394 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
395 %ext2 = sext <8 x i8> %high2 to <8 x i16>
397 %res = add <8 x i16> %tmp1, %ext2
; saddw2_4s: loads a <4 x i32> from %A and an <8 x i16> from %B, selects
; elements 4..7 of the latter, sign-extends them to <4 x i32>, and adds them
; to the wide vector (%res).
401 define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
402 ;CHECK-LABEL: saddw2_4s:
404 %tmp1 = load <4 x i32>, <4 x i32>* %A
406 %tmp2 = load <8 x i16>, <8 x i16>* %B
407 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
408 %ext2 = sext <4 x i16> %high2 to <4 x i32>
410 %res = add <4 x i32> %tmp1, %ext2
; saddw2_2d: loads a <2 x i64> from %A and a <4 x i32> from %B, selects
; elements 2..3 of the latter, sign-extends them to <2 x i64>, and adds them
; to the wide vector (%res).
414 define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
415 ;CHECK-LABEL: saddw2_2d:
417 %tmp1 = load <2 x i64>, <2 x i64>* %A
419 %tmp2 = load <4 x i32>, <4 x i32>* %B
420 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
421 %ext2 = sext <2 x i32> %high2 to <2 x i64>
423 %res = add <2 x i64> %tmp1, %ext2
; saddlp4h: loads an <8 x i8> from %A and passes it to
; @llvm.aarch64.neon.saddlp.v4i16.v8i8, which yields a <4 x i16> (%tmp3).
427 define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
428 ;CHECK-LABEL: saddlp4h:
430 %tmp1 = load <8 x i8>, <8 x i8>* %A
431 %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
; saddlp2s: loads a <4 x i16> from %A and passes it to
; @llvm.aarch64.neon.saddlp.v2i32.v4i16, which yields a <2 x i32> (%tmp3).
435 define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
436 ;CHECK-LABEL: saddlp2s:
438 %tmp1 = load <4 x i16>, <4 x i16>* %A
439 %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
; saddlp1d: loads a <2 x i32> from %A and passes it to
; @llvm.aarch64.neon.saddlp.v1i64.v2i32, which yields a <1 x i64> (%tmp3).
443 define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
444 ;CHECK-LABEL: saddlp1d:
446 %tmp1 = load <2 x i32>, <2 x i32>* %A
447 %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
; saddlp8h: loads a <16 x i8> from %A and passes it to
; @llvm.aarch64.neon.saddlp.v8i16.v16i8, which yields an <8 x i16> (%tmp3).
451 define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
452 ;CHECK-LABEL: saddlp8h:
454 %tmp1 = load <16 x i8>, <16 x i8>* %A
455 %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
; saddlp4s: loads an <8 x i16> from %A and passes it to
; @llvm.aarch64.neon.saddlp.v4i32.v8i16, which yields a <4 x i32> (%tmp3).
459 define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
460 ;CHECK-LABEL: saddlp4s:
462 %tmp1 = load <8 x i16>, <8 x i16>* %A
463 %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
; saddlp2d: loads a <4 x i32> from %A and passes it to
; @llvm.aarch64.neon.saddlp.v2i64.v4i32, which yields a <2 x i64> (%tmp3).
467 define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
468 ;CHECK-LABEL: saddlp2d:
470 %tmp1 = load <4 x i32>, <4 x i32>* %A
471 %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
475 declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
476 declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
477 declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
479 declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
480 declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
481 declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
; uaddlp4h: loads an <8 x i8> from %A and passes it to
; @llvm.aarch64.neon.uaddlp.v4i16.v8i8, which yields a <4 x i16> (%tmp3).
483 define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
484 ;CHECK-LABEL: uaddlp4h:
486 %tmp1 = load <8 x i8>, <8 x i8>* %A
487 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
; uaddlp2s: loads a <4 x i16> from %A and passes it to
; @llvm.aarch64.neon.uaddlp.v2i32.v4i16, which yields a <2 x i32> (%tmp3).
491 define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
492 ;CHECK-LABEL: uaddlp2s:
494 %tmp1 = load <4 x i16>, <4 x i16>* %A
495 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
; uaddlp1d: loads a <2 x i32> from %A and passes it to
; @llvm.aarch64.neon.uaddlp.v1i64.v2i32, which yields a <1 x i64> (%tmp3).
499 define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
500 ;CHECK-LABEL: uaddlp1d:
502 %tmp1 = load <2 x i32>, <2 x i32>* %A
503 %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
; uaddlp8h: loads a <16 x i8> from %A and passes it to
; @llvm.aarch64.neon.uaddlp.v8i16.v16i8, which yields an <8 x i16> (%tmp3).
507 define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
508 ;CHECK-LABEL: uaddlp8h:
510 %tmp1 = load <16 x i8>, <16 x i8>* %A
511 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
; uaddlp4s: loads an <8 x i16> from %A and passes it to
; @llvm.aarch64.neon.uaddlp.v4i32.v8i16, which yields a <4 x i32> (%tmp3).
515 define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
516 ;CHECK-LABEL: uaddlp4s:
518 %tmp1 = load <8 x i16>, <8 x i16>* %A
519 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
; uaddlp2d: loads a <4 x i32> from %A and passes it to
; @llvm.aarch64.neon.uaddlp.v2i64.v4i32, which yields a <2 x i64> (%tmp3).
523 define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
524 ;CHECK-LABEL: uaddlp2d:
526 %tmp1 = load <4 x i32>, <4 x i32>* %A
527 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
531 declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
532 declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
533 declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
535 declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
536 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
537 declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
; sadalp4h: calls @llvm.aarch64.neon.saddlp.v4i16.v8i8 on the <8 x i8> loaded
; from %A, then adds the <4 x i16> loaded from %B to that result (%tmp5).
539 define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
540 ;CHECK-LABEL: sadalp4h:
542 %tmp1 = load <8 x i8>, <8 x i8>* %A
543 %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
544 %tmp4 = load <4 x i16>, <4 x i16>* %B
545 %tmp5 = add <4 x i16> %tmp3, %tmp4
; sadalp2s: calls @llvm.aarch64.neon.saddlp.v2i32.v4i16 on the <4 x i16> loaded
; from %A, then adds the <2 x i32> loaded from %B to that result (%tmp5).
549 define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
550 ;CHECK-LABEL: sadalp2s:
552 %tmp1 = load <4 x i16>, <4 x i16>* %A
553 %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
554 %tmp4 = load <2 x i32>, <2 x i32>* %B
555 %tmp5 = add <2 x i32> %tmp3, %tmp4
; sadalp8h: calls @llvm.aarch64.neon.saddlp.v8i16.v16i8 on the <16 x i8> loaded
; from %A, then adds the <8 x i16> loaded from %B to that result (%tmp5).
559 define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
560 ;CHECK-LABEL: sadalp8h:
562 %tmp1 = load <16 x i8>, <16 x i8>* %A
563 %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
564 %tmp4 = load <8 x i16>, <8 x i16>* %B
565 %tmp5 = add <8 x i16> %tmp3, %tmp4
; sadalp4s: calls @llvm.aarch64.neon.saddlp.v4i32.v8i16 on the <8 x i16> loaded
; from %A, then adds the <4 x i32> loaded from %B to that result (%tmp5).
569 define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
570 ;CHECK-LABEL: sadalp4s:
572 %tmp1 = load <8 x i16>, <8 x i16>* %A
573 %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
574 %tmp4 = load <4 x i32>, <4 x i32>* %B
575 %tmp5 = add <4 x i32> %tmp3, %tmp4
; sadalp2d: calls @llvm.aarch64.neon.saddlp.v2i64.v4i32 on the <4 x i32> loaded
; from %A, then adds the <2 x i64> loaded from %B to that result (%tmp5).
579 define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
580 ;CHECK-LABEL: sadalp2d:
582 %tmp1 = load <4 x i32>, <4 x i32>* %A
583 %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
584 %tmp4 = load <2 x i64>, <2 x i64>* %B
585 %tmp5 = add <2 x i64> %tmp3, %tmp4
; uadalp4h: calls @llvm.aarch64.neon.uaddlp.v4i16.v8i8 on the <8 x i8> loaded
; from %A, then adds the <4 x i16> loaded from %B to that result (%tmp5).
589 define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
590 ;CHECK-LABEL: uadalp4h:
592 %tmp1 = load <8 x i8>, <8 x i8>* %A
593 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
594 %tmp4 = load <4 x i16>, <4 x i16>* %B
595 %tmp5 = add <4 x i16> %tmp3, %tmp4
; uadalp2s: calls @llvm.aarch64.neon.uaddlp.v2i32.v4i16 on the <4 x i16> loaded
; from %A, then adds the <2 x i32> loaded from %B to that result (%tmp5).
599 define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
600 ;CHECK-LABEL: uadalp2s:
602 %tmp1 = load <4 x i16>, <4 x i16>* %A
603 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
604 %tmp4 = load <2 x i32>, <2 x i32>* %B
605 %tmp5 = add <2 x i32> %tmp3, %tmp4
; uadalp8h: calls @llvm.aarch64.neon.uaddlp.v8i16.v16i8 on the <16 x i8> loaded
; from %A, then adds the <8 x i16> loaded from %B to that result (%tmp5).
609 define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
610 ;CHECK-LABEL: uadalp8h:
612 %tmp1 = load <16 x i8>, <16 x i8>* %A
613 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
614 %tmp4 = load <8 x i16>, <8 x i16>* %B
615 %tmp5 = add <8 x i16> %tmp3, %tmp4
; uadalp4s: calls @llvm.aarch64.neon.uaddlp.v4i32.v8i16 on the <8 x i16> loaded
; from %A, then adds the <4 x i32> loaded from %B to that result (%tmp5).
619 define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
620 ;CHECK-LABEL: uadalp4s:
622 %tmp1 = load <8 x i16>, <8 x i16>* %A
623 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
624 %tmp4 = load <4 x i32>, <4 x i32>* %B
625 %tmp5 = add <4 x i32> %tmp3, %tmp4
; uadalp2d: calls @llvm.aarch64.neon.uaddlp.v2i64.v4i32 on the <4 x i32> loaded
; from %A, then adds the <2 x i64> loaded from %B to that result (%tmp5).
629 define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
630 ;CHECK-LABEL: uadalp2d:
632 %tmp1 = load <4 x i32>, <4 x i32>* %A
633 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
634 %tmp4 = load <2 x i64>, <2 x i64>* %B
635 %tmp5 = add <2 x i64> %tmp3, %tmp4
; addp_8b: loads <8 x i8> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v8i8 (%tmp3).
639 define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
640 ;CHECK-LABEL: addp_8b:
642 %tmp1 = load <8 x i8>, <8 x i8>* %A
643 %tmp2 = load <8 x i8>, <8 x i8>* %B
644 %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
; addp_16b: loads <16 x i8> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v16i8 (%tmp3).
648 define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
649 ;CHECK-LABEL: addp_16b:
651 %tmp1 = load <16 x i8>, <16 x i8>* %A
652 %tmp2 = load <16 x i8>, <16 x i8>* %B
653 %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
; addp_4h: loads <4 x i16> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v4i16 (%tmp3).
657 define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
658 ;CHECK-LABEL: addp_4h:
660 %tmp1 = load <4 x i16>, <4 x i16>* %A
661 %tmp2 = load <4 x i16>, <4 x i16>* %B
662 %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
; addp_8h: loads <8 x i16> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v8i16 (%tmp3).
666 define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
667 ;CHECK-LABEL: addp_8h:
669 %tmp1 = load <8 x i16>, <8 x i16>* %A
670 %tmp2 = load <8 x i16>, <8 x i16>* %B
671 %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
; addp_2s: loads <2 x i32> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v2i32 (%tmp3).
675 define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
676 ;CHECK-LABEL: addp_2s:
678 %tmp1 = load <2 x i32>, <2 x i32>* %A
679 %tmp2 = load <2 x i32>, <2 x i32>* %B
680 %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
; addp_4s: loads <4 x i32> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v4i32 (%tmp3).
684 define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
685 ;CHECK-LABEL: addp_4s:
687 %tmp1 = load <4 x i32>, <4 x i32>* %A
688 %tmp2 = load <4 x i32>, <4 x i32>* %B
689 %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; addp_2d: loads <2 x i64> vectors from %A and %B and passes them to
; @llvm.aarch64.neon.addp.v2i64 (%tmp3).
693 define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
694 ;CHECK-LABEL: addp_2d:
696 %tmp1 = load <2 x i64>, <2 x i64>* %A
697 %tmp2 = load <2 x i64>, <2 x i64>* %B
698 %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
702 declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
703 declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
704 declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
705 declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
706 declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
707 declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
708 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
; faddp_2s: loads <2 x float> vectors from %A and %B, passes them to
; @llvm.aarch64.neon.addp.v2f32, and returns the intrinsic's result.
710 define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
711 ;CHECK-LABEL: faddp_2s:
713 %tmp1 = load <2 x float>, <2 x float>* %A
714 %tmp2 = load <2 x float>, <2 x float>* %B
715 %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
716 ret <2 x float> %tmp3
; faddp_4s: loads <4 x float> vectors from %A and %B, passes them to
; @llvm.aarch64.neon.addp.v4f32, and returns the intrinsic's result.
719 define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
720 ;CHECK-LABEL: faddp_4s:
722 %tmp1 = load <4 x float>, <4 x float>* %A
723 %tmp2 = load <4 x float>, <4 x float>* %B
724 %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
725 ret <4 x float> %tmp3
; faddp_2d: loads <2 x double> vectors from %A and %B, passes them to
; @llvm.aarch64.neon.addp.v2f64, and returns the intrinsic's result.
728 define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
729 ;CHECK-LABEL: faddp_2d:
731 %tmp1 = load <2 x double>, <2 x double>* %A
732 %tmp2 = load <2 x double>, <2 x double>* %B
733 %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
734 ret <2 x double> %tmp3
737 declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
738 declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
739 declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
; uaddl_duprhs: builds a <2 x i32> holding %rhs in both lanes, selects lanes
; 0 and 1 of %lhs (the value is named %lhs.high even though it takes the first
; two lanes), zero-extends both to <2 x i64>, and adds them.
; The label directive ends in ':' so that it anchors on the emitted function
; label rather than on any other mention of the symbol name.
741 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
742 ; CHECK-LABEL: uaddl_duprhs:
745 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
746 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
748 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
750 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
751 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
753 %res = add <2 x i64> %lhs.ext, %rhs.ext
; uaddl2_duprhs: builds a <2 x i32> holding %rhs in both lanes, selects lanes
; 2 and 3 of %lhs, zero-extends both to <2 x i64>, and adds them.
; The label directive ends in ':' so that it anchors on the emitted function
; label rather than on any other mention of the symbol name.
757 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
758 ; CHECK-LABEL: uaddl2_duprhs:
761 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
762 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
764 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
766 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
767 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
769 %res = add <2 x i64> %lhs.ext, %rhs.ext
; saddl_duplhs: builds a <2 x i32> holding %lhs in both lanes, selects lanes
; 0 and 1 of %rhs (the value is named %rhs.high even though it takes the first
; two lanes), sign-extends both to <2 x i64>, and adds them.
; The label directive ends in ':' so that it anchors on the emitted function
; label rather than on any other mention of the symbol name.
773 define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
774 ; CHECK-LABEL: saddl_duplhs:
777 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
778 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
780 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
782 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
783 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
785 %res = add <2 x i64> %lhs.ext, %rhs.ext
; saddl2_duplhs: builds a <2 x i32> holding %lhs in both lanes, selects lanes
; 2 and 3 of %rhs, sign-extends both to <2 x i64>, and adds them.
; The label directive ends in ':' so that it anchors on the emitted function
; label rather than on any other mention of the symbol name.
789 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
790 ; CHECK-LABEL: saddl2_duplhs:
793 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
794 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
796 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
798 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
799 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
801 %res = add <2 x i64> %lhs.ext, %rhs.ext
; usubl_duprhs: builds a <2 x i32> holding %rhs in both lanes, selects lanes
; 0 and 1 of %lhs (the value is named %lhs.high even though it takes the first
; two lanes), zero-extends both to <2 x i64>, and subtracts rhs from lhs.
; The label directive ends in ':' so that it anchors on the emitted function
; label rather than on any other mention of the symbol name.
805 define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
806 ; CHECK-LABEL: usubl_duprhs:
809 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
810 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
812 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
814 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
815 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
817 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; usubl2_duprhs: builds a <2 x i32> holding %rhs in both lanes, selects lanes
; 2 and 3 of %lhs, zero-extends both to <2 x i64>, and subtracts rhs from lhs.
; The label directive ends in ':' so that it anchors on the emitted function
; label rather than on any other mention of the symbol name.
821 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
822 ; CHECK-LABEL: usubl2_duprhs:
825 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
826 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
828 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
830 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
831 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
833 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; ssubl_duplhs: builds a <2 x i32> holding %lhs in both lanes, selects lanes
; 0 and 1 of %rhs (the value is named %rhs.high even though it takes the first
; two lanes), sign-extends both to <2 x i64>, and subtracts rhs from lhs.
837 define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
838 ; CHECK-LABEL: ssubl_duplhs:
841 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
842 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
844 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
846 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
847 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
849 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; ssubl2_duplhs: builds a <2 x i32> holding %lhs in both lanes, selects lanes
; 2 and 3 of %rhs, sign-extends both to <2 x i64>, and subtracts rhs from lhs.
853 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
854 ; CHECK-LABEL: ssubl2_duplhs:
857 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
858 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
860 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
862 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
863 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
865 %res = sub <2 x i64> %lhs.ext, %rhs.ext
; addhn8b_natural: the same computation written with plain IR instead of an
; intrinsic: adds the <8 x i16> vectors loaded from %A and %B, shifts each lane
; right by 8 (logical), truncates to <8 x i8>, and returns that.
869 define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
870 ;CHECK-LABEL: addhn8b_natural:
872 %tmp1 = load <8 x i16>, <8 x i16>* %A
873 %tmp2 = load <8 x i16>, <8 x i16>* %B
874 %sum = add <8 x i16> %tmp1, %tmp2
875 %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
876 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
877 ret <8 x i8> %narrowed
; addhn4h_natural: adds the <4 x i32> vectors loaded from %A and %B, shifts
; each lane right by 16 (logical), truncates to <4 x i16>, and returns that.
880 define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
881 ;CHECK-LABEL: addhn4h_natural:
883 %tmp1 = load <4 x i32>, <4 x i32>* %A
884 %tmp2 = load <4 x i32>, <4 x i32>* %B
885 %sum = add <4 x i32> %tmp1, %tmp2
886 %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
887 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
888 ret <4 x i16> %narrowed
; addhn2s_natural: adds the <2 x i64> vectors loaded from %A and %B, shifts
; each lane right by 32 (logical), truncates to <2 x i32>, and returns that.
891 define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
892 ;CHECK-LABEL: addhn2s_natural:
894 %tmp1 = load <2 x i64>, <2 x i64>* %A
895 %tmp2 = load <2 x i64>, <2 x i64>* %B
896 %sum = add <2 x i64> %tmp1, %tmp2
897 %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
898 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
899 ret <2 x i32> %narrowed
; addhn2_16b_natural: adds the <8 x i16> vectors loaded from %A and %B, shifts
; each lane right by 8 (logical), truncates to <8 x i8>, and concatenates %low
; with that narrowed vector into a <16 x i8> (shuffle mask 0..15).
902 define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
903 ;CHECK-LABEL: addhn2_16b_natural:
905 %tmp1 = load <8 x i16>, <8 x i16>* %A
906 %tmp2 = load <8 x i16>, <8 x i16>* %B
907 %sum = add <8 x i16> %tmp1, %tmp2
908 %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
909 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
910 %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; addhn2_8h_natural: adds the <4 x i32> vectors loaded from %A and %B, shifts
; each lane right by 16 (logical), truncates to <4 x i16>, and concatenates
; %low with that narrowed vector into an <8 x i16> (shuffle mask 0..7).
914 define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
915 ;CHECK-LABEL: addhn2_8h_natural:
917 %tmp1 = load <4 x i32>, <4 x i32>* %A
918 %tmp2 = load <4 x i32>, <4 x i32>* %B
919 %sum = add <4 x i32> %tmp1, %tmp2
920 %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
921 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
922 %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; addhn2_4s_natural: adds the <2 x i64> vectors loaded from %A and %B, shifts
; each lane right by 32 (logical), truncates to <2 x i32>, and concatenates
; %low with that narrowed vector into a <4 x i32> (shuffle mask 0..3).
926 define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
927 ;CHECK-LABEL: addhn2_4s_natural:
929 %tmp1 = load <2 x i64>, <2 x i64>* %A
930 %tmp2 = load <2 x i64>, <2 x i64>* %B
931 %sum = add <2 x i64> %tmp1, %tmp2
932 %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
933 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
934 %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; subhn8b_natural: subtracts the <8 x i16> loaded from %B from the one loaded
; from %A, shifts each lane right by 8 (logical), truncates to <8 x i8>, and
; returns that.
938 define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
939 ;CHECK-LABEL: subhn8b_natural:
941 %tmp1 = load <8 x i16>, <8 x i16>* %A
942 %tmp2 = load <8 x i16>, <8 x i16>* %B
943 %diff = sub <8 x i16> %tmp1, %tmp2
944 %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
945 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
946 ret <8 x i8> %narrowed
; subhn4h_natural: subtracts the <4 x i32> loaded from %B from the one loaded
; from %A, shifts each lane right by 16 (logical), truncates to <4 x i16>, and
; returns that.
949 define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
950 ;CHECK-LABEL: subhn4h_natural:
952 %tmp1 = load <4 x i32>, <4 x i32>* %A
953 %tmp2 = load <4 x i32>, <4 x i32>* %B
954 %diff = sub <4 x i32> %tmp1, %tmp2
955 %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
956 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
957 ret <4 x i16> %narrowed
; subhn2s_natural: subtracts the <2 x i64> loaded from %B from the one loaded
; from %A, shifts each lane right by 32 (logical), truncates to <2 x i32>, and
; returns that.
960 define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
961 ;CHECK-LABEL: subhn2s_natural:
963 %tmp1 = load <2 x i64>, <2 x i64>* %A
964 %tmp2 = load <2 x i64>, <2 x i64>* %B
965 %diff = sub <2 x i64> %tmp1, %tmp2
966 %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
967 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
968 ret <2 x i32> %narrowed
; subhn2_16b_natural: subtracts the <8 x i16> loaded from %B from the one
; loaded from %A, shifts each lane right by 8 (logical), truncates to <8 x i8>,
; and concatenates %low with that narrowed vector into a <16 x i8> (shuffle
; mask 0..15).
971 define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
972 ;CHECK-LABEL: subhn2_16b_natural:
974 %tmp1 = load <8 x i16>, <8 x i16>* %A
975 %tmp2 = load <8 x i16>, <8 x i16>* %B
976 %diff = sub <8 x i16> %tmp1, %tmp2
977 %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
978 %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
979 %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; subhn2_8h_natural: subtracts the <4 x i32> loaded from %B from the one
; loaded from %A, shifts each lane right by 16 (logical), truncates to
; <4 x i16>, and concatenates %low with that narrowed vector into an <8 x i16>
; (shuffle mask 0..7).
983 define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
984 ;CHECK-LABEL: subhn2_8h_natural:
986 %tmp1 = load <4 x i32>, <4 x i32>* %A
987 %tmp2 = load <4 x i32>, <4 x i32>* %B
988 %diff = sub <4 x i32> %tmp1, %tmp2
989 %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
990 %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
991 %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
995 define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
996 ;CHECK-LABEL: subhn2_4s_natural:
998 %tmp1 = load <2 x i64>, <2 x i64>* %A
999 %tmp2 = load <2 x i64>, <2 x i64>* %B
1000 %diff = sub <2 x i64> %tmp1, %tmp2
1001 %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
1002 %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
1003 %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>