; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

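; Round-trip tests: sign- or zero-extend each element to twice its width, then
; truncate straight back to the original element type.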
define arm_aapcs_vfpcc <4 x i32> @sext_trunc_i32(<4 x i32> %a) {
; CHECK-LABEL: sext_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %t = trunc <4 x i64> %sa to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @sext_trunc_i16(<8 x i16> %a) {
; CHECK-LABEL: sext_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %t = trunc <8 x i32> %sa to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @sext_trunc_i8(<16 x i8> %a) {
; CHECK-LABEL: sext_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %t = trunc <16 x i16> %sa to <16 x i8>
  ret <16 x i8> %t
}

define arm_aapcs_vfpcc <4 x i32> @zext_trunc_i32(<4 x i32> %a) {
; CHECK-LABEL: zext_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = zext <4 x i32> %a to <4 x i64>
  %t = trunc <4 x i64> %sa to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @zext_trunc_i16(<8 x i16> %a) {
; CHECK-LABEL: zext_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = zext <8 x i16> %a to <8 x i32>
  %t = trunc <8 x i32> %sa to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @zext_trunc_i8(<16 x i8> %a) {
; CHECK-LABEL: zext_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %sa = zext <16 x i8> %a to <16 x i16>
  %t = trunc <16 x i16> %sa to <16 x i8>
  ret <16 x i8> %t
}

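; Extend both operands, add in the wider type, then truncate the sum back to
; the original element width.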
define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.f32 s2, s5
; CHECK-NEXT:    add.w r12, r1, r0
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r12
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %t = trunc <4 x i64> %add to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @ext_add_trunc_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_add_trunc_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev32.16 q3, q0
; CHECK-NEXT:    vrev32.16 q2, q1
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %t = trunc <8 x i32> %add to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_add_trunc_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_trunc_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev16.8 q3, q0
; CHECK-NEXT:    vrev16.8 q2, q1
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = zext <16 x i8> %b to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %t = trunc <16 x i16> %add to <16 x i8>
  ret <16 x i8> %t
}

define arm_aapcs_vfpcc <16 x i16> @ext_add_trunc_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: ext_add_trunc_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vrev32.16 q5, q0
; CHECK-NEXT:    vrev32.16 q4, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q4, q5, q4
; CHECK-NEXT:    vmovnt.i32 q0, q4
; CHECK-NEXT:    vrev32.16 q4, q1
; CHECK-NEXT:    vrev32.16 q2, q3
; CHECK-NEXT:    vadd.i32 q1, q1, q3
; CHECK-NEXT:    vadd.i32 q2, q4, q2
; CHECK-NEXT:    vmovnt.i32 q1, q2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i16> %a to <16 x i32>
  %sb = zext <16 x i16> %b to <16 x i32>
  %add = add <16 x i32> %sa, %sb
  %t = trunc <16 x i32> %add to <16 x i16>
  ret <16 x i16> %t
}

define arm_aapcs_vfpcc <32 x i8> @ext_add_trunc_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: ext_add_trunc_v32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vrev16.8 q5, q0
; CHECK-NEXT:    vrev16.8 q4, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vadd.i16 q4, q5, q4
; CHECK-NEXT:    vmovnt.i16 q0, q4
; CHECK-NEXT:    vrev16.8 q4, q1
; CHECK-NEXT:    vrev16.8 q2, q3
; CHECK-NEXT:    vadd.i16 q1, q1, q3
; CHECK-NEXT:    vadd.i16 q2, q4, q2
; CHECK-NEXT:    vmovnt.i16 q1, q2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <32 x i8> %a to <32 x i16>
  %sb = zext <32 x i8> %b to <32 x i16>
  %add = add <32 x i16> %sa, %sb
  %t = trunc <32 x i16> %add to <32 x i8>
  ret <32 x i8> %t
}

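; As above, but with an ashr by 1 of the widened sum before the truncate.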
define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov.f32 s14, s7
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov r0, r1, d6
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov r12, lr, d7
; CHECK-NEXT:    vmov r4, s4
; CHECK-NEXT:    adds r0, r0, r4
; CHECK-NEXT:    asr.w r5, r4, #31
; CHECK-NEXT:    adcs r1, r5
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    adds r2, r2, r1
; CHECK-NEXT:    asr.w r4, r1, #31
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    lsrl r2, r3, #1
; CHECK-NEXT:    vmov r1, r5, d3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    adds.w r4, r3, r12
; CHECK-NEXT:    asr.w r6, r3, #31
; CHECK-NEXT:    adc.w r3, r6, lr
; CHECK-NEXT:    asrs r2, r0, #31
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r5
; CHECK-NEXT:    lsrl r4, r3, #1
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %sh = ashr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %t = trunc <4 x i64> %sh to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @ext_add_ashr_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q2, q1
; CHECK-NEXT:    vmovlb.s16 q3, q0
; CHECK-NEXT:    vmovlt.u16 q1, q1
; CHECK-NEXT:    vmovlt.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vshr.u32 q1, q0, #1
; CHECK-NEXT:    vshr.u32 q0, q2, #1
; CHECK-NEXT:    vmovnt.i32 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %sh = ashr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t = trunc <8 x i32> %sh to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q1
; CHECK-NEXT:    vmovlb.s8 q3, q0
; CHECK-NEXT:    vmovlt.u8 q1, q1
; CHECK-NEXT:    vmovlt.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vshr.u16 q1, q0, #1
; CHECK-NEXT:    vshr.u16 q0, q2, #1
; CHECK-NEXT:    vmovnt.i16 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = zext <16 x i8> %b to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %sh = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %t = trunc <16 x i16> %sh to <16 x i8>
  ret <16 x i8> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    .pad #112
; CHECK-NEXT:    sub sp, #112
; CHECK-NEXT:    add r1, sp, #16
; CHECK-NEXT:    mov r4, sp
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r4]
; CHECK-NEXT:    vldrb.u16 q0, [r1, #8]
; CHECK-NEXT:    add r3, sp, #64
; CHECK-NEXT:    add r5, sp, #32
; CHECK-NEXT:    add r0, sp, #80
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    add r2, sp, #48
; CHECK-NEXT:    vldrb.s16 q0, [r4, #8]
; CHECK-NEXT:    vstrw.32 q0, [r5]
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    add r1, sp, #96
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrb.s16 q0, [r4]
; CHECK-NEXT:    vstrw.32 q0, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r3, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r5, #8]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vstrb.32 q0, [r1, #12]
; CHECK-NEXT:    vldrh.u32 q0, [r3]
; CHECK-NEXT:    vldrh.s32 q1, [r5]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vstrb.32 q0, [r1, #8]
; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r2, #8]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vstrb.32 q0, [r1, #4]
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r2]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    add sp, #112
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %sa = sext <16 x i8> %a to <16 x i32>
  %sb = zext <16 x i8> %b to <16 x i32>
  %add = add <16 x i32> %sa, %sb
  %sh = ashr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %t = trunc <16 x i32> %sh to <16 x i8>
  ret <16 x i8> %t
}

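; Longer chains of mixed operations on the extended values, ending in a select
; and a truncate back to the narrow type.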
define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_ops_trunc_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.i64 q3, #0xffffffff
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vmov.f32 s12, s2
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov r12, r2, d5
; CHECK-NEXT:    vmov r8, r9, d3
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vmov lr, s2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    adds.w r4, r1, r12
; CHECK-NEXT:    asr.w r0, r1, #31
; CHECK-NEXT:    adc.w r5, r0, r2
; CHECK-NEXT:    asrl r4, r5, r12
; CHECK-NEXT:    subs.w r0, r4, r12
; CHECK-NEXT:    sbc.w r2, r5, r2
; CHECK-NEXT:    umull r0, r4, r0, r12
; CHECK-NEXT:    adds.w r6, lr, r8
; CHECK-NEXT:    mla r3, r2, r12, r4
; CHECK-NEXT:    asr.w r5, lr, #31
; CHECK-NEXT:    adc.w r5, r5, r9
; CHECK-NEXT:    rsbs r2, r1, #0
; CHECK-NEXT:    asrl r6, r5, r8
; CHECK-NEXT:    lsll r0, r3, r2
; CHECK-NEXT:    subs.w r7, r6, r8
; CHECK-NEXT:    vmov r6, r2, d4
; CHECK-NEXT:    sbc.w r10, r5, r9
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    lsll r0, r3, r12
; CHECK-NEXT:    adds r4, r5, r6
; CHECK-NEXT:    asr.w r3, r5, #31
; CHECK-NEXT:    adcs r3, r2
; CHECK-NEXT:    asrl r4, r3, r6
; CHECK-NEXT:    subs r4, r4, r6
; CHECK-NEXT:    sbc.w r2, r3, r2
; CHECK-NEXT:    umull r4, r3, r4, r6
; CHECK-NEXT:    mla r3, r2, r6, r3
; CHECK-NEXT:    rsbs r2, r5, #0
; CHECK-NEXT:    lsll r4, r3, r2
; CHECK-NEXT:    lsll r4, r3, r6
; CHECK-NEXT:    eors r6, r5
; CHECK-NEXT:    vmov q3[2], q3[0], r4, r0
; CHECK-NEXT:    umull r2, r0, r7, r8
; CHECK-NEXT:    orr.w r6, r6, r5, asr #31
; CHECK-NEXT:    mul r3, r7, r9
; CHECK-NEXT:    vmov r7, s0
; CHECK-NEXT:    orrs r0, r3
; CHECK-NEXT:    vmov r3, r4, d2
; CHECK-NEXT:    mla r11, r10, r8, r0
; CHECK-NEXT:    asr.w r9, r7, #31
; CHECK-NEXT:    adds r0, r7, r3
; CHECK-NEXT:    adc.w r9, r9, r4
; CHECK-NEXT:    asrl r0, r9, r3
; CHECK-NEXT:    subs.w r10, r0, r3
; CHECK-NEXT:    sbc.w r9, r9, r4
; CHECK-NEXT:    umull r0, r1, r10, r3
; CHECK-NEXT:    mul r4, r10, r4
; CHECK-NEXT:    orr.w r10, r1, r4
; CHECK-NEXT:    eor.w r1, lr, r8
; CHECK-NEXT:    orr.w r1, r1, lr, asr #31
; CHECK-NEXT:    eor.w r4, r7, r3
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    orr.w r4, r4, r7, asr #31
; CHECK-NEXT:    cset r1, eq
; CHECK-NEXT:    rsbs r7, r7, #0
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    csetm r1, ne
; CHECK-NEXT:    cmp r4, #0
; CHECK-NEXT:    cset r4, eq
; CHECK-NEXT:    cmp r4, #0
; CHECK-NEXT:    csetm r4, ne
; CHECK-NEXT:    vmov.32 q0[1], r4
; CHECK-NEXT:    vmov q0[2], q0[0], r4, r1
; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    vbic q4, q1, q0
; CHECK-NEXT:    eor.w r1, r4, r12
; CHECK-NEXT:    orr.w r1, r1, r4, asr #31
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    cset r1, eq
; CHECK-NEXT:    cmp r1, #0
; CHECK-NEXT:    csetm r1, ne
; CHECK-NEXT:    cmp r6, #0
; CHECK-NEXT:    cset r6, eq
; CHECK-NEXT:    cmp r6, #0
; CHECK-NEXT:    csetm r6, ne
; CHECK-NEXT:    vmov.32 q5[1], r6
; CHECK-NEXT:    vmov q5[2], q5[0], r6, r1
; CHECK-NEXT:    mla r1, r9, r3, r10
; CHECK-NEXT:    rsb.w r6, lr, #0
; CHECK-NEXT:    vbic q1, q2, q5
; CHECK-NEXT:    lsll r2, r11, r6
; CHECK-NEXT:    lsll r0, r1, r7
; CHECK-NEXT:    vand q2, q3, q5
; CHECK-NEXT:    lsll r2, r11, r8
; CHECK-NEXT:    lsll r0, r1, r3
; CHECK-NEXT:    vorr q1, q2, q1
; CHECK-NEXT:    vmov q2[2], q2[0], r0, r2
; CHECK-NEXT:    vand q0, q2, q0
; CHECK-NEXT:    vorr q0, q0, q4
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s6
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %sa = sext <4 x i32> %a to <4 x i64>
  %sb = zext <4 x i32> %b to <4 x i64>
  %add = add <4 x i64> %sa, %sb
  %ashr = ashr <4 x i64> %add, %sb
  %sub = sub <4 x i64> %ashr, %sb
  %mul = mul <4 x i64> %sub, %sb
  %lshr = lshr <4 x i64> %mul, %sa
  %shl = shl <4 x i64> %lshr, %sb
  %cmp = icmp eq <4 x i64> %sa, %sb
  %sel = select <4 x i1> %cmp, <4 x i64> %shl, <4 x i64> %sb
  %t = trunc <4 x i64> %sel to <4 x i32>
  ret <4 x i32> %t
}

define arm_aapcs_vfpcc <8 x i16> @ext_ops_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_ops_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmovlt.u16 q2, q1
; CHECK-NEXT:    vmovlt.s16 q3, q0
; CHECK-NEXT:    vadd.i32 q4, q3, q2
; CHECK-NEXT:    vneg.s32 q5, q2
; CHECK-NEXT:    vshl.s32 q4, q4, q5
; CHECK-NEXT:    vneg.s32 q5, q3
; CHECK-NEXT:    vsub.i32 q4, q4, q2
; CHECK-NEXT:    vcmp.i32 eq, q3, q2
; CHECK-NEXT:    vmul.i32 q4, q4, q2
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vshl.u32 q4, q4, q5
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vshl.u32 q4, q4, q2
; CHECK-NEXT:    vadd.i32 q3, q0, q1
; CHECK-NEXT:    vpsel q2, q4, q2
; CHECK-NEXT:    vneg.s32 q4, q1
; CHECK-NEXT:    vshl.s32 q3, q3, q4
; CHECK-NEXT:    vneg.s32 q4, q0
; CHECK-NEXT:    vsub.i32 q3, q3, q1
; CHECK-NEXT:    vcmp.i32 eq, q0, q1
; CHECK-NEXT:    vmul.i32 q3, q3, q1
; CHECK-NEXT:    vshl.u32 q3, q3, q4
; CHECK-NEXT:    vshl.u32 q3, q3, q1
; CHECK-NEXT:    vpsel q0, q3, q1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %add = add <8 x i32> %sa, %sb
  %ashr = ashr <8 x i32> %add, %sb
  %sub = sub <8 x i32> %ashr, %sb
  %mul = mul <8 x i32> %sub, %sb
  %lshr = lshr <8 x i32> %mul, %sa
  %shl = shl <8 x i32> %lshr, %sb
  %cmp = icmp eq <8 x i32> %sa, %sb
  %sel = select <8 x i1> %cmp, <8 x i32> %shl, <8 x i32> %sb
  %t = trunc <8 x i32> %sel to <8 x i16>
  ret <8 x i16> %t
}

define arm_aapcs_vfpcc <16 x i8> @ext_ops_trunc_i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_ops_trunc_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmovlt.u8 q2, q1
; CHECK-NEXT:    vmovlt.s8 q3, q0
; CHECK-NEXT:    vadd.i16 q4, q3, q2
; CHECK-NEXT:    vneg.s16 q5, q2
; CHECK-NEXT:    vshl.s16 q4, q4, q5
; CHECK-NEXT:    vneg.s16 q5, q3
; CHECK-NEXT:    vsub.i16 q4, q4, q2
; CHECK-NEXT:    vcmp.i16 eq, q3, q2
; CHECK-NEXT:    vmul.i16 q4, q4, q2
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vshl.u16 q4, q4, q5
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vshl.u16 q4, q4, q2
; CHECK-NEXT:    vadd.i16 q3, q0, q1
; CHECK-NEXT:    vpsel q2, q4, q2
; CHECK-NEXT:    vneg.s16 q4, q1
; CHECK-NEXT:    vshl.s16 q3, q3, q4
; CHECK-NEXT:    vneg.s16 q4, q0
; CHECK-NEXT:    vsub.i16 q3, q3, q1
; CHECK-NEXT:    vcmp.i16 eq, q0, q1
; CHECK-NEXT:    vmul.i16 q3, q3, q1
; CHECK-NEXT:    vshl.u16 q3, q3, q4
; CHECK-NEXT:    vshl.u16 q3, q3, q1
; CHECK-NEXT:    vpsel q0, q3, q1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <16 x i8> %a to <16 x i16>
  %sb = zext <16 x i8> %b to <16 x i16>
  %add = add <16 x i16> %sa, %sb
  %ashr = ashr <16 x i16> %add, %sb
  %sub = sub <16 x i16> %ashr, %sb
  %mul = mul <16 x i16> %sub, %sb
  %lshr = lshr <16 x i16> %mul, %sa
  %shl = shl <16 x i16> %lshr, %sb
  %cmp = icmp eq <16 x i16> %sa, %sb
  %sel = select <16 x i1> %cmp, <16 x i16> %shl, <16 x i16> %sb
  %t = trunc <16 x i16> %sel to <16 x i8>
  ret <16 x i8> %t
}

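; The extend/truncate pattern around saturating arithmetic, abs and min/max
; intrinsics.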
define arm_aapcs_vfpcc <8 x i16> @ext_intrinsics_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: ext_intrinsics_trunc_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmovlb.u16 q2, q1
; CHECK-NEXT:    vmovlb.s16 q3, q0
; CHECK-NEXT:    vqadd.s32 q4, q3, q2
; CHECK-NEXT:    vmovlt.u16 q1, q1
; CHECK-NEXT:    vqadd.u32 q4, q4, q2
; CHECK-NEXT:    vmovlt.s16 q0, q0
; CHECK-NEXT:    vqsub.s32 q4, q4, q3
; CHECK-NEXT:    vqsub.u32 q4, q4, q2
; CHECK-NEXT:    vabs.s32 q4, q4
; CHECK-NEXT:    vmin.s32 q4, q4, q3
; CHECK-NEXT:    vmax.s32 q4, q4, q2
; CHECK-NEXT:    vmin.u32 q3, q4, q3
; CHECK-NEXT:    vqadd.s32 q4, q0, q1
; CHECK-NEXT:    vqadd.u32 q4, q4, q1
; CHECK-NEXT:    vqsub.s32 q4, q4, q0
; CHECK-NEXT:    vqsub.u32 q4, q4, q1
; CHECK-NEXT:    vabs.s32 q4, q4
; CHECK-NEXT:    vmin.s32 q4, q4, q0
; CHECK-NEXT:    vmax.s32 q4, q4, q1
; CHECK-NEXT:    vmin.u32 q0, q4, q0
; CHECK-NEXT:    vmax.u32 q1, q0, q1
; CHECK-NEXT:    vmax.u32 q0, q3, q2
; CHECK-NEXT:    vmovnt.i32 q0, q1
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %sa = sext <8 x i16> %a to <8 x i32>
  %sb = zext <8 x i16> %b to <8 x i32>
  %sadd = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %sa, <8 x i32> %sb)
  %uadd = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %sadd, <8 x i32> %sb)
  %ssub = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %uadd, <8 x i32> %sa)
  %usub = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %ssub, <8 x i32> %sb)
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %usub, i1 true)
  %smin = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %abs, <8 x i32> %sa)
  %smax = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %smin, <8 x i32> %sb)
  %umin = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %smax, <8 x i32> %sa)
  %umax = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %umin, <8 x i32> %sb)
  %t = trunc <8 x i32> %umax to <8 x i16>
  ret <8 x i16> %t
}

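; fpext/fptrunc round trip around floating-point intrinsics.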
define arm_aapcs_vfpcc <8 x half> @ext_fpintrinsics_trunc_half(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: ext_fpintrinsics_trunc_half:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vcvtb.f32.f16 q2, q0
; CHECK-NEXT:    vcvtb.f32.f16 q4, q1
; CHECK-NEXT:    vabs.f32 q3, q2
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vminnm.f32 q3, q3, q2
; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
; CHECK-NEXT:    vmaxnm.f32 q3, q3, q4
; CHECK-NEXT:    vfma.f32 q4, q3, q2
; CHECK-NEXT:    vabs.f32 q3, q0
; CHECK-NEXT:    vminnm.f32 q3, q3, q0
; CHECK-NEXT:    vrintp.f32 q2, q4
; CHECK-NEXT:    vmaxnm.f32 q3, q3, q1
; CHECK-NEXT:    vrintm.f32 q2, q2
; CHECK-NEXT:    vfma.f32 q1, q3, q0
; CHECK-NEXT:    vrintx.f32 q2, q2
; CHECK-NEXT:    vrintp.f32 q0, q1
; CHECK-NEXT:    vrinta.f32 q2, q2
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vrintz.f32 q2, q2
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vrintz.f32 q1, q0
; CHECK-NEXT:    vcvtb.f16.f32 q0, q2
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %sa = fpext <8 x half> %a to <8 x float>
  %sb = fpext <8 x half> %b to <8 x float>
  %abs = call <8 x float> @llvm.fabs.v8f32(<8 x float> %sa)
  %min = call <8 x float> @llvm.minnum.v8f32(<8 x float> %abs, <8 x float> %sa)
  %max = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %min, <8 x float> %sb)
  %fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %max, <8 x float> %sa, <8 x float> %sb)
  %ceil = call <8 x float> @llvm.ceil.v8f32(<8 x float> %fma)
  %floor = call <8 x float> @llvm.floor.v8f32(<8 x float> %ceil)
  %rint = call <8 x float> @llvm.rint.v8f32(<8 x float> %floor)
  %round = call <8 x float> @llvm.round.v8f32(<8 x float> %rint)
  %trunc = call <8 x float> @llvm.trunc.v8f32(<8 x float> %round)
  %t = fptrunc <8 x float> %trunc to <8 x half>
  ret <8 x half> %t
}

declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.ceil.v8f32(<8 x float>)
declare <8 x float> @llvm.floor.v8f32(<8 x float>)
declare <8 x float> @llvm.rint.v8f32(<8 x float>)
declare <8 x float> @llvm.round.v8f32(<8 x float>)
declare <8 x float> @llvm.trunc.v8f32(<8 x float>)