; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
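
; These tests check that a multiply written as sign-extend, multiply,
; arithmetic shift right by (element bits - 1), then clamp against the signed
; maximum (the only side that can overflow) is recognized and lowered to the
; MVE VQDMULH instruction.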
define arm_aapcs_vfpcc i32 @vqdmulh_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
  ret i32 %l10
}
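
; The same pattern with the result truncated back to the element type, rather
; than reduced, still selects a single VQDMULH.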
define arm_aapcs_vfpcc <16 x i8> @vqdmulh_v16i8_b(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = trunc <16 x i32> %l9 to <16 x i8>
  ret <16 x i8> %l10
}
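
; For vectors narrower than 128 bits the VQDMULH result is sign-extended back
; into its container lanes with VMOVLB.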
define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v8i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i8> %s0 to <8 x i32>
  %l5 = sext <8 x i8> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <8 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = trunc <8 x i32> %l9 to <8 x i8>
  ret <8 x i8> %l10
}

define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v4i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i8> %s0 to <4 x i32>
  %l5 = sext <4 x i8> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <4 x i32> %l7, <i32 127, i32 127, i32 127, i32 127>
  %l9 = select <4 x i1> %l8, <4 x i32> %l7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
  %l10 = trunc <4 x i32> %l9 to <4 x i8>
  ret <4 x i8> %l10
}
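
; Vectors wider than 128 bits are split across two q registers, giving one
; VQDMULH per half.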
define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v32i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s8 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <32 x i8> %s0 to <32 x i32>
  %l5 = sext <32 x i8> %s1 to <32 x i32>
  %l6 = mul nsw <32 x i32> %l5, %l2
  %l7 = ashr <32 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <32 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <32 x i1> %l8, <32 x i32> %l7, <32 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = trunc <32 x i32> %l9 to <32 x i8>
  ret <32 x i8> %l10
}
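
; The i16 variants of the pattern shift by 15 and clamp at 32767.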
define arm_aapcs_vfpcc i32 @vqdmulh_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_b(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v4i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i16> %s0 to <4 x i32>
  %l5 = sext <4 x i16> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <4 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <4 x i1> %l8, <4 x i32> %l7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = trunc <4 x i32> %l9 to <4 x i16>
  ret <4 x i16> %l10
}

define arm_aapcs_vfpcc <16 x i16> @vqdmulh_v16i16_b(<16 x i16> %s0, <16 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v16i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s16 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i16> %s0 to <16 x i32>
  %l5 = sext <16 x i16> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <16 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = trunc <16 x i32> %l9 to <16 x i16>
  ret <16 x i16> %l10
}
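
; An i22 intermediate type is too narrow to hold the full 32-bit product, so
; this is not a VQDMULH and is expanded instead (VMULLB plus a shift pair that
; sign-extends from bit 21). The clamp against 32767 also folds away entirely,
; since an i22 value shifted right by 15 cannot exceed 63.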
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_c:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov.u16 r0, q0[2]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q2[3]
; CHECK-NEXT:    vmov.u16 r1, q2[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
; CHECK-NEXT:    vmullb.s16 q0, q3, q0
; CHECK-NEXT:    vshl.i32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q3, q0, #15
; CHECK-NEXT:    vmov r0, r1, d6
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vmov.16 q0[1], r1
; CHECK-NEXT:    vmov r0, r1, d7
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.u16 r0, q2[6]
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov.u16 r1, q2[4]
; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q2[7]
; CHECK-NEXT:    vmov.u16 r1, q2[5]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmullb.s16 q1, q2, q3
; CHECK-NEXT:    vshl.i32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #15
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov.16 q0[7], r1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i22>
  %l5 = sext <8 x i16> %s1 to <8 x i22>
  %l6 = mul nsw <8 x i22> %l5, %l2
  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
  %l8 = icmp slt <8 x i22> %l7, <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
  %l9 = select <8 x i1> %l8, <8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
  %l10 = trunc <8 x i22> %l9 to <8 x i16>
  ret <8 x i16> %l10
}
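
; Interleaving shuffles on both inputs cancel against the deinterleaving
; shuffle on the result, so a single VQDMULH still suffices.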
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %1 = sext <8 x i16> %0 to <8 x i32>
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %2 = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %3 = sext <8 x i16> %2 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %3, %1
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  %4 = shufflevector <8 x i16> %l10, <8 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %4
}
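
; The even and odd lanes are multiplied separately here, which becomes two
; half-width VQDMULHs: VREV32 exposes the odd lanes and VMOVNT reassembles the
; two results.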
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q2, q1, q0
; CHECK-NEXT:    vrev32.16 q1, q1
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovnt.i32 q2, q0
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
entry:
  %s0 = trunc <4 x i32> %s0a to <4 x i16>
  %strided.vec = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec44 = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %l7 = sext <4 x i16> %strided.vec to <4 x i32>
  %l8 = sext <4 x i16> %s0 to <4 x i32>
  %l9 = mul nsw <4 x i32> %l7, %l8
  %l10 = ashr <4 x i32> %l9, <i32 15, i32 15, i32 15, i32 15>
  %l11 = icmp slt <4 x i32> %l10, <i32 32767, i32 32767, i32 32767, i32 32767>
  %l12 = select <4 x i1> %l11, <4 x i32> %l10, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %l13 = trunc <4 x i32> %l12 to <4 x i16>
  %l14 = sext <4 x i16> %strided.vec44 to <4 x i32>
  %l15 = mul nsw <4 x i32> %l14, %l8
  %l16 = ashr <4 x i32> %l15, <i32 15, i32 15, i32 15, i32 15>
  %l17 = icmp slt <4 x i32> %l16, <i32 32767, i32 32767, i32 32767, i32 32767>
  %l18 = select <4 x i1> %l17, <4 x i32> %l16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
  %l19 = trunc <4 x i32> %l18 to <4 x i16>
  %interleaved.vec = shufflevector <4 x i16> %l13, <4 x i16> %l19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %interleaved.vec
}
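
; The i32 variants of the pattern shift by 31 and clamp at 2147483647; the
; reduction form uses the 64-bit accumulating VADDLV.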
define arm_aapcs_vfpcc i64 @vqdmulh_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
  ret i64 %l10
}

define arm_aapcs_vfpcc <4 x i32> @vqdmulh_v4i32_b(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = trunc <4 x i64> %l9 to <4 x i32>
  ret <4 x i32> %l10
}
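
; A <2 x i32> result is legalized to 64-bit lanes, so the top halves are
; filled with sign bits via ASRS after the VQDMULH.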
define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v2i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <2 x i32> %s0 to <2 x i64>
  %l5 = sext <2 x i32> %s1 to <2 x i64>
  %l6 = mul nsw <2 x i64> %l5, %l2
  %l7 = ashr <2 x i64> %l6, <i64 31, i64 31>
  %l8 = icmp slt <2 x i64> %l7, <i64 2147483647, i64 2147483647>
  %l9 = select <2 x i1> %l8, <2 x i64> %l7, <2 x i64> <i64 2147483647, i64 2147483647>
  %l10 = trunc <2 x i64> %l9 to <2 x i32>
  ret <2 x i32> %l10
}

define arm_aapcs_vfpcc <8 x i32> @vqdmulh_v8i32_b(<8 x i32> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v8i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s32 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i32> %s0 to <8 x i64>
  %l5 = sext <8 x i32> %s1 to <8 x i64>
  %l6 = mul nsw <8 x i64> %l5, %l2
  %l7 = ashr <8 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <8 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <8 x i1> %l8, <8 x i64> %l7, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = trunc <8 x i64> %l9 to <8 x i32>
  ret <8 x i32> %l10
}
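
; With sixteen i32 lanes per operand, the second operand no longer fits in
; registers and is loaded from the stack around each of the four VQDMULHs.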
define arm_aapcs_vfpcc <16 x i32> @vqdmulh_v16i32_b(<16 x i32> %s0, <16 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v16i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #32
; CHECK-NEXT:    vqdmulh.s32 q0, q4, q0
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #48
; CHECK-NEXT:    vqdmulh.s32 q1, q4, q1
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #64
; CHECK-NEXT:    vqdmulh.s32 q2, q4, q2
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vqdmulh.s32 q3, q4, q3
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i32> %s0 to <16 x i64>
  %l5 = sext <16 x i32> %s1 to <16 x i64>
  %l6 = mul nsw <16 x i64> %l5, %l2
  %l7 = ashr <16 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <16 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <16 x i1> %l8, <16 x i64> %l7, <16 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = trunc <16 x i64> %l9 to <16 x i32>
  ret <16 x i32> %l10
}
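
; The same matching applies inside vectorized loops, where the VQDMULH sits
; between post-incrementing loads and stores under a low-overhead loop.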
define void @vqdmulh_loop_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) local_unnamed_addr {
; CHECK-LABEL: vqdmulh_loop_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load26 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load26 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = ashr <16 x i32> %6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %8 = icmp slt <16 x i32> %7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %9 = select <16 x i1> %8, <16 x i32> %7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %10 = trunc <16 x i32> %9 to <16 x i8>
  %11 = getelementptr inbounds i8, i8* %z, i32 %index
  %12 = bitcast i8* %11 to <16 x i8>*
  store <16 x i8> %10, <16 x i8>* %12, align 1
  %index.next = add i32 %index, 16
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB18_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load30 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load30 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = ashr <8 x i32> %6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %9 = select <8 x i1> %8, <8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = getelementptr inbounds i16, i16* %z, i32 %index
  %12 = bitcast i16* %11 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %12, align 2
  %index.next = add i32 %index, 8
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB19_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load30 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load30 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = ashr <4 x i64> %6, <i64 31, i64 31, i64 31, i64 31>
  %8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %9 = select <4 x i1> %8, <4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %10 = trunc <4 x i64> %9 to <4 x i32>
  %11 = getelementptr inbounds i32, i32* %z, i32 %index
  %12 = bitcast i32* %11 to <4 x i32>*
  store <4 x i32> %10, <4 x i32>* %12, align 4
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)