1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
3 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP
; fpext_4: extend a <4 x half> argument to <4 x float>.
; Both llc configurations share one set of expectations here: four scalar
; vcvtt/vcvtb.f32.f16 conversions reading s0/s1 and writing s0-s3.
5 define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
6 ; CHECK-LABEL: fpext_4:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: vcvtt.f32.f16 s3, s1
9 ; CHECK-NEXT: vcvtb.f32.f16 s2, s1
10 ; CHECK-NEXT: vcvtt.f32.f16 s1, s0
11 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0
14 %out = fpext <4 x half> %src1 to <4 x float>
; fpext_8: extend an <8 x half> argument to <8 x float>.
; Shared expectations for both configurations: eight scalar conversions that
; build the result in s4-s11, followed by a vmov of q2 into q0.
18 define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
19 ; CHECK-LABEL: fpext_8:
20 ; CHECK: @ %bb.0: @ %entry
21 ; CHECK-NEXT: vcvtt.f32.f16 s11, s1
22 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1
23 ; CHECK-NEXT: vcvtt.f32.f16 s9, s0
24 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0
25 ; CHECK-NEXT: vcvtt.f32.f16 s7, s3
26 ; CHECK-NEXT: vcvtb.f32.f16 s6, s3
27 ; CHECK-NEXT: vcvtt.f32.f16 s5, s2
28 ; CHECK-NEXT: vcvtb.f32.f16 s4, s2
29 ; CHECK-NEXT: vmov q0, q2
32 %out = fpext <8 x half> %src1 to <8 x float>
; fptrunc_4: truncate a <4 x float> argument to <4 x half>.
; Shared expectations for both configurations: four scalar
; vcvtb/vcvtt.f16.f32 conversions that pack two halves each into s0 and s1.
37 define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
38 ; CHECK-LABEL: fptrunc_4:
39 ; CHECK: @ %bb.0: @ %entry
40 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0
41 ; CHECK-NEXT: vcvtt.f16.f32 s0, s1
42 ; CHECK-NEXT: vcvtb.f16.f32 s1, s2
43 ; CHECK-NEXT: vcvtt.f16.f32 s1, s3
46 %out = fptrunc <4 x float> %src1 to <4 x half>
; fptrunc_8: truncate an <8 x float> argument to <8 x half>.
; Shared expectations for both configurations: eight scalar
; vcvtb/vcvtt.f16.f32 conversions reading s0-s7 and packing into s0-s3.
50 define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
51 ; CHECK-LABEL: fptrunc_8:
52 ; CHECK: @ %bb.0: @ %entry
53 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0
54 ; CHECK-NEXT: vcvtt.f16.f32 s0, s1
55 ; CHECK-NEXT: vcvtb.f16.f32 s1, s2
56 ; CHECK-NEXT: vcvtb.f16.f32 s2, s4
57 ; CHECK-NEXT: vcvtt.f16.f32 s1, s3
58 ; CHECK-NEXT: vcvtb.f16.f32 s3, s6
59 ; CHECK-NEXT: vcvtt.f16.f32 s2, s5
60 ; CHECK-NEXT: vcvtt.f16.f32 s3, s7
63 %out = fptrunc <8 x float> %src1 to <8 x half>
; shuffle_trunc1: interleave the lanes of %src1 and %src2 (mask 0,4,1,5,...)
; and then truncate the <8 x float> result to <8 x half>.
; With +mve.fp the expectation is a vector vcvtb/vcvtt pair; with
; +mve,+fullfp16 it is eight scalar conversions.
68 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
69 ; CHECK-MVE-LABEL: shuffle_trunc1:
70 ; CHECK-MVE: @ %bb.0: @ %entry
71 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
72 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
73 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
74 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
75 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
76 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
77 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
78 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
79 ; CHECK-MVE-NEXT: bx lr
81 ; CHECK-MVEFP-LABEL: shuffle_trunc1:
82 ; CHECK-MVEFP: @ %bb.0: @ %entry
83 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
84 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q1
85 ; CHECK-MVEFP-NEXT: bx lr
87 %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
88 %out = fptrunc <8 x float> %strided.vec to <8 x half>
; shuffle_trunc2: like an interleave-then-truncate of two <4 x float> inputs,
; but the mask (4,0,5,1,...) places %src2 lanes in the even positions.
; Both expectations contain one extra vmov compared with a plain
; vcvtb/vcvtt sequence: before the conversions for +mve,+fullfp16 and
; after them for +mve.fp.
92 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
93 ; CHECK-MVE-LABEL: shuffle_trunc2:
94 ; CHECK-MVE: @ %bb.0: @ %entry
95 ; CHECK-MVE-NEXT: vmov q2, q0
96 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
97 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
98 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
99 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
100 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
101 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
102 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
103 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
104 ; CHECK-MVE-NEXT: bx lr
106 ; CHECK-MVEFP-LABEL: shuffle_trunc2:
107 ; CHECK-MVEFP: @ %bb.0: @ %entry
108 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
109 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q0
110 ; CHECK-MVEFP-NEXT: vmov q0, q1
111 ; CHECK-MVEFP-NEXT: bx lr
113 %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
114 %out = fptrunc <8 x float> %strided.vec to <8 x half>
; shuffle_trunc3: interleave two <8 x float> inputs (mask 0,8,1,9,...) and
; truncate the <16 x float> result to <16 x half>.
; With +mve.fp the expectation is two vector vcvtb/vcvtt pairs; with
; +mve,+fullfp16 it is sixteen scalar conversions.
118 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
119 ; CHECK-MVE-LABEL: shuffle_trunc3:
120 ; CHECK-MVE: @ %bb.0: @ %entry
121 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
122 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
123 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
124 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
125 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4
126 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s5, s5
127 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s6, s6
128 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s7, s7
129 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
130 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
131 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
132 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
133 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s4, s12
134 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s5, s13
135 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s6, s14
136 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s7, s15
137 ; CHECK-MVE-NEXT: bx lr
139 ; CHECK-MVEFP-LABEL: shuffle_trunc3:
140 ; CHECK-MVEFP: @ %bb.0: @ %entry
141 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
142 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
143 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q2
144 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q3
145 ; CHECK-MVEFP-NEXT: bx lr
147 %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
148 %out = fptrunc <16 x float> %strided.vec to <16 x half>
; shuffle_trunc4: interleave two <8 x float> inputs with %src2 lanes in the
; even positions (mask 8,0,9,1,...) and truncate to <16 x half>.
; The +mve,+fullfp16 expectation saves and restores d8/d9 around sixteen
; scalar conversions; the +mve.fp expectation is two vector vcvtb/vcvtt
; pairs followed by two vmovs into q0/q1.
152 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
153 ; CHECK-MVE-LABEL: shuffle_trunc4:
154 ; CHECK-MVE: @ %bb.0: @ %entry
155 ; CHECK-MVE-NEXT: .vsave {d8, d9}
156 ; CHECK-MVE-NEXT: vpush {d8, d9}
157 ; CHECK-MVE-NEXT: vmov q4, q0
158 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
159 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9
160 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10
161 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11
162 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s12
163 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s9, s13
164 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s10, s14
165 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s11, s15
166 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s8, s4
167 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s9, s5
168 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s10, s6
169 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s11, s7
170 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s16
171 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s17
172 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s18
173 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s19
174 ; CHECK-MVE-NEXT: vmov q1, q2
175 ; CHECK-MVE-NEXT: vpop {d8, d9}
176 ; CHECK-MVE-NEXT: bx lr
178 ; CHECK-MVEFP-LABEL: shuffle_trunc4:
179 ; CHECK-MVEFP: @ %bb.0: @ %entry
180 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q2, q2
181 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q3, q3
182 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q2, q0
183 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q3, q1
184 ; CHECK-MVEFP-NEXT: vmov q0, q2
185 ; CHECK-MVEFP-NEXT: vmov q1, q3
186 ; CHECK-MVEFP-NEXT: bx lr
188 %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
189 %out = fptrunc <16 x float> %strided.vec to <16 x half>
; shuffle_trunc5: truncate %src1 and %src2 to <4 x half> separately, then
; interleave the two half vectors (mask 0,4,1,5,...).
; With +mve.fp the expectation is a vector vcvtb/vcvtt pair; with
; +mve,+fullfp16 it is eight scalar conversions.
193 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
194 ; CHECK-MVE-LABEL: shuffle_trunc5:
195 ; CHECK-MVE: @ %bb.0: @ %entry
196 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
197 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
198 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
199 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
200 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
201 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
202 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
203 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
204 ; CHECK-MVE-NEXT: bx lr
206 ; CHECK-MVEFP-LABEL: shuffle_trunc5:
207 ; CHECK-MVEFP: @ %bb.0: @ %entry
208 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
209 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q1
210 ; CHECK-MVEFP-NEXT: bx lr
212 %out1 = fptrunc <4 x float> %src1 to <4 x half>
213 %out2 = fptrunc <4 x float> %src2 to <4 x half>
214 %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; shuffle_trunc6: truncate %src1 and %src2 to <4 x half> separately, then
; interleave with %out2 lanes in the even positions (mask 4,0,5,1,...).
; Both expectations contain one extra vmov: before the scalar conversions
; for +mve,+fullfp16 and after the vector pair for +mve.fp.
218 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
219 ; CHECK-MVE-LABEL: shuffle_trunc6:
220 ; CHECK-MVE: @ %bb.0: @ %entry
221 ; CHECK-MVE-NEXT: vmov q2, q0
222 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
223 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
224 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
225 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
226 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
227 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
228 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
229 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
230 ; CHECK-MVE-NEXT: bx lr
232 ; CHECK-MVEFP-LABEL: shuffle_trunc6:
233 ; CHECK-MVEFP: @ %bb.0: @ %entry
234 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
235 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q0
236 ; CHECK-MVEFP-NEXT: vmov q0, q1
237 ; CHECK-MVEFP-NEXT: bx lr
239 %out1 = fptrunc <4 x float> %src1 to <4 x half>
240 %out2 = fptrunc <4 x float> %src2 to <4 x half>
241 %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
; shuffle_trunc7: truncate two <8 x float> inputs to <8 x half> separately,
; then interleave the half vectors (mask 0,8,1,9,...) into <16 x half>.
; With +mve.fp the expectation is two vector vcvtb/vcvtt pairs; with
; +mve,+fullfp16 it is sixteen scalar conversions.
245 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
246 ; CHECK-MVE-LABEL: shuffle_trunc7:
247 ; CHECK-MVE: @ %bb.0: @ %entry
248 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
249 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
250 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
251 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
252 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4
253 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s5, s5
254 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s6, s6
255 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s7, s7
256 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
257 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
258 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
259 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
260 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s4, s12
261 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s5, s13
262 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s6, s14
263 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s7, s15
264 ; CHECK-MVE-NEXT: bx lr
266 ; CHECK-MVEFP-LABEL: shuffle_trunc7:
267 ; CHECK-MVEFP: @ %bb.0: @ %entry
268 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
269 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
270 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q2
271 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q3
272 ; CHECK-MVEFP-NEXT: bx lr
274 %out1 = fptrunc <8 x float> %src1 to <8 x half>
275 %out2 = fptrunc <8 x float> %src2 to <8 x half>
276 %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; shuffle_trunc8: truncate two <8 x float> inputs to <8 x half> separately,
; then interleave with %out2 lanes in the even positions (mask 8,0,9,1,...).
; The +mve,+fullfp16 expectation saves and restores d8/d9 around sixteen
; scalar conversions; the +mve.fp expectation is two vector vcvtb/vcvtt
; pairs followed by two vmovs into q0/q1.
280 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
281 ; CHECK-MVE-LABEL: shuffle_trunc8:
282 ; CHECK-MVE: @ %bb.0: @ %entry
283 ; CHECK-MVE-NEXT: .vsave {d8, d9}
284 ; CHECK-MVE-NEXT: vpush {d8, d9}
285 ; CHECK-MVE-NEXT: vmov q4, q0
286 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
287 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9
288 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10
289 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11
290 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s12
291 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s9, s13
292 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s10, s14
293 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s11, s15
294 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s8, s4
295 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s9, s5
296 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s10, s6
297 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s11, s7
298 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s16
299 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s17
300 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s18
301 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s19
302 ; CHECK-MVE-NEXT: vmov q1, q2
303 ; CHECK-MVE-NEXT: vpop {d8, d9}
304 ; CHECK-MVE-NEXT: bx lr
306 ; CHECK-MVEFP-LABEL: shuffle_trunc8:
307 ; CHECK-MVEFP: @ %bb.0: @ %entry
308 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q2, q2
309 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q3, q3
310 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q2, q0
311 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q3, q1
312 ; CHECK-MVEFP-NEXT: vmov q0, q2
313 ; CHECK-MVEFP-NEXT: vmov q1, q3
314 ; CHECK-MVEFP-NEXT: bx lr
316 %out1 = fptrunc <8 x float> %src1 to <8 x half>
317 %out2 = fptrunc <8 x float> %src2 to <8 x half>
318 %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
; load_ext_4: load <4 x half> from %src (align 4) and extend to <4 x float>.
; With +mve.fp the expectation is a widening vldrh.u32 plus one vector
; vcvtb; with +mve,+fullfp16 it is an ldrd, two lane inserts and four
; scalar conversions.
325 define arm_aapcs_vfpcc <4 x float> @load_ext_4(ptr %src) {
326 ; CHECK-MVE-LABEL: load_ext_4:
327 ; CHECK-MVE: @ %bb.0: @ %entry
328 ; CHECK-MVE-NEXT: ldrd r0, r1, [r0]
329 ; CHECK-MVE-NEXT: vmov.32 q0[0], r0
330 ; CHECK-MVE-NEXT: vmov.32 q0[1], r1
331 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s3, s1
332 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s1
333 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s1, s0
334 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0
335 ; CHECK-MVE-NEXT: bx lr
337 ; CHECK-MVEFP-LABEL: load_ext_4:
338 ; CHECK-MVEFP: @ %bb.0: @ %entry
339 ; CHECK-MVEFP-NEXT: vldrh.u32 q0, [r0]
340 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
341 ; CHECK-MVEFP-NEXT: bx lr
343 %wide.load = load <4 x half>, ptr %src, align 4
344 %e = fpext <4 x half> %wide.load to <4 x float>
; load_ext_8: load <8 x half> from %src (align 4) and extend to <8 x float>.
; With +mve.fp the expectation is two widening vldrh.u32 loads (offsets 0
; and 8) each followed by a vector vcvtb; with +mve,+fullfp16 it is one
; vldrw.u32 and eight scalar conversions.
348 define arm_aapcs_vfpcc <8 x float> @load_ext_8(ptr %src) {
349 ; CHECK-MVE-LABEL: load_ext_8:
350 ; CHECK-MVE: @ %bb.0: @ %entry
351 ; CHECK-MVE-NEXT: vldrw.u32 q2, [r0]
352 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s3, s9
353 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s9
354 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s1, s8
355 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s8
356 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s11
357 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s6, s11
358 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s5, s10
359 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s4, s10
360 ; CHECK-MVE-NEXT: bx lr
362 ; CHECK-MVEFP-LABEL: load_ext_8:
363 ; CHECK-MVEFP: @ %bb.0: @ %entry
364 ; CHECK-MVEFP-NEXT: vldrh.u32 q0, [r0]
365 ; CHECK-MVEFP-NEXT: vldrh.u32 q1, [r0, #8]
366 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
367 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q1, q1
368 ; CHECK-MVEFP-NEXT: bx lr
370 %wide.load = load <8 x half>, ptr %src, align 4
371 %e = fpext <8 x half> %wide.load to <8 x float>
; load_ext_16: load <16 x half> from %src (align 4) and extend to
; <16 x float>.
; With +mve.fp the expectation is four widening vldrh.u32 loads (offsets
; 0, 8, 16, 24) and four vector vcvtb; with +mve,+fullfp16 it is two
; vldrw.u32 loads, sixteen scalar conversions, and a save/restore of d8/d9.
375 define arm_aapcs_vfpcc <16 x float> @load_ext_16(ptr %src) {
376 ; CHECK-MVE-LABEL: load_ext_16:
377 ; CHECK-MVE: @ %bb.0: @ %entry
378 ; CHECK-MVE-NEXT: .vsave {d8, d9}
379 ; CHECK-MVE-NEXT: vpush {d8, d9}
380 ; CHECK-MVE-NEXT: vldrw.u32 q2, [r0], #16
381 ; CHECK-MVE-NEXT: vldrw.u32 q4, [r0]
382 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s3, s9
383 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s9
384 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s1, s8
385 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s8
386 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s11
387 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s6, s11
388 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s5, s10
389 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s4, s10
390 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s11, s17
391 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s10, s17
392 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s9, s16
393 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s8, s16
394 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s15, s19
395 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s14, s19
396 ; CHECK-MVE-NEXT: vcvtt.f32.f16 s13, s18
397 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s12, s18
398 ; CHECK-MVE-NEXT: vpop {d8, d9}
399 ; CHECK-MVE-NEXT: bx lr
401 ; CHECK-MVEFP-LABEL: load_ext_16:
402 ; CHECK-MVEFP: @ %bb.0: @ %entry
403 ; CHECK-MVEFP-NEXT: vldrh.u32 q0, [r0]
404 ; CHECK-MVEFP-NEXT: vldrh.u32 q1, [r0, #8]
405 ; CHECK-MVEFP-NEXT: vldrh.u32 q2, [r0, #16]
406 ; CHECK-MVEFP-NEXT: vldrh.u32 q3, [r0, #24]
407 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
408 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q1, q1
409 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q2, q2
410 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q3, q3
411 ; CHECK-MVEFP-NEXT: bx lr
413 %wide.load = load <16 x half>, ptr %src, align 4
414 %e = fpext <16 x half> %wide.load to <16 x float>
; load_shuffleext_8: load <8 x half> from %src (align 4), keep the even
; lanes (mask 0,2,4,6) and extend them to <4 x float>.
; With +mve.fp the expectation is one vldrw.u32 plus a vector vcvtb; with
; +mve,+fullfp16 it is one vldrw.u32 plus four scalar vcvtb conversions.
; The unused second shuffle operand is spelled poison rather than undef;
; the mask only selects lanes of %wide.load, so it never contributes.
418 define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(ptr %src) {
419 ; CHECK-MVE-LABEL: load_shuffleext_8:
420 ; CHECK-MVE: @ %bb.0: @ %entry
421 ; CHECK-MVE-NEXT: vldrw.u32 q0, [r0]
422 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s3, s3
423 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2
424 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s1, s1
425 ; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0
426 ; CHECK-MVE-NEXT: bx lr
428 ; CHECK-MVEFP-LABEL: load_shuffleext_8:
429 ; CHECK-MVEFP: @ %bb.0: @ %entry
430 ; CHECK-MVEFP-NEXT: vldrw.u32 q0, [r0]
431 ; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
432 ; CHECK-MVEFP-NEXT: bx lr
434 %wide.load = load <8 x half>, ptr %src, align 4
435 %sh = shufflevector <8 x half> %wide.load, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
436 %e = fpext <4 x half> %sh to <4 x float>
; load_shuffleext_16: load <16 x half> from %src (align 4), keep the even
; lanes (mask 0,2,...,14) and extend them to <8 x float>.
; Both llc configurations share one set of expectations: a vld20/vld21
; pair into q2/q3 followed by eight scalar conversions reading s8-s11.
; The unused second shuffle operand is spelled poison rather than undef;
; the mask only selects lanes of %wide.load, so it never contributes.
440 define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(ptr %src) {
441 ; CHECK-LABEL: load_shuffleext_16:
442 ; CHECK: @ %bb.0: @ %entry
443 ; CHECK-NEXT: vld20.16 {q2, q3}, [r0]
444 ; CHECK-NEXT: vld21.16 {q2, q3}, [r0]
445 ; CHECK-NEXT: vcvtt.f32.f16 s3, s9
446 ; CHECK-NEXT: vcvtb.f32.f16 s2, s9
447 ; CHECK-NEXT: vcvtt.f32.f16 s1, s8
448 ; CHECK-NEXT: vcvtb.f32.f16 s0, s8
449 ; CHECK-NEXT: vcvtt.f32.f16 s7, s11
450 ; CHECK-NEXT: vcvtb.f32.f16 s6, s11
451 ; CHECK-NEXT: vcvtt.f32.f16 s5, s10
452 ; CHECK-NEXT: vcvtb.f32.f16 s4, s10
455 %wide.load = load <16 x half>, ptr %src, align 4
456 %sh = shufflevector <16 x half> %wide.load, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
457 %e = fpext <8 x half> %sh to <8 x float>
; store_trunc_4: truncate %val to <4 x half> and store it to %src (align 4).
; With +mve.fp the expectation is a vector vcvtb plus a narrowing
; vstrh.32; with +mve,+fullfp16 it is four scalar conversions, a vmov of
; d0 into r1/r2 and an strd.
464 define arm_aapcs_vfpcc void @store_trunc_4(ptr %src, <4 x float> %val) {
465 ; CHECK-MVE-LABEL: store_trunc_4:
466 ; CHECK-MVE: @ %bb.0: @ %entry
467 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
468 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s1
469 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s2
470 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s3
471 ; CHECK-MVE-NEXT: vmov r1, r2, d0
472 ; CHECK-MVE-NEXT: strd r1, r2, [r0]
473 ; CHECK-MVE-NEXT: bx lr
475 ; CHECK-MVEFP-LABEL: store_trunc_4:
476 ; CHECK-MVEFP: @ %bb.0: @ %entry
477 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
478 ; CHECK-MVEFP-NEXT: vstrh.32 q0, [r0]
479 ; CHECK-MVEFP-NEXT: bx lr
481 %e = fptrunc <4 x float> %val to <4 x half>
482 store <4 x half> %e, ptr %src, align 4
; store_trunc_8: truncate %val to <8 x half> and store it to %src (align 4).
; With +mve.fp the expectation is two vector vcvtb conversions and two
; narrowing vstrh.32 stores (offsets 8 and 0); with +mve,+fullfp16 it is
; eight scalar conversions and one vstrw.32.
486 define arm_aapcs_vfpcc void @store_trunc_8(ptr %src, <8 x float> %val) {
487 ; CHECK-MVE-LABEL: store_trunc_8:
488 ; CHECK-MVE: @ %bb.0: @ %entry
489 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
490 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s1
491 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s2
492 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s4
493 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s3
494 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s6
495 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s5
496 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
497 ; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
498 ; CHECK-MVE-NEXT: bx lr
500 ; CHECK-MVEFP-LABEL: store_trunc_8:
501 ; CHECK-MVEFP: @ %bb.0: @ %entry
502 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
503 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
504 ; CHECK-MVEFP-NEXT: vstrh.32 q1, [r0, #8]
505 ; CHECK-MVEFP-NEXT: vstrh.32 q0, [r0]
506 ; CHECK-MVEFP-NEXT: bx lr
508 %e = fptrunc <8 x float> %val to <8 x half>
509 store <8 x half> %e, ptr %src, align 4
; store_trunc_16: truncate %val to <16 x half> and store it to %src
; (align 4).
; With +mve.fp the expectation is four vector vcvtb conversions and four
; narrowing vstrh.32 stores (offsets 24, 16, 8, 0); with +mve,+fullfp16 it
; is two groups of eight scalar conversions, each followed by a full-width
; store, the first of which post-increments r0 by 16.
513 define arm_aapcs_vfpcc void @store_trunc_16(ptr %src, <16 x float> %val) {
514 ; CHECK-MVE-LABEL: store_trunc_16:
515 ; CHECK-MVE: @ %bb.0: @ %entry
516 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
517 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s1
518 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s2
519 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s4
520 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s3
521 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s6
522 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s5
523 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
524 ; CHECK-MVE-NEXT: vstrb.8 q0, [r0], #16
525 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
526 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s10
527 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s12
528 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s14
529 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s9
530 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s11
531 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s13
532 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s15
533 ; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
534 ; CHECK-MVE-NEXT: bx lr
536 ; CHECK-MVEFP-LABEL: store_trunc_16:
537 ; CHECK-MVEFP: @ %bb.0: @ %entry
538 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q3, q3
539 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q2, q2
540 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
541 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
542 ; CHECK-MVEFP-NEXT: vstrh.32 q3, [r0, #24]
543 ; CHECK-MVEFP-NEXT: vstrh.32 q2, [r0, #16]
544 ; CHECK-MVEFP-NEXT: vstrh.32 q1, [r0, #8]
545 ; CHECK-MVEFP-NEXT: vstrh.32 q0, [r0]
546 ; CHECK-MVEFP-NEXT: bx lr
548 %e = fptrunc <16 x float> %val to <16 x half>
549 store <16 x half> %e, ptr %src, align 4
; store_shuffletrunc_8: interleave %val1 and %val2 (mask 0,4,1,5,...),
; truncate to <8 x half> and store the result to %src (align 4).
; With +mve.fp the expectation is a vector vcvtb/vcvtt pair plus one
; vstrw.32; with +mve,+fullfp16 it is eight scalar conversions plus one
; vstrw.32.
553 define arm_aapcs_vfpcc void @store_shuffletrunc_8(ptr %src, <4 x float> %val1, <4 x float> %val2) {
554 ; CHECK-MVE-LABEL: store_shuffletrunc_8:
555 ; CHECK-MVE: @ %bb.0: @ %entry
556 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
557 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
558 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
559 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
560 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
561 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
562 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
563 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
564 ; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
565 ; CHECK-MVE-NEXT: bx lr
567 ; CHECK-MVEFP-LABEL: store_shuffletrunc_8:
568 ; CHECK-MVEFP: @ %bb.0: @ %entry
569 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
570 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q1
571 ; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
572 ; CHECK-MVEFP-NEXT: bx lr
574 %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
575 %out = fptrunc <8 x float> %strided.vec to <8 x half>
576 store <8 x half> %out, ptr %src, align 4
; store_shuffletrunc_16: interleave %val1 and %val2 (mask 0,8,1,9,...),
; truncate to <16 x half> and store the result to %src (align 4).
; With +mve.fp the expectation is two vector vcvtb/vcvtt pairs and two
; vstrw.32 stores (offsets 16 and 0); with +mve,+fullfp16 it is two groups
; of eight scalar conversions, each followed by a full-width store, the
; first of which post-increments r0 by 16.
580 define arm_aapcs_vfpcc void @store_shuffletrunc_16(ptr %src, <8 x float> %val1, <8 x float> %val2) {
581 ; CHECK-MVE-LABEL: store_shuffletrunc_16:
582 ; CHECK-MVE: @ %bb.0: @ %entry
583 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
584 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
585 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
586 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
587 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
588 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
589 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
590 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
591 ; CHECK-MVE-NEXT: vstrb.8 q0, [r0], #16
592 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
593 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
594 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
595 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
596 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s12
597 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s13
598 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s14
599 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s15
600 ; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
601 ; CHECK-MVE-NEXT: bx lr
603 ; CHECK-MVEFP-LABEL: store_shuffletrunc_16:
604 ; CHECK-MVEFP: @ %bb.0: @ %entry
605 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
606 ; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
607 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q3
608 ; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q2
609 ; CHECK-MVEFP-NEXT: vstrw.32 q1, [r0, #16]
610 ; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
611 ; CHECK-MVEFP-NEXT: bx lr
613 %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
614 %out = fptrunc <16 x float> %strided.vec to <16 x half>
615 store <16 x half> %out, ptr %src, align 4