1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; to_4: float -> half narrowing loop, 4 lanes per iteration.
; Each iteration loads <4 x float> from %x, multiplies it by a splat constant,
; truncates the product to <4 x half> and stores it to %y.
; The CHECK lines expect the loop to be closed by `le lr` with lr preset to #256,
; the multiplier materialised once in r2 (movw/movt), and a single
; vldrw.u32 / vmul.f32 / vcvtb.f16.f32 / vstrh.32 sequence per iteration.
4 define void @to_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: mov.w lr, #256
10 ; CHECK-NEXT: movw r2, #26214
11 ; CHECK-NEXT: movt r2, #16390
12 ; CHECK-NEXT: .LBB0_1: @ %vector.body
13 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
14 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
15 ; CHECK-NEXT: vmul.f32 q0, q0, r2
16 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
17 ; CHECK-NEXT: vstrh.32 q0, [r1], #8
18 ; CHECK-NEXT: le lr, .LBB0_1
19 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
20 ; CHECK-NEXT: pop {r7, pc}
24 vector.body: ; preds = %vector.body, %entry
25 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
26 %0 = getelementptr inbounds float, ptr %x, i32 %index
27 %wide.load = load <4 x float>, ptr %0, align 4
28 %1 = fmul <4 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
29 %2 = fptrunc <4 x float> %1 to <4 x half>
30 %3 = getelementptr inbounds half, ptr %y, i32 %index
31 store <4 x half> %2, ptr %3, align 2
32 %index.next = add i32 %index, 4 ; element index advances by the vector width
33 %4 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
34 br i1 %4, label %for.cond.cleanup, label %vector.body
36 for.cond.cleanup: ; preds = %vector.body
; to_8: float -> half narrowing loop, 8 lanes per iteration.
; Same shape as the 4-lane variant but on <8 x float> / <8 x half>.
; The CHECK lines expect the work to be split into two 4-lane halves, each with
; its own vldrw.u32 / vmul.f32 / vcvtb.f16.f32 / vstrh.32 sequence; the second
; half uses post-incrementing addressing (#32 on the source, #16 on the
; destination) and lr is preset to #128.
40 define void @to_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
42 ; CHECK: @ %bb.0: @ %entry
43 ; CHECK-NEXT: .save {r7, lr}
44 ; CHECK-NEXT: push {r7, lr}
45 ; CHECK-NEXT: mov.w lr, #128
46 ; CHECK-NEXT: movw r2, #26214
47 ; CHECK-NEXT: movt r2, #16390
48 ; CHECK-NEXT: .LBB1_1: @ %vector.body
49 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
50 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
51 ; CHECK-NEXT: vmul.f32 q0, q0, r2
52 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
53 ; CHECK-NEXT: vstrh.32 q0, [r1, #8]
54 ; CHECK-NEXT: vldrw.u32 q0, [r0], #32
55 ; CHECK-NEXT: vmul.f32 q0, q0, r2
56 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
57 ; CHECK-NEXT: vstrh.32 q0, [r1], #16
58 ; CHECK-NEXT: le lr, .LBB1_1
59 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
60 ; CHECK-NEXT: pop {r7, pc}
64 vector.body: ; preds = %vector.body, %entry
65 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
66 %0 = getelementptr inbounds float, ptr %x, i32 %index
67 %wide.load = load <8 x float>, ptr %0, align 4
68 %1 = fmul <8 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
69 %2 = fptrunc <8 x float> %1 to <8 x half>
70 %3 = getelementptr inbounds half, ptr %y, i32 %index
71 store <8 x half> %2, ptr %3, align 2
72 %index.next = add i32 %index, 8 ; element index advances by the vector width
73 %4 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
74 br i1 %4, label %for.cond.cleanup, label %vector.body
76 for.cond.cleanup: ; preds = %vector.body
; to_16: float -> half narrowing loop, 16 lanes per iteration.
; Loads <16 x float>, multiplies by a splat constant, truncates to <16 x half>
; and stores the result.
; The CHECK lines expect four 4-lane vldrw.u32 / vmul.f32 / vcvtb.f16.f32 /
; vstrh.32 groups per iteration (source offsets #48, #32, #16, then a
; post-increment of #64), with lr preset to #64.
80 define void @to_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
82 ; CHECK: @ %bb.0: @ %entry
83 ; CHECK-NEXT: .save {r7, lr}
84 ; CHECK-NEXT: push {r7, lr}
85 ; CHECK-NEXT: mov.w lr, #64
86 ; CHECK-NEXT: movw r2, #26214
87 ; CHECK-NEXT: movt r2, #16390
88 ; CHECK-NEXT: .LBB2_1: @ %vector.body
89 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
90 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
91 ; CHECK-NEXT: vmul.f32 q0, q0, r2
92 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
93 ; CHECK-NEXT: vstrh.32 q0, [r1, #24]
94 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
95 ; CHECK-NEXT: vmul.f32 q0, q0, r2
96 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
97 ; CHECK-NEXT: vstrh.32 q0, [r1, #16]
98 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
99 ; CHECK-NEXT: vmul.f32 q0, q0, r2
100 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
101 ; CHECK-NEXT: vstrh.32 q0, [r1, #8]
102 ; CHECK-NEXT: vldrw.u32 q0, [r0], #64
103 ; CHECK-NEXT: vmul.f32 q0, q0, r2
104 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
105 ; CHECK-NEXT: vstrh.32 q0, [r1], #32
106 ; CHECK-NEXT: le lr, .LBB2_1
107 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
108 ; CHECK-NEXT: pop {r7, pc}
110 br label %vector.body
112 vector.body: ; preds = %vector.body, %entry
113 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
114 %0 = getelementptr inbounds float, ptr %x, i32 %index
115 %wide.load = load <16 x float>, ptr %0, align 4
116 %1 = fmul <16 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
117 %2 = fptrunc <16 x float> %1 to <16 x half>
118 %3 = getelementptr inbounds half, ptr %y, i32 %index
119 store <16 x half> %2, ptr %3, align 2
120 %index.next = add i32 %index, 16 ; element index advances by the vector width
121 %4 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
122 br i1 %4, label %for.cond.cleanup, label %vector.body
124 for.cond.cleanup: ; preds = %vector.body
; from_4: half -> float widening loop, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x, extends it to <4 x float>,
; multiplies by a splat constant and stores the <4 x float> result to %y.
; The CHECK lines expect a vldrh.u32 load followed by vcvtb.f32.f16, a
; vmul.f32 by the scalar in r2, and a post-incrementing vstrb.8 store, with
; lr preset to #256.
128 define void @from_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
129 ; CHECK-LABEL: from_4:
130 ; CHECK: @ %bb.0: @ %entry
131 ; CHECK-NEXT: .save {r7, lr}
132 ; CHECK-NEXT: push {r7, lr}
133 ; CHECK-NEXT: mov.w lr, #256
134 ; CHECK-NEXT: movw r2, #26214
135 ; CHECK-NEXT: movt r2, #16390
136 ; CHECK-NEXT: .LBB3_1: @ %vector.body
137 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
138 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8
139 ; CHECK-NEXT: vcvtb.f32.f16 q0, q0
140 ; CHECK-NEXT: vmul.f32 q0, q0, r2
141 ; CHECK-NEXT: vstrb.8 q0, [r1], #16
142 ; CHECK-NEXT: le lr, .LBB3_1
143 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
144 ; CHECK-NEXT: pop {r7, pc}
146 br label %vector.body
148 vector.body: ; preds = %vector.body, %entry
149 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
150 %0 = getelementptr inbounds half, ptr %x, i32 %index
151 %wide.load = load <4 x half>, ptr %0, align 2
152 %1 = fpext <4 x half> %wide.load to <4 x float>
153 %2 = fmul <4 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
154 %3 = getelementptr inbounds float, ptr %y, i32 %index
155 store <4 x float> %2, ptr %3, align 4
156 %index.next = add i32 %index, 4 ; element index advances by the vector width
157 %4 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
158 br i1 %4, label %for.cond.cleanup, label %vector.body
160 for.cond.cleanup: ; preds = %vector.body
; from_8: half -> float widening loop, 8 lanes per iteration.
; Loads <8 x half>, extends to <8 x float>, multiplies by a splat constant and
; stores <8 x float>.
; The CHECK lines expect two vldrh.u32 loads (the first post-increments r0 by
; #16, the second then reads at [r0, #-8]), a vcvtb.f32.f16 + vmul.f32 per
; half, and two vstrw.32 stores, with lr preset to #128.
164 define void @from_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
165 ; CHECK-LABEL: from_8:
166 ; CHECK: @ %bb.0: @ %entry
167 ; CHECK-NEXT: .save {r7, lr}
168 ; CHECK-NEXT: push {r7, lr}
169 ; CHECK-NEXT: mov.w lr, #128
170 ; CHECK-NEXT: movw r2, #26214
171 ; CHECK-NEXT: movt r2, #16390
172 ; CHECK-NEXT: .LBB4_1: @ %vector.body
173 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
174 ; CHECK-NEXT: vldrh.u32 q0, [r0], #16
175 ; CHECK-NEXT: vldrh.u32 q1, [r0, #-8]
176 ; CHECK-NEXT: vcvtb.f32.f16 q0, q0
177 ; CHECK-NEXT: vmul.f32 q0, q0, r2
178 ; CHECK-NEXT: vcvtb.f32.f16 q1, q1
179 ; CHECK-NEXT: vmul.f32 q1, q1, r2
180 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
181 ; CHECK-NEXT: vstrw.32 q0, [r1], #32
182 ; CHECK-NEXT: le lr, .LBB4_1
183 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
184 ; CHECK-NEXT: pop {r7, pc}
186 br label %vector.body
188 vector.body: ; preds = %vector.body, %entry
189 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
190 %0 = getelementptr inbounds half, ptr %x, i32 %index
191 %wide.load = load <8 x half>, ptr %0, align 2
192 %1 = fpext <8 x half> %wide.load to <8 x float>
193 %2 = fmul <8 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
194 %3 = getelementptr inbounds float, ptr %y, i32 %index
195 store <8 x float> %2, ptr %3, align 4
196 %index.next = add i32 %index, 8 ; element index advances by the vector width
197 %4 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
198 br i1 %4, label %for.cond.cleanup, label %vector.body
200 for.cond.cleanup: ; preds = %vector.body
; from_16: half -> float widening loop, 16 lanes per iteration.
; Loads <16 x half>, extends to <16 x float>, multiplies by a splat constant
; and stores <16 x float>.
; The CHECK lines expect four vldrh.u32 loads into q0-q3 (the first
; post-increments r0 by #32, the rest use negative offsets), four
; vcvtb.f32.f16 conversions, four vmul.f32 by r2 and four vstrw.32 stores,
; with lr preset to #64.
204 define void @from_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
205 ; CHECK-LABEL: from_16:
206 ; CHECK: @ %bb.0: @ %entry
207 ; CHECK-NEXT: .save {r7, lr}
208 ; CHECK-NEXT: push {r7, lr}
209 ; CHECK-NEXT: mov.w lr, #64
210 ; CHECK-NEXT: movw r2, #26214
211 ; CHECK-NEXT: movt r2, #16390
212 ; CHECK-NEXT: .LBB5_1: @ %vector.body
213 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
214 ; CHECK-NEXT: vldrh.u32 q0, [r0], #32
215 ; CHECK-NEXT: vldrh.u32 q1, [r0, #-24]
216 ; CHECK-NEXT: vldrh.u32 q2, [r0, #-16]
217 ; CHECK-NEXT: vldrh.u32 q3, [r0, #-8]
218 ; CHECK-NEXT: vcvtb.f32.f16 q0, q0
219 ; CHECK-NEXT: vcvtb.f32.f16 q1, q1
220 ; CHECK-NEXT: vcvtb.f32.f16 q2, q2
221 ; CHECK-NEXT: vcvtb.f32.f16 q3, q3
222 ; CHECK-NEXT: vmul.f32 q2, q2, r2
223 ; CHECK-NEXT: vmul.f32 q3, q3, r2
224 ; CHECK-NEXT: vmul.f32 q1, q1, r2
225 ; CHECK-NEXT: vmul.f32 q0, q0, r2
226 ; CHECK-NEXT: vstrw.32 q3, [r1, #48]
227 ; CHECK-NEXT: vstrw.32 q2, [r1, #32]
228 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
229 ; CHECK-NEXT: vstrw.32 q0, [r1], #64
230 ; CHECK-NEXT: le lr, .LBB5_1
231 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
232 ; CHECK-NEXT: pop {r7, pc}
234 br label %vector.body
236 vector.body: ; preds = %vector.body, %entry
237 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
238 %0 = getelementptr inbounds half, ptr %x, i32 %index
239 %wide.load = load <16 x half>, ptr %0, align 2
240 %1 = fpext <16 x half> %wide.load to <16 x float>
241 %2 = fmul <16 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
242 %3 = getelementptr inbounds float, ptr %y, i32 %index
243 store <16 x float> %2, ptr %3, align 4
244 %index.next = add i32 %index, 16 ; element index advances by the vector width
245 %4 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
246 br i1 %4, label %for.cond.cleanup, label %vector.body
248 for.cond.cleanup: ; preds = %vector.body
; both_4: half -> float -> half round-trip loop, 4 lanes per iteration.
; Each iteration loads <4 x half>, extends to <4 x float>, multiplies by a
; splat constant, truncates back to <4 x half> and stores to %y.
; The CHECK lines expect vldrh.u32, vcvtb.f32.f16, vmul.f32 by r2,
; vcvtb.f16.f32 and a post-incrementing vstrh.32, with lr preset to #256.
252 define void @both_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
253 ; CHECK-LABEL: both_4:
254 ; CHECK: @ %bb.0: @ %entry
255 ; CHECK-NEXT: .save {r7, lr}
256 ; CHECK-NEXT: push {r7, lr}
257 ; CHECK-NEXT: mov.w lr, #256
258 ; CHECK-NEXT: movw r2, #26214
259 ; CHECK-NEXT: movt r2, #16390
260 ; CHECK-NEXT: .LBB6_1: @ %vector.body
261 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
262 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8
263 ; CHECK-NEXT: vcvtb.f32.f16 q0, q0
264 ; CHECK-NEXT: vmul.f32 q0, q0, r2
265 ; CHECK-NEXT: vcvtb.f16.f32 q0, q0
266 ; CHECK-NEXT: vstrh.32 q0, [r1], #8
267 ; CHECK-NEXT: le lr, .LBB6_1
268 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
269 ; CHECK-NEXT: pop {r7, pc}
271 br label %vector.body
273 vector.body: ; preds = %vector.body, %entry
274 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
275 %0 = getelementptr inbounds half, ptr %x, i32 %index
276 %wide.load = load <4 x half>, ptr %0, align 2
277 %1 = fpext <4 x half> %wide.load to <4 x float>
278 %2 = fmul <4 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
279 %3 = fptrunc <4 x float> %2 to <4 x half>
280 %4 = getelementptr inbounds half, ptr %y, i32 %index
281 store <4 x half> %3, ptr %4, align 2
282 %index.next = add i32 %index, 4 ; element index advances by the vector width
283 %5 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
284 br i1 %5, label %for.cond.cleanup, label %vector.body
286 for.cond.cleanup: ; preds = %vector.body
; both_8: half -> float -> half round-trip loop, 8 lanes per iteration.
; Loads <8 x half>, extends to <8 x float>, multiplies by a splat constant,
; truncates back to <8 x half> and stores.
; The CHECK lines expect one full-width vldrh.u16 load, a vcvtb/vcvtt.f32.f16
; pair producing two float vectors, two vmul.f32 by r2, a vcvtb/vcvtt.f16.f32
; pair merging the results into q1, and one post-incrementing vstrb.8 store,
; with lr preset to #128.
290 define void @both_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
291 ; CHECK-LABEL: both_8:
292 ; CHECK: @ %bb.0: @ %entry
293 ; CHECK-NEXT: .save {r7, lr}
294 ; CHECK-NEXT: push {r7, lr}
295 ; CHECK-NEXT: mov.w lr, #128
296 ; CHECK-NEXT: movw r2, #26214
297 ; CHECK-NEXT: movt r2, #16390
298 ; CHECK-NEXT: .LBB7_1: @ %vector.body
299 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
300 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
301 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0
302 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0
303 ; CHECK-NEXT: vmul.f32 q1, q1, r2
304 ; CHECK-NEXT: vmul.f32 q0, q0, r2
305 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1
306 ; CHECK-NEXT: vcvtt.f16.f32 q1, q0
307 ; CHECK-NEXT: vstrb.8 q1, [r1], #16
308 ; CHECK-NEXT: le lr, .LBB7_1
309 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
310 ; CHECK-NEXT: pop {r7, pc}
312 br label %vector.body
314 vector.body: ; preds = %vector.body, %entry
315 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
316 %0 = getelementptr inbounds half, ptr %x, i32 %index
317 %wide.load = load <8 x half>, ptr %0, align 2
318 %1 = fpext <8 x half> %wide.load to <8 x float>
319 %2 = fmul <8 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
320 %3 = fptrunc <8 x float> %2 to <8 x half>
321 %4 = getelementptr inbounds half, ptr %y, i32 %index
322 store <8 x half> %3, ptr %4, align 2
323 %index.next = add i32 %index, 8 ; element index advances by the vector width
324 %5 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
325 br i1 %5, label %for.cond.cleanup, label %vector.body
327 for.cond.cleanup: ; preds = %vector.body
; both_16: half -> float -> half round-trip loop, 16 lanes per iteration.
; Loads <16 x half>, extends to <16 x float>, multiplies by a splat constant,
; truncates back to <16 x half> and stores.
; The CHECK lines expect two 8-lane groups per iteration, each made of a
; vldrh.u16 load, a vcvtb/vcvtt.f32.f16 pair, two vmul.f32 by r2, a
; vcvtb/vcvtt.f16.f32 pair and a vstrh.16 store; the second group's load and
; store post-increment the pointers by #32 and lr is preset to #64.
331 define void @both_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
332 ; CHECK-LABEL: both_16:
333 ; CHECK: @ %bb.0: @ %entry
334 ; CHECK-NEXT: .save {r7, lr}
335 ; CHECK-NEXT: push {r7, lr}
336 ; CHECK-NEXT: mov.w lr, #64
337 ; CHECK-NEXT: movw r2, #26214
338 ; CHECK-NEXT: movt r2, #16390
339 ; CHECK-NEXT: .LBB8_1: @ %vector.body
340 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
341 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
342 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0
343 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0
344 ; CHECK-NEXT: vmul.f32 q1, q1, r2
345 ; CHECK-NEXT: vmul.f32 q0, q0, r2
346 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1
347 ; CHECK-NEXT: vcvtt.f16.f32 q1, q0
348 ; CHECK-NEXT: vldrh.u16 q0, [r0], #32
349 ; CHECK-NEXT: vstrh.16 q1, [r1, #16]
350 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0
351 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0
352 ; CHECK-NEXT: vmul.f32 q1, q1, r2
353 ; CHECK-NEXT: vmul.f32 q0, q0, r2
354 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1
355 ; CHECK-NEXT: vcvtt.f16.f32 q1, q0
356 ; CHECK-NEXT: vstrh.16 q1, [r1], #32
357 ; CHECK-NEXT: le lr, .LBB8_1
358 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
359 ; CHECK-NEXT: pop {r7, pc}
361 br label %vector.body
363 vector.body: ; preds = %vector.body, %entry
364 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
365 %0 = getelementptr inbounds half, ptr %x, i32 %index
366 %wide.load = load <16 x half>, ptr %0, align 2
367 %1 = fpext <16 x half> %wide.load to <16 x float>
368 %2 = fmul <16 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
369 %3 = fptrunc <16 x float> %2 to <16 x half>
370 %4 = getelementptr inbounds half, ptr %y, i32 %index
371 store <16 x half> %3, ptr %4, align 2
372 %index.next = add i32 %index, 16 ; element index advances by the vector width
373 %5 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
374 br i1 %5, label %for.cond.cleanup, label %vector.body
376 for.cond.cleanup: ; preds = %vector.body
; both_8_I: half -> float -> half round-trip with explicit lane interleaving.
; The loaded <8 x half> is split by shufflevector into its even-indexed and
; odd-indexed lanes; each <4 x half> half is extended and multiplied by a
; splat constant separately, then the two <4 x float> results are
; re-interleaved (mask 0,4,1,5,2,6,3,7) before the fptrunc and store.
; The CHECK lines expect the same loop-body instruction sequence as the one
; checked for both_8: one vldrh.u16, vcvtb/vcvtt pairs in both directions, two
; vmul.f32 by r2 and a single vstrb.8 store, with lr preset to #128.
380 define void @both_8_I(ptr nocapture readonly %x, ptr noalias nocapture %y) {
381 ; CHECK-LABEL: both_8_I:
382 ; CHECK: @ %bb.0: @ %entry
383 ; CHECK-NEXT: .save {r7, lr}
384 ; CHECK-NEXT: push {r7, lr}
385 ; CHECK-NEXT: mov.w lr, #128
386 ; CHECK-NEXT: movw r2, #26214
387 ; CHECK-NEXT: movt r2, #16390
388 ; CHECK-NEXT: .LBB9_1: @ %vector.body
389 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
390 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
391 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0
392 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0
393 ; CHECK-NEXT: vmul.f32 q1, q1, r2
394 ; CHECK-NEXT: vmul.f32 q0, q0, r2
395 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1
396 ; CHECK-NEXT: vcvtt.f16.f32 q1, q0
397 ; CHECK-NEXT: vstrb.8 q1, [r1], #16
398 ; CHECK-NEXT: le lr, .LBB9_1
399 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
400 ; CHECK-NEXT: pop {r7, pc}
402 br label %vector.body
404 vector.body: ; preds = %vector.body, %entry
405 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
406 %0 = getelementptr inbounds half, ptr %x, i32 %index
407 %wide.load = load <8 x half>, ptr %0, align 2
408 %1 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
409 %2 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
410 %3 = fpext <4 x half> %1 to <4 x float>
411 %4 = fpext <4 x half> %2 to <4 x float>
412 %5 = fmul <4 x float> %3, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
413 %6 = fmul <4 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
414 %7 = shufflevector <4 x float> %5, <4 x float> %6, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
415 %8 = fptrunc <8 x float> %7 to <8 x half>
416 %9 = getelementptr inbounds half, ptr %y, i32 %index
417 store <8 x half> %8, ptr %9, align 2
418 %index.next = add i32 %index, 8 ; element index advances by the vector width
419 %10 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
420 br i1 %10, label %for.cond.cleanup, label %vector.body
422 for.cond.cleanup: ; preds = %vector.body
; both_16_I: 16-lane variant of the explicitly interleaved round-trip.
; The loaded <16 x half> is split by shufflevector into even-indexed and
; odd-indexed lanes; each <8 x half> half is extended and multiplied by a
; splat constant separately, then the two <8 x float> results are
; re-interleaved (mask 0,8,1,9,...,7,15) before the fptrunc and store.
; The CHECK lines expect two 8-lane groups per iteration (vldrh.u16,
; vcvtb/vcvtt pairs in both directions, two vmul.f32 by r2), with the second
; group's load and store using pre-indexed writeback of #16 and lr = #128.
; NOTE(review): %index advances by 8 although each iteration loads and stores
; 16 halfs, so consecutive iterations overlap by 8 elements. The CHECK lines
; (128 iterations, 16-byte pointer steps) match the IR as written - confirm
; whether the step of 8 is intentional before changing it, and regenerate the
; assertions if it is altered.
426 define void @both_16_I(ptr nocapture readonly %x, ptr noalias nocapture %y) {
427 ; CHECK-LABEL: both_16_I:
428 ; CHECK: @ %bb.0: @ %entry
429 ; CHECK-NEXT: .save {r7, lr}
430 ; CHECK-NEXT: push {r7, lr}
431 ; CHECK-NEXT: mov.w lr, #128
432 ; CHECK-NEXT: movw r2, #26214
433 ; CHECK-NEXT: movt r2, #16390
434 ; CHECK-NEXT: .LBB10_1: @ %vector.body
435 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
436 ; CHECK-NEXT: vldrh.u16 q0, [r0]
437 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0
438 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0
439 ; CHECK-NEXT: vmul.f32 q1, q1, r2
440 ; CHECK-NEXT: vmul.f32 q0, q0, r2
441 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1
442 ; CHECK-NEXT: vcvtt.f16.f32 q1, q0
443 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]!
444 ; CHECK-NEXT: vstrh.16 q1, [r1]
445 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0
446 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0
447 ; CHECK-NEXT: vmul.f32 q1, q1, r2
448 ; CHECK-NEXT: vmul.f32 q0, q0, r2
449 ; CHECK-NEXT: vcvtb.f16.f32 q1, q1
450 ; CHECK-NEXT: vcvtt.f16.f32 q1, q0
451 ; CHECK-NEXT: vstrb.8 q1, [r1, #16]!
452 ; CHECK-NEXT: le lr, .LBB10_1
453 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
454 ; CHECK-NEXT: pop {r7, pc}
456 br label %vector.body
458 vector.body: ; preds = %vector.body, %entry
459 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
460 %0 = getelementptr inbounds half, ptr %x, i32 %index
461 %wide.load = load <16 x half>, ptr %0, align 2
462 %1 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
463 %2 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
464 %3 = fpext <8 x half> %1 to <8 x float>
465 %4 = fpext <8 x half> %2 to <8 x float>
466 %5 = fmul <8 x float> %3, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
467 %6 = fmul <8 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
468 %7 = shufflevector <8 x float> %5, <8 x float> %6, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
469 %8 = fptrunc <16 x float> %7 to <16 x half>
470 %9 = getelementptr inbounds half, ptr %y, i32 %index
471 store <16 x half> %8, ptr %9, align 2
472 %index.next = add i32 %index, 8 ; steps by 8 even though 16 halfs are accessed per iteration
473 %10 = icmp eq i32 %index.next, 1024 ; loop exits when the index reaches 1024
474 br i1 %10, label %for.cond.cleanup, label %vector.body
476 for.cond.cleanup: ; preds = %vector.body