; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; to_4: f32 -> f16 conversion loop, 4 lanes per iteration.
; Each iteration loads <4 x float> from %x, multiplies by a splat constant,
; truncates to <4 x half> and stores to %y; %index advances by 4 until it
; reaches 1024 (256 iterations, matching the expected `mov.w lr, #256`).
; The constant 0x4000CCCCC0000000 is the f32 bit pattern 0x40066666 (~2.1),
; which the expected assembly builds in r2 with movw #26214 / movt #16390.
define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: to_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fmul <4 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %3 = fptrunc <4 x float> %2 to <4 x half>
  %4 = getelementptr inbounds half, half* %y, i32 %index
  %5 = bitcast half* %4 to <4 x half>*
  store <4 x half> %3, <4 x half>* %5, align 2
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; to_8: f32 -> f16 conversion loop, 8 lanes per iteration.
; Each iteration loads <8 x float> from %x, multiplies by a splat constant
; (f32 0x40066666, ~2.1), truncates to <8 x half> and stores to %y; %index
; advances by 8 until it reaches 1024 (128 iterations). The expected assembly
; handles the 8 lanes as two 4-lane load/convert/store groups.
define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: to_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <8 x float>*
  %wide.load = load <8 x float>, <8 x float>* %1, align 4
  %2 = fmul <8 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %3 = fptrunc <8 x float> %2 to <8 x half>
  %4 = getelementptr inbounds half, half* %y, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 2
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; to_16: f32 -> f16 conversion loop, 16 lanes per iteration.
; Each iteration loads <16 x float> from %x, multiplies by a splat constant
; (f32 0x40066666, ~2.1), truncates to <16 x half> and stores to %y; %index
; advances by 16 until it reaches 1024 (64 iterations). The expected assembly
; handles the 16 lanes as four 4-lane load/convert/store groups.
define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: to_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #24]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #32
; CHECK-NEXT:    le lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <16 x float>*
  %wide.load = load <16 x float>, <16 x float>* %1, align 4
  %2 = fmul <16 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %3 = fptrunc <16 x float> %2 to <16 x half>
  %4 = getelementptr inbounds half, half* %y, i32 %index
  %5 = bitcast half* %4 to <16 x half>*
  store <16 x half> %3, <16 x half>* %5, align 2
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; from_4: f16 -> f32 conversion loop, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x, extends to <4 x float>, multiplies
; by a splat constant (f32 0x40066666, ~2.1) and stores to %y; %index advances
; by 4 until it reaches 1024 (256 iterations).
define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-LABEL: from_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <4 x half>*
  %wide.load = load <4 x half>, <4 x half>* %1, align 2
  %2 = fpext <4 x half> %wide.load to <4 x float>
  %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; from_8: f16 -> f32 conversion loop, 8 lanes per iteration.
; Each iteration loads <8 x half> from %x, extends to <8 x float>, multiplies
; by a splat constant (f32 0x40066666, ~2.1) and stores to %y; %index advances
; by 8 until it reaches 1024 (128 iterations). The expected assembly uses two
; widening 4-lane loads and two 4-lane f32 stores per iteration.
define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-LABEL: from_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
; CHECK-NEXT:    vldrh.u32 q1, [r0, #-8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1], #32
; CHECK-NEXT:    le lr, .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 2
  %2 = fpext <8 x half> %wide.load to <8 x float>
  %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <8 x float>*
  store <8 x float> %3, <8 x float>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; from_16: f16 -> f32 conversion loop, 16 lanes per iteration.
; Each iteration loads <16 x half> from %x, extends to <16 x float>,
; multiplies by a splat constant (f32 0x40066666, ~2.1) and stores to %y;
; %index advances by 16 until it reaches 1024 (64 iterations). The expected
; assembly uses four widening 4-lane loads and four 4-lane f32 stores.
define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-LABEL: from_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB5_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #32
; CHECK-NEXT:    vldrh.u32 q1, [r0, #-24]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #-16]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #-8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-NEXT:    vmul.f32 q2, q2, r2
; CHECK-NEXT:    vmul.f32 q3, q3, r2
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1], #64
; CHECK-NEXT:    le lr, .LBB5_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <16 x half>*
  %wide.load = load <16 x half>, <16 x half>* %1, align 2
  %2 = fpext <16 x half> %wide.load to <16 x float>
  %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <16 x float>*
  store <16 x float> %3, <16 x float>* %5, align 4
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; both_4: f16 -> f32 -> f16 round-trip loop, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x, extends to <4 x float>, multiplies
; by a splat constant (f32 0x40066666, ~2.1), truncates back to <4 x half>
; and stores to %y; %index advances by 4 until it reaches 1024
; (256 iterations).
define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <4 x half>*
  %wide.load = load <4 x half>, <4 x half>* %1, align 2
  %2 = fpext <4 x half> %wide.load to <4 x float>
  %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = fptrunc <4 x float> %3 to <4 x half>
  %5 = getelementptr inbounds half, half* %y, i32 %index
  %6 = bitcast half* %5 to <4 x half>*
  store <4 x half> %4, <4 x half>* %6, align 2
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, 1024
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; both_8: f16 -> f32 -> f16 round-trip loop, 8 lanes per iteration.
; Each iteration loads <8 x half> from %x, extends to <8 x float>, multiplies
; by a splat constant (f32 0x40066666, ~2.1), truncates back to <8 x half>
; and stores to %y; %index advances by 8 until it reaches 1024
; (128 iterations). The expected assembly converts a single 8 x f16 register
; with paired vcvtb/vcvtt instructions in each direction.
define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 2
  %2 = fpext <8 x half> %wide.load to <8 x float>
  %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = fptrunc <8 x float> %3 to <8 x half>
  %5 = getelementptr inbounds half, half* %y, i32 %index
  %6 = bitcast half* %5 to <8 x half>*
  store <8 x half> %4, <8 x half>* %6, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, 1024
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; both_16: f16 -> f32 -> f16 round-trip loop, 16 lanes per iteration.
; Each iteration loads <16 x half> from %x, extends to <16 x float>,
; multiplies by a splat constant (f32 0x40066666, ~2.1), truncates back to
; <16 x half> and stores to %y; %index advances by 16 until it reaches 1024
; (64 iterations). The expected assembly processes two 8 x f16 registers per
; iteration, each with paired vcvtb/vcvtt conversions.
define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrh.16 q1, [r1], #32
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <16 x half>*
  %wide.load = load <16 x half>, <16 x half>* %1, align 2
  %2 = fpext <16 x half> %wide.load to <16 x float>
  %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = fptrunc <16 x float> %3 to <16 x half>
  %5 = getelementptr inbounds half, half* %y, i32 %index
  %6 = bitcast half* %5 to <16 x half>*
  store <16 x half> %4, <16 x half>* %6, align 2
  %index.next = add i32 %index, 16
  %7 = icmp eq i32 %index.next, 1024
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; both_8_I: interleaved form of the 8-lane f16 -> f32 -> f16 round trip.
; Each iteration loads <8 x half> from %x, splits it with shufflevector into
; even lanes (0,2,4,6) and odd lanes (1,3,5,7), extends each half-vector to
; <4 x float>, multiplies both by a splat constant (f32 0x40066666, ~2.1),
; re-interleaves the two products into <8 x float>, truncates to <8 x half>
; and stores to %y. %index advances by 8 until it reaches 1024
; (128 iterations). The expected assembly is the same vcvtb/vcvtt sequence as
; the non-interleaved 8-lane case, with no separate shuffle instructions.
define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_8_I:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 2
  %2 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %4 = fpext <4 x half> %2 to <4 x float>
  %5 = fpext <4 x half> %3 to <4 x float>
  %6 = fmul <4 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %7 = fmul <4 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %8 = shufflevector <4 x float> %6, <4 x float> %7, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %9 = fptrunc <8 x float> %8 to <8 x half>
  %10 = getelementptr inbounds half, half* %y, i32 %index
  %11 = bitcast half* %10 to <8 x half>*
  store <8 x half> %9, <8 x half>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; both_16_I: interleaved form of the 16-lane f16 -> f32 -> f16 round trip.
; Each iteration loads <16 x half> from %x, splits it with shufflevector into
; even and odd lanes, extends each half-vector to <8 x float>, multiplies both
; by a splat constant (f32 0x40066666, ~2.1), re-interleaves the two products
; into <16 x float>, truncates to <16 x half> and stores to %y.
; NOTE(review): %index advances by 8 although each iteration reads and writes
; 16 halves, so consecutive iterations overlap. The expected assembly (128
; iterations, 16-byte pointer writeback) depends on this stride, so it is kept
; as-is here; confirm that the stride is intentional.
define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_16_I:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB10_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
; CHECK-NEXT:    vstrh.16 q1, [r1]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1, #16]!
; CHECK-NEXT:    le lr, .LBB10_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <16 x half>*
  %wide.load = load <16 x half>, <16 x half>* %1, align 2
  %2 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %3 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %4 = fpext <8 x half> %2 to <8 x float>
  %5 = fpext <8 x half> %3 to <8 x float>
  %6 = fmul <8 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %7 = fmul <8 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %8 = shufflevector <8 x float> %6, <8 x float> %7, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %9 = fptrunc <16 x float> %8 to <16 x half>
  %10 = getelementptr inbounds half, half* %y, i32 %index
  %11 = bitcast half* %10 to <16 x half>*
  store <16 x half> %9, <16 x half>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}