; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple armeb-eabi -mattr=armv8.2-a,neon,fullfp16 -target-abi=aapcs-gnu -float-abi hard -o - %s | FileCheck %s
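; On big-endian targets, bitcasts between fp16 vectors and other types must
; preserve memory lane order, so the compiler is expected to emit lane
; reversals (vrev64/vrev32/vrev16) around each element-wise operation below.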
;64 bit conversions to v4f16
define void @conv_i64_to_v4f16( i64 %val, <4 x half>* %store ) {
; CHECK-LABEL: conv_i64_to_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov d16, r1, r0
; CHECK-NEXT:    vldr d17, [r2]
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vrev64.16 d17, d17
; CHECK-NEXT:    vadd.f16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r2]
; CHECK-NEXT:    bx lr
entry:
  %v = bitcast i64 %val to <4 x half>
  %w = load <4 x half>, <4 x half>* %store
  %a = fadd <4 x half> %v, %w
  store <4 x half> %a, <4 x half>* %store
  ret void
}

define void @conv_f64_to_v4f16( double %val, <4 x half>* %store ) {
; CHECK-LABEL: conv_f64_to_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
entry:
  %v = bitcast double %val to <4 x half>
  %w = load <4 x half>, <4 x half>* %store
  %a = fadd <4 x half> %v, %w
  store <4 x half> %a, <4 x half>* %store
  ret void
}

define void @conv_v2f32_to_v4f16( <2 x float> %a, <4 x half>* %store ) {
; CHECK-LABEL: conv_v2f32_to_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI2_0
; CHECK-NEXT:    vrev64.32 d17, d0
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vadd.f32 d16, d17, d16
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vrev64.16 d17, d17
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI2_0:
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
entry:
  %c = fadd <2 x float> %a, <float -1.0, float 1.0>
  %v = bitcast <2 x float> %c to <4 x half>
  %w = load <4 x half>, <4 x half>* %store
  %z = fadd <4 x half> %v, %w
  store <4 x half> %z, <4 x half>* %store
  ret void
}

define void @conv_v2i32_to_v4f16( <2 x i32> %a, <4 x half>* %store ) {
; CHECK-LABEL: conv_v2i32_to_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI3_0
; CHECK-NEXT:    vrev64.32 d17, d0
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vadd.i32 d16, d17, d16
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vrev64.16 d17, d18
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI3_0:
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
entry:
  %c = add <2 x i32> %a, <i32 1, i32 -1>
  %v = bitcast <2 x i32> %c to <4 x half>
  %w = load <4 x half>, <4 x half>* %store
  %z = fadd <4 x half> %v, %w
  store <4 x half> %z, <4 x half>* %store
  ret void
}

define void @conv_v4i16_to_v4f16( <4 x i16> %a, <4 x half>* %store ) {
; CHECK-LABEL: conv_v4i16_to_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 d16, #0xffffffff0000
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vrev64.16 d18, d0
; CHECK-NEXT:    vrev64.16 d17, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.i16 d16, d18, d16
; CHECK-NEXT:    vadd.f16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
entry:
  %c = add <4 x i16> %a, <i16 -1, i16 0, i16 0, i16 -1>
  %v = bitcast <4 x i16> %c to <4 x half>
  %w = load <4 x half>, <4 x half>* %store
  %z = fadd <4 x half> %v, %w
  store <4 x half> %z, <4 x half>* %store
  ret void
}

define void @conv_v8i8_to_v4f16( <8 x i8> %a, <4 x half>* %store ) {
; CHECK-LABEL: conv_v8i8_to_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i8 d16, #0x1
; CHECK-NEXT:    vrev64.8 d17, d0
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vadd.i8 d16, d17, d16
; CHECK-NEXT:    vrev64.16 d17, d18
; CHECK-NEXT:    vrev16.8 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
entry:
  %c = add <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %v = bitcast <8 x i8> %c to <4 x half>
  %w = load <4 x half>, <4 x half>* %store
  %z = fadd <4 x half> %v, %w
  store <4 x half> %z, <4 x half>* %store
  ret void
}
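;128 bit conversions to v8f16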
define void @conv_v2i64_to_v8f16( <2 x i64> %val, <8 x half>* %store ) {
; CHECK-LABEL: conv_v2i64_to_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    adr r1, .LCPI6_0
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vadd.i64 q9, q0, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vrev64.16 q9, q9
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI6_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
entry:
  %v = add <2 x i64> %val, <i64 1, i64 -1>
  %v1 = bitcast <2 x i64> %v to <8 x half>
  %w = load <8 x half>, <8 x half>* %store
  %a = fadd <8 x half> %v1, %w
  store <8 x half> %a, <8 x half>* %store
  ret void
}

define void @conv_v2f64_to_v8f16( <2 x double> %val, <8 x half>* %store ) {
; CHECK-LABEL: conv_v2f64_to_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f64 d16, #-1.000000e+00
; CHECK-NEXT:    vmov.f64 d17, #1.000000e+00
; CHECK-NEXT:    vadd.f64 d19, d1, d16
; CHECK-NEXT:    vadd.f64 d18, d0, d17
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vrev64.16 q9, q9
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %v = fadd <2 x double> %val, <double 1.0, double -1.0>
  %v1 = bitcast <2 x double> %v to <8 x half>
  %w = load <8 x half>, <8 x half>* %store
  %a = fadd <8 x half> %v1, %w
  store <8 x half> %a, <8 x half>* %store
  ret void
}

define void @conv_v4f32_to_v8f16( <4 x float> %a, <8 x half>* %store ) {
; CHECK-LABEL: conv_v4f32_to_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI8_0
; CHECK-NEXT:    vrev64.32 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vrev64.16 q9, q9
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q8, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI8_0:
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
entry:
  %c = fadd <4 x float> %a, <float -1.0, float 1.0, float -1.0, float 1.0>
  %v = bitcast <4 x float> %c to <8 x half>
  %w = load <8 x half>, <8 x half>* %store
  %z = fadd <8 x half> %v, %w
  store <8 x half> %z, <8 x half>* %store
  ret void
}

define void @conv_v4i32_to_v8f16( <4 x i32> %a, <8 x half>* %store ) {
; CHECK-LABEL: conv_v4i32_to_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI9_0
; CHECK-NEXT:    vrev64.32 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vrev64.16 q9, q10
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q8, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI9_0:
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
entry:
  %c = add <4 x i32> %a, <i32 -1, i32 1, i32 -1, i32 1>
  %v = bitcast <4 x i32> %c to <8 x half>
  %w = load <8 x half>, <8 x half>* %store
  %z = fadd <8 x half> %v, %w
  store <8 x half> %z, <8 x half>* %store
  ret void
}

define void @conv_v8i16_to_v8f16( <8 x i16> %a, <8 x half>* %store ) {
; CHECK-LABEL: conv_v8i16_to_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI10_0
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q10, q0
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vrev64.16 q9, q9
; CHECK-NEXT:    vadd.i16 q8, q10, q8
; CHECK-NEXT:    vadd.f16 q8, q8, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI10_0:
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
entry:
  %c = add <8 x i16> %a, <i16 -1, i16 1, i16 0, i16 7, i16 -1, i16 1, i16 0, i16 7>
  %v = bitcast <8 x i16> %c to <8 x half>
  %w = load <8 x half>, <8 x half>* %store
  %z = fadd <8 x half> %v, %w
  store <8 x half> %z, <8 x half>* %store
  ret void
}

define void @conv_v16i8_to_v8f16( <16 x i8> %a, <8 x half>* %store ) {
; CHECK-LABEL: conv_v16i8_to_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrev64.8 q8, q0
; CHECK-NEXT:    vmov.i8 q9, #0x1
; CHECK-NEXT:    vadd.i8 q8, q8, q9
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vrev64.16 q9, q10
; CHECK-NEXT:    vrev16.8 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q8, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %c = add <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %v = bitcast <16 x i8> %c to <8 x half>
  %w = load <8 x half>, <8 x half>* %store
  %z = fadd <8 x half> %v, %w
  store <8 x half> %z, <8 x half>* %store
  ret void
}
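;64 bit conversions from v4f16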
define void @conv_v4f16_to_i64( <4 x half> %a, i64* %store ) {
; CHECK-LABEL: conv_v4f16_to_i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI12_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vmov r1, r2, d16
; CHECK-NEXT:    subs r1, r1, #1
; CHECK-NEXT:    sbc r2, r2, #0
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    str r1, [r0, #4]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI12_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to i64
  %w = add i64 %y, -1
  store i64 %w, i64* %store
  ret void
}
define void @conv_v4f16_to_f64( <4 x half> %a, double* %store ) {
; CHECK-LABEL: conv_v4f16_to_f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI13_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vmov.f64 d17, #-1.000000e+00
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f64 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI13_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to double
  %w = fadd double %y, -1.0
  store double %w, double* %store
  ret void
}

define void @conv_v4f16_to_v2i32( <4 x half> %a, <2 x i32>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI14_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vldr d17, .LCPI14_1
; CHECK-NEXT:    vrev64.32 d17, d17
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI14_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI14_1:
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <2 x i32>
  %w = add <2 x i32> %y, <i32 -1, i32 1>
  store <2 x i32> %w, <2 x i32>* %store
  ret void
}

define void @conv_v4f16_to_v2f32( <4 x half> %a, <2 x float>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI15_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vldr d17, .LCPI15_1
; CHECK-NEXT:    vrev64.32 d17, d17
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI15_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI15_1:
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <2 x float>
  %w = fadd <2 x float> %y, <float -1.0, float 1.0>
  store <2 x float> %w, <2 x float>* %store
  ret void
}

define void @conv_v4f16_to_v4i16( <4 x half> %a, <4 x i16>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI16_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vldr d17, .LCPI16_1
; CHECK-NEXT:    vrev64.16 d17, d17
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI16_1:
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <4 x i16>
  %w = add <4 x i16> %y, <i16 -1, i16 1, i16 0, i16 7>
  store <4 x i16> %w, <4 x i16>* %store
  ret void
}

define void @conv_v4f16_to_v8f8( <4 x half> %a, <8 x i8>* %store ) {
; CHECK-LABEL: conv_v4f16_to_v8f8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr d16, .LCPI17_0
; CHECK-NEXT:    vrev64.16 d17, d0
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vadd.f16 d16, d17, d16
; CHECK-NEXT:    vmov.i8 d17, #0x1
; CHECK-NEXT:    vrev16.8 d16, d16
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vrev64.8 d16, d16
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI17_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <4 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <4 x half> %z to <8 x i8>
  %w = add <8 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <8 x i8> %w, <8 x i8>* %store
  ret void
}
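;128 bit conversions from v8f16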
define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) {
; CHECK-LABEL: conv_v8f16_to_i128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI18_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vmov.32 r12, d17[1]
; CHECK-NEXT:    vmov.32 r2, d17[0]
; CHECK-NEXT:    vmov.32 r3, d16[1]
; CHECK-NEXT:    vmov.32 r1, d16[0]
; CHECK-NEXT:    subs r12, r12, #1
; CHECK-NEXT:    sbcs r2, r2, #0
; CHECK-NEXT:    sbcs r3, r3, #0
; CHECK-NEXT:    sbc r1, r1, #0
; CHECK-NEXT:    stm r0, {r1, r3}
; CHECK-NEXT:    str r2, [r0, #8]
; CHECK-NEXT:    str r12, [r0, #12]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI18_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to i128
  %w = add i128 %y, -1
  store i128 %w, i128* %store
  ret void
}
define void @conv_v8f16_to_v2f64( <8 x half> %a, <2 x double>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI19_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vmov.f64 d18, #1.000000e+00
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vmov.f64 d19, #-1.000000e+00
; CHECK-NEXT:    vadd.f64 d21, d17, d18
; CHECK-NEXT:    vadd.f64 d20, d16, d19
; CHECK-NEXT:    vst1.64 {d20, d21}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI19_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <2 x double>
  %w = fadd <2 x double> %y, <double -1.0, double 1.0>
  store <2 x double> %w, <2 x double>* %store
  ret void
}

define void @conv_v8f16_to_v4i32( <8 x half> %a, <4 x i32>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI20_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    adr r1, .LCPI20_1
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vrev64.32 q9, q9
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI20_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI20_1:
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 1 @ 0x1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <4 x i32>
  %w = add <4 x i32> %y, <i32 -1, i32 1, i32 -1, i32 1>
  store <4 x i32> %w, <4 x i32>* %store
  ret void
}

define void @conv_v8f16_to_v4f32( <8 x half> %a, <4 x float>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI21_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    adr r1, .LCPI21_1
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vrev64.32 q9, q9
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI21_1:
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
; CHECK-NEXT:    .long 3212836864 @ float -1
; CHECK-NEXT:    .long 1065353216 @ float 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <4 x float>
  %w = fadd <4 x float> %y, <float -1.0, float 1.0, float -1.0, float 1.0>
  store <4 x float> %w, <4 x float>* %store
  ret void
}

define void @conv_v8f16_to_v8i16( <8 x half> %a, <8 x i16>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI22_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    adr r1, .LCPI22_1
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1:128]
; CHECK-NEXT:    vrev64.16 q9, q9
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:  .LCPI22_1:
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
; CHECK-NEXT:    .short 65535 @ 0xffff
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 7 @ 0x7
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <8 x i16>
  %w = add <8 x i16> %y, <i16 -1, i16 1, i16 0, i16 7, i16 -1, i16 1, i16 0, i16 7>
  store <8 x i16> %w, <8 x i16>* %store
  ret void
}

define void @conv_v8f16_to_v8f8( <8 x half> %a, <16 x i8>* %store ) {
; CHECK-LABEL: conv_v8f16_to_v8f8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI23_0
; CHECK-NEXT:    vrev64.16 q9, q0
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vadd.f16 q8, q9, q8
; CHECK-NEXT:    vmov.i8 q9, #0x1
; CHECK-NEXT:    vrev16.8 q8, q8
; CHECK-NEXT:    vadd.i8 q8, q8, q9
; CHECK-NEXT:    vrev64.8 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
; CHECK-NEXT:    .short 48128 @ half -1
; CHECK-NEXT:    .short 15360 @ half 1
entry:
  %z = fadd <8 x half> %a, <half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0, half -1.0, half 1.0>
  %y = bitcast <8 x half> %z to <16 x i8>
  %w = add <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <16 x i8> %w, <16 x i8>* %store
  ret void
}