1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,fullfp16 < %s | FileCheck %s
3 ; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops depending on it.
; Module-level target description. The triple given here is the same one that
; is passed to llc through -mtriple on the command line at the top of the file.
6 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
7 target triple = "armv8.6a-arm-none-eabi"
; vcreate: reinterpret the 64-bit integer argument %a as a <4 x bfloat> vector
; with a plain bitcast. The generated assertions expect one move of the r0/r1
; pair into d0.
9 define arm_aapcs_vfpcc <4 x bfloat> @test_vcreate_bf16(i64 %a) {
10 ; CHECK-LABEL: test_vcreate_bf16:
11 ; CHECK: @ %bb.0: @ %entry
12 ; CHECK-NEXT: vmov d0, r0, r1
15 %0 = bitcast i64 %a to <4 x bfloat>
; vdup_n: splat the scalar bfloat %v across a <4 x bfloat> vector. The IR does
; this by inserting %v at lane 0 and then shuffling with an all-zero mask, so
; every result lane reads lane 0. The assertions expect a 16-bit vdup from
; d0[0] into d0.
19 define arm_aapcs_vfpcc <4 x bfloat> @test_vdup_n_bf16(bfloat %v) {
20 ; CHECK-LABEL: test_vdup_n_bf16:
21 ; CHECK: @ %bb.0: @ %entry
22 ; CHECK-NEXT: @ kill: def $s0 killed $s0 def $d0
23 ; CHECK-NEXT: vdup.16 d0, d0[0]
26 %vecinit.i = insertelement <4 x bfloat> undef, bfloat %v, i32 0
27 %vecinit3.i = shufflevector <4 x bfloat> %vecinit.i, <4 x bfloat> undef, <4 x i32> zeroinitializer
28 ret <4 x bfloat> %vecinit3.i
; vdupq_n: 8-lane variant of the scalar splat. %v is inserted at lane 0 of an
; <8 x bfloat> and broadcast with an all-zero shuffle mask. The assertions
; expect a 16-bit vdup from d0[0] into the full q0 register.
31 define arm_aapcs_vfpcc <8 x bfloat> @test_vdupq_n_bf16(bfloat %v) {
32 ; CHECK-LABEL: test_vdupq_n_bf16:
33 ; CHECK: @ %bb.0: @ %entry
34 ; CHECK-NEXT: @ kill: def $s0 killed $s0 def $d0
35 ; CHECK-NEXT: vdup.16 q0, d0[0]
38 %vecinit.i = insertelement <8 x bfloat> undef, bfloat %v, i32 0
39 %vecinit7.i = shufflevector <8 x bfloat> %vecinit.i, <8 x bfloat> undef, <8 x i32> zeroinitializer
40 ret <8 x bfloat> %vecinit7.i
; vdup_lane: broadcast lane 1 of the <4 x bfloat> input to all four result
; lanes (shuffle mask is <1, 1, 1, 1>). The assertions expect a 16-bit vdup
; from d0[1] into d0.
43 define arm_aapcs_vfpcc <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %v) {
44 ; CHECK-LABEL: test_vdup_lane_bf16:
45 ; CHECK: @ %bb.0: @ %entry
46 ; CHECK-NEXT: vdup.16 d0, d0[1]
49 %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
50 ret <4 x bfloat> %lane
; vdupq_lane: broadcast lane 1 of a <4 x bfloat> input into an <8 x bfloat>
; result (the shuffle widens 4 lanes to 8, every mask entry is 1). The
; assertions expect a 16-bit vdup from d0[1] into q0.
53 define arm_aapcs_vfpcc <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %v) {
54 ; CHECK-LABEL: test_vdupq_lane_bf16:
55 ; CHECK: @ %bb.0: @ %entry
56 ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0
57 ; CHECK-NEXT: vdup.16 q0, d0[1]
60 %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
61 ret <8 x bfloat> %lane
; vdup_laneq: broadcast lane 7 of an <8 x bfloat> input into a <4 x bfloat>
; result (the shuffle narrows 8 lanes to 4, every mask entry is 7). In the
; expected output, element 7 of the input is addressed as d1[3].
64 define arm_aapcs_vfpcc <4 x bfloat> @test_vdup_laneq_bf16(<8 x bfloat> %v) {
65 ; CHECK-LABEL: test_vdup_laneq_bf16:
66 ; CHECK: @ %bb.0: @ %entry
67 ; CHECK-NEXT: vdup.16 d0, d1[3]
70 %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
71 ret <4 x bfloat> %lane
; vdupq_laneq: broadcast lane 7 of an <8 x bfloat> input to all eight result
; lanes. As in the 4-lane-result variant, the expected output addresses
; element 7 as d1[3]; here the destination is the full q0 register.
74 define arm_aapcs_vfpcc <8 x bfloat> @test_vdupq_laneq_bf16(<8 x bfloat> %v) {
75 ; CHECK-LABEL: test_vdupq_laneq_bf16:
76 ; CHECK: @ %bb.0: @ %entry
77 ; CHECK-NEXT: vdup.16 q0, d1[3]
80 %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
81 ret <8 x bfloat> %lane
; vcombine: concatenate two <4 x bfloat> vectors into one <8 x bfloat> using
; an identity shuffle mask <0..7>.
; NOTE(review): the shuffle's first operand is %high and its second is %low,
; so %high supplies result lanes 0-3 and %low supplies lanes 4-7. The expected
; output (d1 -> d16, d0 -> d17, then q8 -> q0) is consistent with that swap,
; presuming %low arrives in d0 and %high in d1. Confirm the reversed operand
; order is intentional before regenerating the assertions.
84 define arm_aapcs_vfpcc <8 x bfloat> @test_vcombine_bf16(<4 x bfloat> %low, <4 x bfloat> %high) {
85 ; CHECK-LABEL: test_vcombine_bf16:
86 ; CHECK: @ %bb.0: @ %entry
87 ; CHECK-NEXT: vmov.f64 d16, d1
88 ; CHECK-NEXT: vorr d17, d0, d0
89 ; CHECK-NEXT: vorr q0, q8, q8
92 %shuffle.i = shufflevector <4 x bfloat> %high, <4 x bfloat> %low, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
93 ret <8 x bfloat> %shuffle.i
; vget_high: take lanes 4-7 of the <8 x bfloat> input as a <4 x bfloat>.
; The assertions expect a single 64-bit move from d1 into d0.
96 define arm_aapcs_vfpcc <4 x bfloat> @test_vget_high_bf16(<8 x bfloat> %a) {
97 ; CHECK-LABEL: test_vget_high_bf16:
98 ; CHECK: @ %bb.0: @ %entry
99 ; CHECK-NEXT: vmov.f64 d0, d1
102 %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
103 ret <4 x bfloat> %shuffle.i
; vget_low: take lanes 0-3 of the <8 x bfloat> input as a <4 x bfloat>.
; The only assertion visible after the block header is a register "kill"
; annotation on d0/q0; no data-moving instruction is listed here.
106 define arm_aapcs_vfpcc <4 x bfloat> @test_vget_low_bf16(<8 x bfloat> %a) {
107 ; CHECK-LABEL: test_vget_low_bf16:
108 ; CHECK: @ %bb.0: @ %entry
109 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0
112 %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
113 ret <4 x bfloat> %shuffle.i
; vgetq_lane, even index: extract element 6 of an <8 x bfloat>. For this
; even-numbered lane the assertions expect a plain single-register move from
; s3 into s0.
116 define arm_aapcs_vfpcc bfloat @test_vgetq_lane_bf16_even(<8 x bfloat> %v) {
117 ; CHECK-LABEL: test_vgetq_lane_bf16_even:
118 ; CHECK: @ %bb.0: @ %entry
119 ; CHECK-NEXT: vmov.f32 s0, s3
122 %0 = extractelement <8 x bfloat> %v, i32 6
; vgetq_lane, odd index: extract element 7 of an <8 x bfloat>. For this
; odd-numbered lane the assertions expect vmovx.f16 from s3 into s0 instead of
; the plain move used for even lanes.
126 define arm_aapcs_vfpcc bfloat @test_vgetq_lane_bf16_odd(<8 x bfloat> %v) {
127 ; CHECK-LABEL: test_vgetq_lane_bf16_odd:
128 ; CHECK: @ %bb.0: @ %entry
129 ; CHECK-NEXT: vmovx.f16 s0, s3
132 %0 = extractelement <8 x bfloat> %v, i32 7
; vget_lane, even index: extract element 2 of a <4 x bfloat>. The assertions
; expect a plain single-register move from s1 into s0.
136 define arm_aapcs_vfpcc bfloat @test_vget_lane_bf16_even(<4 x bfloat> %v) {
137 ; CHECK-LABEL: test_vget_lane_bf16_even:
138 ; CHECK: @ %bb.0: @ %entry
139 ; CHECK-NEXT: vmov.f32 s0, s1
142 %0 = extractelement <4 x bfloat> %v, i32 2
; vget_lane, odd index: extract element 1 of a <4 x bfloat>. The assertions
; expect vmovx.f16 with s0 as both source and destination.
146 define arm_aapcs_vfpcc bfloat @test_vget_lane_bf16_odd(<4 x bfloat> %v) {
147 ; CHECK-LABEL: test_vget_lane_bf16_odd:
148 ; CHECK: @ %bb.0: @ %entry
149 ; CHECK-NEXT: vmovx.f16 s0, s0
152 %0 = extractelement <4 x bfloat> %v, i32 1
; vset_lane: insert the scalar bfloat %a into lane 1 of the <4 x bfloat> %v.
; Expected sequence: move the scalar from s0 to core register r0, write it
; into d1[1] as a 16-bit lane, then copy d1 into d0.
156 define arm_aapcs_vfpcc <4 x bfloat> @test_vset_lane_bf16(bfloat %a, <4 x bfloat> %v) {
157 ; CHECK-LABEL: test_vset_lane_bf16:
158 ; CHECK: @ %bb.0: @ %entry
159 ; CHECK-NEXT: vmov r0, s0
160 ; CHECK-NEXT: vmov.16 d1[1], r0
161 ; CHECK-NEXT: vorr d0, d1, d1
164 %0 = insertelement <4 x bfloat> %v, bfloat %a, i32 1
; vsetq_lane: insert the scalar bfloat %a into lane 7 of the <8 x bfloat> %v.
; Expected sequence: move the scalar from s0 to core register r0, write it
; into d3[3] as a 16-bit lane, then copy q1 into q0.
168 define arm_aapcs_vfpcc <8 x bfloat> @test_vsetq_lane_bf16(bfloat %a, <8 x bfloat> %v) {
169 ; CHECK-LABEL: test_vsetq_lane_bf16:
170 ; CHECK: @ %bb.0: @ %entry
171 ; CHECK-NEXT: vmov r0, s0
172 ; CHECK-NEXT: vmov.16 d3[3], r0
173 ; CHECK-NEXT: vorr q0, q1, q1
176 %0 = insertelement <8 x bfloat> %v, bfloat %a, i32 7