Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / arm-bf16-convert-intrinsics.c
blob f50eaf371028cde8e0d6d5cd8dddc85213852341
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // RUN: %clang_cc1 \
3 // RUN: -triple aarch64 -target-feature +neon -target-feature +bf16 \
4 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
5 // RUN: | opt -S -passes=mem2reg \
6 // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A64 %s
7 // RUN: %clang_cc1 \
8 // RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \
9 // RUN: -target-feature +bf16 -mfloat-abi hard \
10 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
11 // RUN: | opt -S -passes=mem2reg \
12 // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-HARDFP %s
13 // RUN: %clang_cc1 \
14 // RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \
15 // RUN: -target-feature +bf16 -mfloat-abi softfp \
16 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
17 // RUN: | opt -S -passes=mem2reg \
18 // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-SOFTFP %s
20 // REQUIRES: arm-registered-target
21 // REQUIRES: aarch64-registered-target
23 #include <arm_neon.h>
25 // CHECK-A64-LABEL: @test_vcvt_f32_bf16(
26 // CHECK-A64-NEXT: entry:
27 // CHECK-A64-NEXT: [[__REINT_150_I:%.*]] = alloca <4 x bfloat>, align 8
28 // CHECK-A64-NEXT: [[__REINT1_150_I:%.*]] = alloca <4 x i32>, align 16
29 // CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_150_I]], align 8
30 // CHECK-A64-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I]], align 8
31 // CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
32 // CHECK-A64-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
33 // CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
34 // CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_150_I]], align 16
35 // CHECK-A64-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I]], align 16
36 // CHECK-A64-NEXT: ret <4 x float> [[TMP5]]
38 // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
39 // CHECK-A32-HARDFP-NEXT: entry:
40 // CHECK-A32-HARDFP-NEXT: [[__REINT_150_I:%.*]] = alloca <4 x bfloat>, align 8
41 // CHECK-A32-HARDFP-NEXT: [[__REINT1_150_I:%.*]] = alloca <4 x i32>, align 8
42 // CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_150_I]], align 8
43 // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I]], align 8
44 // CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
45 // CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
46 // CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
47 // CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_150_I]], align 8
48 // CHECK-A32-HARDFP-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I]], align 8
49 // CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP5]]
51 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
52 // CHECK-A32-SOFTFP-NEXT: entry:
53 // CHECK-A32-SOFTFP-NEXT: [[__P0_150_I:%.*]] = alloca <4 x bfloat>, align 8
54 // CHECK-A32-SOFTFP-NEXT: [[__REINT_150_I:%.*]] = alloca <4 x bfloat>, align 8
55 // CHECK-A32-SOFTFP-NEXT: [[__REINT1_150_I:%.*]] = alloca <4 x i32>, align 8
56 // CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <4 x bfloat>, align 8
57 // CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
58 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
59 // CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8
60 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8
61 // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
62 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[__P0_150_I]], align 8
63 // CHECK-A32-SOFTFP-NEXT: [[__P0_1501_I:%.*]] = load <4 x bfloat>, ptr [[__P0_150_I]], align 8
64 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_1501_I]], ptr [[__REINT_150_I]], align 8
65 // CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_150_I]], align 8
66 // CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
67 // CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
68 // CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP7]], <i32 16, i32 16, i32 16, i32 16>
69 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_150_I]], align 8
70 // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[__REINT1_150_I]], align 8
71 // CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP9]]
// Exercises vcvt_f32_bf16: passes the bfloat16x4_t argument straight to the
// intrinsic and returns its float32x4_t result. The assertions above expect
// the input to be reloaded as <4 x i16>, sign-extended to <4 x i32>, shifted
// left by 16 and reloaded as <4 x float> on every target configuration.
73 float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
74 return vcvt_f32_bf16(a);
77 // CHECK-A64-LABEL: @test_vcvtq_low_f32_bf16(
78 // CHECK-A64-NEXT: entry:
79 // CHECK-A64-NEXT: [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
80 // CHECK-A64-NEXT: [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 16
81 // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
82 // CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
83 // CHECK-A64-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
84 // CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
85 // CHECK-A64-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
86 // CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
87 // CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 16
88 // CHECK-A64-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 16
89 // CHECK-A64-NEXT: ret <4 x float> [[TMP5]]
91 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
92 // CHECK-A32-HARDFP-NEXT: entry:
93 // CHECK-A32-HARDFP-NEXT: [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
94 // CHECK-A32-HARDFP-NEXT: [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
95 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
96 // CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
97 // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
98 // CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
99 // CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
100 // CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
101 // CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
102 // CHECK-A32-HARDFP-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
103 // CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP5]]
105 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
106 // CHECK-A32-SOFTFP-NEXT: entry:
107 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
108 // CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
109 // CHECK-A32-SOFTFP-NEXT: [[__P0_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
110 // CHECK-A32-SOFTFP-NEXT: [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
111 // CHECK-A32-SOFTFP-NEXT: [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
112 // CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
113 // CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
114 // CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
115 // CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8
116 // CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8
117 // CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
118 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
119 // CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
120 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
121 // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
122 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP2]], ptr [[__P0_I]], align 8
123 // CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
124 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
125 // CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
126 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP5]], ptr [[__P0_I2]], align 8
127 // CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
128 // CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
129 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
130 // CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
131 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP8]], ptr [[COERCE2_I]], align 8
132 // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
133 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP10]], ptr [[COERCE3_I]], align 8
134 // CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
135 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP12]], ptr [[__P0_150_I_I]], align 8
136 // CHECK-A32-SOFTFP-NEXT: [[__P0_1501_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_150_I_I]], align 8
137 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_1501_I_I]], ptr [[__REINT_150_I_I]], align 8
138 // CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
139 // CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to <8 x i8>
140 // CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[TMP15]] to <4 x i32>
141 // CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP17]], <i32 16, i32 16, i32 16, i32 16>
142 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
143 // CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
144 // CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP19]]
// Exercises vcvtq_low_f32_bf16: passes the bfloat16x8_t argument straight to
// the intrinsic and returns its float32x4_t result. The assertions above
// expect a shufflevector selecting lanes 0-3, followed by a sign-extend to
// <4 x i32> and a left shift by 16 before the value is reloaded as <4 x float>.
146 float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
147 return vcvtq_low_f32_bf16(a);
150 // CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16(
151 // CHECK-A64-NEXT: entry:
152 // CHECK-A64-NEXT: [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
153 // CHECK-A64-NEXT: [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 16
154 // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
155 // CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
156 // CHECK-A64-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
157 // CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
158 // CHECK-A64-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
159 // CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
160 // CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 16
161 // CHECK-A64-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 16
162 // CHECK-A64-NEXT: ret <4 x float> [[TMP5]]
164 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
165 // CHECK-A32-HARDFP-NEXT: entry:
166 // CHECK-A32-HARDFP-NEXT: [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
167 // CHECK-A32-HARDFP-NEXT: [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
168 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
169 // CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
170 // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
171 // CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
172 // CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
173 // CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
174 // CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
175 // CHECK-A32-HARDFP-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
176 // CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP5]]
178 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
179 // CHECK-A32-SOFTFP-NEXT: entry:
180 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
181 // CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
182 // CHECK-A32-SOFTFP-NEXT: [[__P0_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
183 // CHECK-A32-SOFTFP-NEXT: [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
184 // CHECK-A32-SOFTFP-NEXT: [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
185 // CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
186 // CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
187 // CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
188 // CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8
189 // CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8
190 // CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
191 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
192 // CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
193 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
194 // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
195 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP2]], ptr [[__P0_I]], align 8
196 // CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
197 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
198 // CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
199 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP5]], ptr [[__P0_I2]], align 8
200 // CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
201 // CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
202 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
203 // CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
204 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP8]], ptr [[COERCE2_I]], align 8
205 // CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
206 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP10]], ptr [[COERCE3_I]], align 8
207 // CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
208 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP12]], ptr [[__P0_150_I_I]], align 8
209 // CHECK-A32-SOFTFP-NEXT: [[__P0_1501_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_150_I_I]], align 8
210 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_1501_I_I]], ptr [[__REINT_150_I_I]], align 8
211 // CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
212 // CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to <8 x i8>
213 // CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[TMP15]] to <4 x i32>
214 // CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP17]], <i32 16, i32 16, i32 16, i32 16>
215 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
216 // CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
217 // CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP19]]
// Exercises vcvtq_high_f32_bf16: passes the bfloat16x8_t argument straight to
// the intrinsic and returns its float32x4_t result. The assertions above
// expect a shufflevector selecting lanes 4-7, followed by a sign-extend to
// <4 x i32> and a left shift by 16 before the value is reloaded as <4 x float>.
219 float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
220 return vcvtq_high_f32_bf16(a);
223 // CHECK-A64-LABEL: @test_vcvt_bf16_f32(
224 // CHECK-A64-NEXT: entry:
225 // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
226 // CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
227 // CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
228 // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
229 // CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
231 // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
232 // CHECK-A32-HARDFP-NEXT: entry:
233 // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
234 // CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
235 // CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
237 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
238 // CHECK-A32-SOFTFP-NEXT: entry:
239 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8
240 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
241 // CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
242 // CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <4 x bfloat>, align 8
243 // CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
244 // CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
245 // CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
246 // CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
247 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
248 // CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
249 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
250 // CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
251 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], ptr [[RETVAL_I]], align 8
252 // CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
253 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP7]], ptr [[COERCE]], align 8
254 // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <4 x bfloat>, ptr [[COERCE]], align 8
255 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP9]], ptr [[RETVAL]], align 8
256 // CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
257 // CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP11]]
// Exercises vcvt_bf16_f32: passes the float32x4_t argument straight to the
// intrinsic and returns its bfloat16x4_t result. Per the assertions above,
// A64 expects a call to @llvm.aarch64.neon.bfcvtn followed by a shufflevector
// of lanes 0-3, while both A32 configurations expect a call to
// @llvm.arm.neon.vcvtfp2bf (the softfp variant returns the value as <2 x i32>).
259 bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
260 return vcvt_bf16_f32(a);
263 // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
264 // CHECK-A64-NEXT: entry:
265 // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
266 // CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
267 // CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
268 // CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]]
270 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
271 // CHECK-A32-HARDFP-NEXT: entry:
272 // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
273 // CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
274 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
275 // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
277 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
278 // CHECK-A32-SOFTFP-NEXT: entry:
279 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I4:%.*]] = alloca <8 x bfloat>, align 8
280 // CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <4 x bfloat>, align 8
281 // CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8
282 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8
283 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8
284 // CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
285 // CHECK-A32-SOFTFP-NEXT: [[COERCE1_I:%.*]] = alloca <4 x bfloat>, align 8
286 // CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
287 // CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <8 x bfloat>, align 8
288 // CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
289 // CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
290 // CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
291 // CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
292 // CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
293 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
294 // CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
295 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
296 // CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
297 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> zeroinitializer, ptr [[COERCE1_I]], align 8
298 // CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[COERCE1_I]], align 8
299 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], ptr [[COERCE2_I]], align 8
300 // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[COERCE2_I]], align 8
301 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP7]], ptr [[__P0_I]], align 8
302 // CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <4 x bfloat>, ptr [[__P0_I]], align 8
303 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP9]], ptr [[__P1_I]], align 8
304 // CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
305 // CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[__P01_I]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
306 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I4]], align 8
307 // CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr [[RETVAL_I4]], align 8
308 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP13]], ptr [[COERCE4_I]], align 8
309 // CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[COERCE4_I]], align 8
310 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP15]], ptr [[RETVAL_I]], align 8
311 // CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
312 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP17]], ptr [[COERCE]], align 8
313 // CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = load <8 x bfloat>, ptr [[COERCE]], align 8
314 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP19]], ptr [[RETVAL]], align 8
315 // CHECK-A32-SOFTFP-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
316 // CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP21]]
// Exercises vcvtq_low_bf16_f32: passes the float32x4_t argument straight to
// the intrinsic and returns its bfloat16x8_t result. Per the assertions above,
// A64 expects the @llvm.aarch64.neon.bfcvtn result to be returned directly;
// A32 hard-float expects @llvm.arm.neon.vcvtfp2bf followed by a shufflevector
// whose first operand is zeroinitializer and whose second is the conversion.
318 bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
319 return vcvtq_low_bf16_f32(a);
322 // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
323 // CHECK-A64-NEXT: entry:
324 // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
325 // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
326 // CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
327 // CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V3_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]] to <16 x i8>
328 // CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]]
330 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
331 // CHECK-A32-HARDFP-NEXT: entry:
332 // CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
333 // CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
334 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
335 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
336 // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
338 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_bf16_f32(
339 // CHECK-A32-SOFTFP-NEXT: entry:
340 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I11:%.*]] = alloca <8 x bfloat>, align 8
341 // CHECK-A32-SOFTFP-NEXT: [[__P0_I12:%.*]] = alloca <4 x bfloat>, align 8
342 // CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8
343 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I8:%.*]] = alloca <4 x bfloat>, align 8
344 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I3:%.*]] = alloca <4 x bfloat>, align 8
345 // CHECK-A32-SOFTFP-NEXT: [[__P0_I4:%.*]] = alloca <8 x bfloat>, align 8
346 // CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8
347 // CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
348 // CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
349 // CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <8 x bfloat>, align 8
350 // CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <4 x bfloat>, align 8
351 // CHECK-A32-SOFTFP-NEXT: [[COERCE5_I:%.*]] = alloca <4 x bfloat>, align 8
352 // CHECK-A32-SOFTFP-NEXT: [[COERCE6_I:%.*]] = alloca <4 x bfloat>, align 8
353 // CHECK-A32-SOFTFP-NEXT: [[COERCE8_I:%.*]] = alloca <8 x bfloat>, align 8
354 // CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
355 // CHECK-A32-SOFTFP-NEXT: [[INACTIVE:%.*]] = alloca <8 x bfloat>, align 8
356 // CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
357 // CHECK-A32-SOFTFP-NEXT: [[COERCE2:%.*]] = alloca <8 x bfloat>, align 8
358 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[INACTIVE_COERCE:%.*]], ptr [[INACTIVE]], align 8
359 // CHECK-A32-SOFTFP-NEXT: [[INACTIVE1:%.*]] = load <8 x bfloat>, ptr [[INACTIVE]], align 8
360 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[INACTIVE1]], ptr [[COERCE]], align 8
361 // CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
362 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP2]], ptr [[__P0_I]], align 8
363 // CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
364 // CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
365 // CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
366 // CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
367 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], ptr [[RETVAL_I8]], align 8
368 // CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[RETVAL_I8]], align 8
369 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP7]], ptr [[COERCE_I]], align 8
370 // CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
371 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE2_I]], align 8
372 // CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr [[COERCE2_I]], align 8
373 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP11]], ptr [[__P0_I4]], align 8
374 // CHECK-A32-SOFTFP-NEXT: [[__P01_I7:%.*]] = load <8 x bfloat>, ptr [[__P0_I4]], align 8
375 // CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7]], <8 x bfloat> [[__P01_I7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
376 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I3]], align 8
377 // CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[RETVAL_I3]], align 8
378 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP14]], ptr [[COERCE4_I]], align 8
379 // CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = load <4 x bfloat>, ptr [[COERCE4_I]], align 8
380 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP9]], ptr [[COERCE5_I]], align 8
381 // CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[COERCE5_I]], align 8
382 // CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP16]], ptr [[COERCE6_I]], align 8
383 // CHECK-A32-SOFTFP-NEXT: [[TMP20:%.*]] = load <2 x i32>, ptr [[COERCE6_I]], align 8
384 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP18]], ptr [[__P0_I12]], align 8
385 // CHECK-A32-SOFTFP-NEXT: [[__P01_I16:%.*]] = load <4 x bfloat>, ptr [[__P0_I12]], align 8
386 // CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP20]], ptr [[__P1_I]], align 8
387 // CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
388 // CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
389 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I17]], ptr [[RETVAL_I11]], align 8
390 // CHECK-A32-SOFTFP-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr [[RETVAL_I11]], align 8
391 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP24]], ptr [[COERCE8_I]], align 8
392 // CHECK-A32-SOFTFP-NEXT: [[TMP26:%.*]] = load <8 x bfloat>, ptr [[COERCE8_I]], align 8
393 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP26]], ptr [[RETVAL_I]], align 8
394 // CHECK-A32-SOFTFP-NEXT: [[TMP28:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
395 // CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP28]], ptr [[COERCE2]], align 8
396 // CHECK-A32-SOFTFP-NEXT: [[TMP30:%.*]] = load <8 x bfloat>, ptr [[COERCE2]], align 8
397 // CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP30]], ptr [[RETVAL]], align 8
398 // CHECK-A32-SOFTFP-NEXT: [[TMP32:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
399 // CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP32]]
// Exercises vcvtq_high_bf16_f32: passes `inactive` and `a` straight to the
// intrinsic and returns its bfloat16x8_t result. Per the assertions above,
// A64 expects a single call to @llvm.aarch64.neon.bfcvtn2 taking both values;
// A32 hard-float expects @llvm.arm.neon.vcvtfp2bf on `a`, a shufflevector of
// lanes 0-3 of `inactive`, and a final shufflevector joining the two halves.
401 bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
402 return vcvtq_high_bf16_f32(inactive, a);
405 // CHECK-A64-LABEL: @test_vcvth_bf16_f32(
406 // CHECK-A64-NEXT: entry:
407 // CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
408 // CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
410 // CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
411 // CHECK-A32-HARDFP-NEXT: entry:
412 // CHECK-A32-HARDFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]])
413 // CHECK-A32-HARDFP-NEXT: ret bfloat [[VCVTBFP2BF_I]]
415 // CHECK-A32-SOFTFP-LABEL: @test_vcvth_bf16_f32(
416 // CHECK-A32-SOFTFP-NEXT: entry:
417 // CHECK-A32-SOFTFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]])
418 // CHECK-A32-SOFTFP-NEXT: ret bfloat [[VCVTBFP2BF_I]]
// Exercises the scalar vcvth_bf16_f32: passes the float32_t argument straight
// to the intrinsic and returns its bfloat16_t result. Per the assertions
// above, A64 expects a call to @llvm.aarch64.neon.bfcvt and both A32
// configurations expect a call to @llvm.arm.neon.vcvtbfp2bf.
420 bfloat16_t test_vcvth_bf16_f32(float32_t a) {
421 return vcvth_bf16_f32(a);
424 // CHECK-LABEL: @test_vcvtah_f32_bf16(
425 // CHECK-NEXT: entry:
426 // CHECK-NEXT: [[__REINT_I:%.*]] = alloca bfloat, align 2
427 // CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
428 // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
429 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[__REINT_I]], align 2
430 // CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[TMP1]], 16
431 // CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
432 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__REINT1_I]], align 4
433 // CHECK-NEXT: ret float [[TMP3]]
// Exercises the scalar vcvtah_f32_bf16: passes the bfloat16_t argument
// straight to the intrinsic and returns its float32_t result. The assertions
// above use the prefix shared by all target configurations and expect the
// value to be stored as bfloat, reloaded as i32, shifted left by 16 and
// reloaded as float.
435 float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
436 return vcvtah_f32_bf16(a);