Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / aarch64-bf16-dotprod-intrinsics.c
blob0f4d9558080540c9648cc46c8f946d310c7c50ea
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
3 // RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
5 // REQUIRES: aarch64-registered-target || arm-registered-target
7 #include <arm_neon.h>
9 // CHECK-LABEL: @test_vbfdot_f32(
10 // CHECK-NEXT: entry:
11 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
12 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
13 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
14 // CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
15 // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
17 float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
18 return vbfdot_f32(r, a, b);
21 // CHECK-LABEL: @test_vbfdotq_f32(
22 // CHECK-NEXT: entry:
23 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
24 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
25 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
26 // CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
27 // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
29 float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
30 return vbfdotq_f32(r, a, b);
33 // CHECK-LABEL: @test_vbfdot_lane_f32(
34 // CHECK-NEXT: entry:
35 // CHECK-NEXT: [[__REINT_144:%.*]] = alloca <4 x bfloat>, align 8
36 // CHECK-NEXT: [[__REINT1_144:%.*]] = alloca <2 x float>, align 8
37 // CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_144]], align 8
38 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT_144]], align 8
39 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
40 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
41 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
42 // CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_144]], align 8
43 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[__REINT1_144]], align 8
44 // CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
45 // CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
46 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
47 // CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
48 // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
50 float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
51 return vbfdot_lane_f32(r, a, b, 0);
54 // CHECK-LABEL: @test_vbfdotq_laneq_f32(
55 // CHECK-NEXT: entry:
56 // CHECK-NEXT: [[__REINT_146:%.*]] = alloca <8 x bfloat>, align 16
57 // CHECK-NEXT: [[__REINT1_146:%.*]] = alloca <4 x float>, align 16
58 // CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_146]], align 16
59 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[__REINT_146]], align 16
60 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <16 x i8>
61 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
62 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
63 // CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_146]], align 16
64 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[__REINT1_146]], align 16
65 // CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
66 // CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
67 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
68 // CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
69 // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
71 float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
72 return vbfdotq_laneq_f32(r, a, b, 3);
75 // CHECK-LABEL: @test_vbfdot_laneq_f32(
76 // CHECK-NEXT: entry:
77 // CHECK-NEXT: [[__REINT_148:%.*]] = alloca <8 x bfloat>, align 16
78 // CHECK-NEXT: [[__REINT1_148:%.*]] = alloca <2 x float>, align 8
79 // CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_148]], align 16
80 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[__REINT_148]], align 16
81 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <16 x i8>
82 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
83 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <2 x i32> <i32 3, i32 3>
84 // CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_148]], align 8
85 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[__REINT1_148]], align 8
86 // CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
87 // CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
88 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
89 // CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
90 // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
92 float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
93 return vbfdot_laneq_f32(r, a, b, 3);
96 // CHECK-LABEL: @test_vbfdotq_lane_f32(
97 // CHECK-NEXT: entry:
98 // CHECK-NEXT: [[__REINT_142:%.*]] = alloca <4 x bfloat>, align 8
99 // CHECK-NEXT: [[__REINT1_142:%.*]] = alloca <4 x float>, align 16
100 // CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_142]], align 8
101 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT_142]], align 8
102 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
103 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
104 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
105 // CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_142]], align 16
106 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[__REINT1_142]], align 16
107 // CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
108 // CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
109 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
110 // CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
111 // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
113 float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
114 return vbfdotq_lane_f32(r, a, b, 0);
117 // CHECK-LABEL: @test_vbfmmlaq_f32(
118 // CHECK-NEXT: entry:
119 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
120 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
121 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
122 // CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
123 // CHECK-NEXT: [[VBFMMLAQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_V3_I]] to <16 x i8>
124 // CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
126 float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
127 return vbfmmlaq_f32(r, a, b);
130 // CHECK-LABEL: @test_vbfmlalbq_f32(
131 // CHECK-NEXT: entry:
132 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
133 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
134 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
135 // CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
136 // CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
137 // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
139 float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
140 return vbfmlalbq_f32(r, a, b);
143 // CHECK-LABEL: @test_vbfmlaltq_f32(
144 // CHECK-NEXT: entry:
145 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
146 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
147 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
148 // CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
149 // CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
150 // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
152 float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
153 return vbfmlaltq_f32(r, a, b);
156 // CHECK-LABEL: @test_vbfmlalbq_lane_f32(
157 // CHECK-NEXT: entry:
158 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0
159 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> undef, bfloat [[VGET_LANE]], i32 0
160 // CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
161 // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
162 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
163 // CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
164 // CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
165 // CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
166 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
167 // CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
168 // CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
169 // CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
170 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
171 // CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
172 // CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
173 // CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
174 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
175 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
176 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
177 // CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
178 // CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
179 // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
181 float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
182 return vbfmlalbq_lane_f32(r, a, b, 0);
185 // CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
186 // CHECK-NEXT: entry:
187 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3
188 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> undef, bfloat [[VGETQ_LANE]], i32 0
189 // CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
190 // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1
191 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
192 // CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2
193 // CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
194 // CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3
195 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
196 // CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4
197 // CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
198 // CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5
199 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
200 // CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6
201 // CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
202 // CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7
203 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
204 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
205 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
206 // CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
207 // CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
208 // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
210 float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
211 return vbfmlalbq_laneq_f32(r, a, b, 3);
214 // CHECK-LABEL: @test_vbfmlaltq_lane_f32(
215 // CHECK-NEXT: entry:
216 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0
217 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> undef, bfloat [[VGET_LANE]], i32 0
218 // CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
219 // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
220 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
221 // CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
222 // CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
223 // CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
224 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
225 // CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
226 // CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
227 // CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
228 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
229 // CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
230 // CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
231 // CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
232 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
233 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
234 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
235 // CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
236 // CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
237 // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
239 float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
240 return vbfmlaltq_lane_f32(r, a, b, 0);
243 // CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
244 // CHECK-NEXT: entry:
245 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3
246 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> undef, bfloat [[VGETQ_LANE]], i32 0
247 // CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
248 // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1
249 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
250 // CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2
251 // CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
252 // CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3
253 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
254 // CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4
255 // CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
256 // CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5
257 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
258 // CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6
259 // CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
260 // CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7
261 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
262 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
263 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
264 // CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
265 // CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
266 // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
268 float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
269 return vbfmlaltq_laneq_f32(r, a, b, 3);