1 ; RUN: llc -mtriple aarch64 -asm-verbose=1 -mattr=+bf16 %s -o - | FileCheck %s
; Aggregate return types used by the multi-vector tests below: each wraps an
; array of 2, 3 or 4 bfloat vectors with either 4 or 8 lanes.
3 %struct.bfloat16x4x2_t = type { [2 x <4 x bfloat>] }
4 %struct.bfloat16x8x2_t = type { [2 x <8 x bfloat>] }
5 %struct.bfloat16x4x3_t = type { [3 x <4 x bfloat>] }
6 %struct.bfloat16x8x3_t = type { [3 x <8 x bfloat>] }
7 %struct.bfloat16x4x4_t = type { [4 x <4 x bfloat>] }
8 %struct.bfloat16x8x4_t = type { [4 x <8 x bfloat>] }
; Plain <4 x bfloat> vector load from %ptr; the expected codegen is a single
; `ldr d0, [x0]`.
10 define <4 x bfloat> @test_vld1_bf16(ptr nocapture readonly %ptr) local_unnamed_addr nounwind {
11 ; CHECK-LABEL: test_vld1_bf16:
12 ; CHECK: // %bb.0: // %entry
13 ; CHECK-NEXT: ldr d0, [x0]
16 %0 = load <4 x bfloat>, ptr %ptr, align 2
; Plain <8 x bfloat> vector load from %ptr; the expected codegen is a single
; `ldr q0, [x0]`.
20 define <8 x bfloat> @test_vld1q_bf16(ptr nocapture readonly %ptr) local_unnamed_addr nounwind {
21 ; CHECK-LABEL: test_vld1q_bf16:
22 ; CHECK: // %bb.0: // %entry
23 ; CHECK-NEXT: ldr q0, [x0]
26 %0 = load <8 x bfloat>, ptr %ptr, align 2
; Loads one bfloat from %ptr and inserts it into lane 0 of %src; the expected
; codegen is the single-lane form `ld1 { v0.h }[0], [x0]`.
30 define <4 x bfloat> @test_vld1_lane_bf16(ptr nocapture readonly %ptr, <4 x bfloat> %src) local_unnamed_addr nounwind {
31 ; CHECK-LABEL: test_vld1_lane_bf16:
32 ; CHECK: // %bb.0: // %entry
33 ; CHECK: ld1 { v0.h }[0], [x0]
36 %0 = load bfloat, ptr %ptr, align 2
37 %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
38 ret <4 x bfloat> %vld1_lane
; Loads one bfloat from %ptr and inserts it into lane 7 (the last lane) of the
; 8-lane %src; the expected codegen is `ld1 { v0.h }[7], [x0]`.
41 define <8 x bfloat> @test_vld1q_lane_bf16(ptr nocapture readonly %ptr, <8 x bfloat> %src) local_unnamed_addr nounwind {
42 ; CHECK-LABEL: test_vld1q_lane_bf16:
43 ; CHECK: // %bb.0: // %entry
44 ; CHECK-NEXT: ld1 { v0.h }[7], [x0]
47 %0 = load bfloat, ptr %ptr, align 2
48 %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
49 ret <8 x bfloat> %vld1_lane
; Loads one bfloat and broadcasts it to all 4 lanes (insertelement into lane 0
; followed by a zero-mask shufflevector); the expected codegen is the
; load-and-replicate form `ld1r { v0.4h }, [x0]`.
52 define <4 x bfloat> @test_vld1_dup_bf16(ptr nocapture readonly %ptr) local_unnamed_addr nounwind {
53 ; CHECK-LABEL: test_vld1_dup_bf16:
54 ; CHECK: // %bb.0: // %entry
55 ; CHECK-NEXT: ld1r { v0.4h }, [x0]
58 %0 = load bfloat, ptr %ptr, align 2
59 %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
60 %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
61 ret <4 x bfloat> %lane
; Calls the ld1x2 intrinsic for <4 x bfloat> and repacks the two returned
; vectors into %struct.bfloat16x4x2_t; the expected codegen is
; `ld1 { v0.4h, v1.4h }, [x0]`.
64 define %struct.bfloat16x4x2_t @test_vld1_bf16_x2(ptr %ptr) local_unnamed_addr nounwind {
65 ; CHECK-LABEL: test_vld1_bf16_x2:
66 ; CHECK: // %bb.0: // %entry
67 ; CHECK-NEXT: ld1 { v0.4h, v1.4h }, [x0]
70 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0(ptr %ptr)
71 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
72 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
73 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0
74 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1
75 ret %struct.bfloat16x4x2_t %.fca.0.1.insert
; Intrinsic used by test_vld1_bf16_x2 above.
78 declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0(ptr) nounwind
; Calls the ld1x2 intrinsic for <8 x bfloat> and repacks the two returned
; vectors into %struct.bfloat16x8x2_t; the expected codegen is
; `ld1 { v0.8h, v1.8h }, [x0]`.
80 define %struct.bfloat16x8x2_t @test_vld1q_bf16_x2(ptr %ptr) local_unnamed_addr nounwind {
81 ; CHECK-LABEL: test_vld1q_bf16_x2:
82 ; CHECK: // %bb.0: // %entry
83 ; CHECK-NEXT: ld1 { v0.8h, v1.8h }, [x0]
86 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0(ptr %ptr)
87 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
88 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
89 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0
90 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1
91 ret %struct.bfloat16x8x2_t %.fca.0.1.insert
94 ; Function Attrs: argmemonly nounwind readonly
95 declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0(ptr) nounwind
; Calls the ld1x3 intrinsic for <4 x bfloat> and repacks the three returned
; vectors into %struct.bfloat16x4x3_t; the expected codegen is
; `ld1 { v0.4h, v1.4h, v2.4h }, [x0]`.
97 define %struct.bfloat16x4x3_t @test_vld1_bf16_x3(ptr %ptr) local_unnamed_addr nounwind {
98 ; CHECK-LABEL: test_vld1_bf16_x3:
99 ; CHECK: // %bb.0: // %entry
100 ; CHECK-NEXT: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
103 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0(ptr %ptr)
104 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
105 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
106 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
107 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0
108 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1
109 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2
110 ret %struct.bfloat16x4x3_t %.fca.0.2.insert
113 ; Function Attrs: argmemonly nounwind readonly
114 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0(ptr) nounwind
; Calls the ld1x3 intrinsic for <8 x bfloat> and repacks the three returned
; vectors into %struct.bfloat16x8x3_t; the expected codegen is
; `ld1 { v0.8h, v1.8h, v2.8h }, [x0]`.
116 define %struct.bfloat16x8x3_t @test_vld1q_bf16_x3(ptr %ptr) local_unnamed_addr nounwind {
117 ; CHECK-LABEL: test_vld1q_bf16_x3:
118 ; CHECK: // %bb.0: // %entry
119 ; CHECK-NEXT: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
122 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0(ptr %ptr)
123 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
124 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
125 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
126 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0
127 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1
128 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2
129 ret %struct.bfloat16x8x3_t %.fca.0.2.insert
132 ; Function Attrs: argmemonly nounwind readonly
133 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0(ptr) nounwind
; Calls the ld1x4 intrinsic for <4 x bfloat> and repacks the four returned
; vectors into %struct.bfloat16x4x4_t; the expected codegen is
; `ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]`.
135 define %struct.bfloat16x4x4_t @test_vld1_bf16_x4(ptr %ptr) local_unnamed_addr nounwind {
136 ; CHECK-LABEL: test_vld1_bf16_x4:
137 ; CHECK: // %bb.0: // %entry
138 ; CHECK-NEXT: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
141 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0(ptr %ptr)
142 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
143 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
144 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
145 %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
146 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0
147 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1
148 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2
149 %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld1xN.fca.3.extract, 0, 3
150 ret %struct.bfloat16x4x4_t %.fca.0.3.insert
153 ; Function Attrs: argmemonly nounwind readonly
154 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0(ptr) nounwind
; Calls the ld1x4 intrinsic for <8 x bfloat> and repacks the four returned
; vectors into %struct.bfloat16x8x4_t; the expected codegen is
; `ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]`.
156 define %struct.bfloat16x8x4_t @test_vld1q_bf16_x4(ptr %ptr) local_unnamed_addr nounwind {
157 ; CHECK-LABEL: test_vld1q_bf16_x4:
158 ; CHECK: // %bb.0: // %entry
159 ; CHECK-NEXT: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
162 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0(ptr %ptr)
163 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
164 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
165 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
166 %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
167 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0
168 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1
169 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2
170 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld1xN.fca.3.extract, 0, 3
171 ret %struct.bfloat16x8x4_t %.fca.0.3.insert
174 ; Function Attrs: argmemonly nounwind readonly
175 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0(ptr) nounwind
; Loads one bfloat and broadcasts it to all 8 lanes (insertelement into lane 0
; followed by a zero-mask shufflevector); the expected codegen is
; `ld1r { v0.8h }, [x0]`.
177 define <8 x bfloat> @test_vld1q_dup_bf16(ptr nocapture readonly %ptr) local_unnamed_addr nounwind {
178 ; CHECK-LABEL: test_vld1q_dup_bf16:
179 ; CHECK: // %bb.0: // %entry
180 ; CHECK-NEXT: ld1r { v0.8h }, [x0]
183 %0 = load bfloat, ptr %ptr, align 2
184 %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
185 %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
186 ret <8 x bfloat> %lane
; Calls the ld2 intrinsic for <4 x bfloat> and repacks the two returned vectors
; into %struct.bfloat16x4x2_t; the expected codegen is
; `ld2 { v0.4h, v1.4h }, [x0]`.
189 define %struct.bfloat16x4x2_t @test_vld2_bf16(ptr %ptr) local_unnamed_addr nounwind {
190 ; CHECK-LABEL: test_vld2_bf16:
191 ; CHECK: // %bb.0: // %entry
192 ; CHECK-NEXT: ld2 { v0.4h, v1.4h }, [x0]
195 %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0(ptr %ptr)
196 %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0
197 %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1
198 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0
199 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1
200 ret %struct.bfloat16x4x2_t %.fca.0.1.insert
203 ; Function Attrs: argmemonly nounwind readonly
204 declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0(ptr) nounwind
; Calls the ld2 intrinsic for <8 x bfloat> and repacks the two returned vectors
; into %struct.bfloat16x8x2_t; the expected codegen is
; `ld2 { v0.8h, v1.8h }, [x0]`.
206 define %struct.bfloat16x8x2_t @test_vld2q_bf16(ptr %ptr) local_unnamed_addr nounwind {
207 ; CHECK-LABEL: test_vld2q_bf16:
208 ; CHECK: // %bb.0: // %entry
209 ; CHECK-NEXT: ld2 { v0.8h, v1.8h }, [x0]
212 %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0(ptr %ptr)
213 %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0
214 %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1
215 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0
216 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1
217 ret %struct.bfloat16x8x2_t %.fca.0.1.insert
220 ; Function Attrs: argmemonly nounwind readonly
221 declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0(ptr) nounwind
; Splits %src.coerce into its two <4 x bfloat> vectors, calls the ld2lane
; intrinsic with lane index 1, and repacks the results into
; %struct.bfloat16x4x2_t; the expected codegen contains
; `ld2 { v0.h, v1.h }[1], [x0]`.
222 define %struct.bfloat16x4x2_t @test_vld2_lane_bf16(ptr %ptr, [2 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
223 ; CHECK-LABEL: test_vld2_lane_bf16:
224 ; CHECK: // %bb.0: // %entry
225 ; CHECK: ld2 { v0.h, v1.h }[1], [x0]
228 %src.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 0
229 %src.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 1
230 %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, ptr %ptr)
231 %vld2_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 0
232 %vld2_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 1
233 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2_lane.fca.0.extract, 0, 0
234 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2_lane.fca.1.extract, 0, 1
235 ret %struct.bfloat16x4x2_t %.fca.0.1.insert
238 ; Function Attrs: argmemonly nounwind readonly
239 declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, i64, ptr) nounwind
; Splits %src.coerce into its two <8 x bfloat> vectors, calls the ld2lane
; intrinsic with lane index 7, and repacks the results into
; %struct.bfloat16x8x2_t; the expected codegen contains
; `ld2 { v0.h, v1.h }[7], [x0]`.
241 define %struct.bfloat16x8x2_t @test_vld2q_lane_bf16(ptr %ptr, [2 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
242 ; CHECK-LABEL: test_vld2q_lane_bf16:
243 ; CHECK: // %bb.0: // %entry
244 ; CHECK: ld2 { v0.h, v1.h }[7], [x0]
247 %src.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 0
248 %src.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 1
249 %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, ptr %ptr)
250 %vld2_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 0
251 %vld2_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 1
252 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2_lane.fca.0.extract, 0, 0
253 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2_lane.fca.1.extract, 0, 1
254 ret %struct.bfloat16x8x2_t %.fca.0.1.insert
257 ; Function Attrs: argmemonly nounwind readonly
258 declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, i64, ptr) nounwind
; Calls the ld3 intrinsic for <4 x bfloat> and repacks the three returned
; vectors into %struct.bfloat16x4x3_t; the expected codegen is
; `ld3 { v0.4h, v1.4h, v2.4h }, [x0]`.
260 define %struct.bfloat16x4x3_t @test_vld3_bf16(ptr %ptr) local_unnamed_addr nounwind {
261 ; CHECK-LABEL: test_vld3_bf16:
262 ; CHECK: // %bb.0: // %entry
263 ; CHECK-NEXT: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
266 %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0(ptr %ptr)
267 %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0
268 %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1
269 %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2
270 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0
271 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1
272 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2
273 ret %struct.bfloat16x4x3_t %.fca.0.2.insert
276 ; Function Attrs: argmemonly nounwind readonly
277 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0(ptr) nounwind
; Calls the ld3 intrinsic for <8 x bfloat> and repacks the three returned
; vectors into %struct.bfloat16x8x3_t; the expected codegen is
; `ld3 { v0.8h, v1.8h, v2.8h }, [x0]`.
279 define %struct.bfloat16x8x3_t @test_vld3q_bf16(ptr %ptr) local_unnamed_addr nounwind {
280 ; CHECK-LABEL: test_vld3q_bf16:
281 ; CHECK: // %bb.0: // %entry
282 ; CHECK-NEXT: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
285 %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0(ptr %ptr)
286 %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0
287 %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1
288 %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2
289 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0
290 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1
291 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2
292 ret %struct.bfloat16x8x3_t %.fca.0.2.insert
295 ; Function Attrs: argmemonly nounwind readonly
296 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0(ptr) nounwind
; Splits %src.coerce into its three <4 x bfloat> vectors, calls the ld3lane
; intrinsic with lane index 1, and repacks the results into
; %struct.bfloat16x4x3_t; the expected codegen contains
; `ld3 { v0.h, v1.h, v2.h }[1], [x0]`.
298 define %struct.bfloat16x4x3_t @test_vld3_lane_bf16(ptr %ptr, [3 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
299 ; CHECK-LABEL: test_vld3_lane_bf16:
300 ; CHECK: // %bb.0: // %entry
301 ; CHECK: ld3 { v0.h, v1.h, v2.h }[1], [x0]
304 %src.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 0
305 %src.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 1
306 %src.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 2
307 %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, ptr %ptr)
308 %vld3_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 0
309 %vld3_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 1
310 %vld3_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 2
311 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3_lane.fca.0.extract, 0, 0
312 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3_lane.fca.1.extract, 0, 1
313 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3_lane.fca.2.extract, 0, 2
314 ret %struct.bfloat16x4x3_t %.fca.0.2.insert
317 ; Function Attrs: argmemonly nounwind readonly
318 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, ptr) nounwind
; Splits %src.coerce into its three <8 x bfloat> vectors, calls the ld3lane
; intrinsic with lane index 7, and repacks the results into
; %struct.bfloat16x8x3_t; the expected codegen contains
; `ld3 { v0.h, v1.h, v2.h }[7], [x0]`.
; The directive that matches the ld3 instruction uses the same plain check
; prefix as the other lane tests in this file, so that FileCheck actually
; verifies the instruction (a misspelled prefix is silently ignored).
320 define %struct.bfloat16x8x3_t @test_vld3q_lane_bf16(ptr %ptr, [3 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
321 ; CHECK-LABEL: test_vld3q_lane_bf16:
322 ; CHECK: // %bb.0: // %entry
323 ; CHECK: ld3 { v0.h, v1.h, v2.h }[7], [x0]
326 %src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0
327 %src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1
328 %src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2
329 %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, ptr %ptr)
330 %vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0
331 %vld3_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 1
332 %vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 2
333 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0
334 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1
335 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2
336 ret %struct.bfloat16x8x3_t %.fca.0.2.insert
339 ; Function Attrs: argmemonly nounwind readonly
340 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, ptr) nounwind
; Calls the ld4 intrinsic for <4 x bfloat> and repacks the four returned
; vectors into %struct.bfloat16x4x4_t; the expected codegen is
; `ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]`.
342 define %struct.bfloat16x4x4_t @test_vld4_bf16(ptr %ptr) local_unnamed_addr nounwind {
343 ; CHECK-LABEL: test_vld4_bf16:
344 ; CHECK: // %bb.0: // %entry
345 ; CHECK-NEXT: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
348 %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0(ptr %ptr)
349 %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0
350 %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1
351 %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2
352 %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3
353 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0
354 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1
355 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2
356 %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3
357 ret %struct.bfloat16x4x4_t %.fca.0.3.insert
360 ; Function Attrs: argmemonly nounwind readonly
361 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0(ptr) nounwind
; Calls the ld4 intrinsic for <8 x bfloat> and repacks the four returned
; vectors into %struct.bfloat16x8x4_t; the expected codegen is
; `ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]`.
363 define %struct.bfloat16x8x4_t @test_vld4q_bf16(ptr %ptr) local_unnamed_addr nounwind {
364 ; CHECK-LABEL: test_vld4q_bf16:
365 ; CHECK: // %bb.0: // %entry
366 ; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
369 %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0(ptr %ptr)
370 %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0
371 %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1
372 %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2
373 %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3
374 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0
375 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1
376 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2
377 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3
378 ret %struct.bfloat16x8x4_t %.fca.0.3.insert
381 ; Function Attrs: argmemonly nounwind readonly
382 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0(ptr) nounwind
; Splits %src.coerce into its four <4 x bfloat> vectors, calls the ld4lane
; intrinsic with lane index 1, and repacks the results into
; %struct.bfloat16x4x4_t; the expected codegen contains
; `ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]`.
384 define %struct.bfloat16x4x4_t @test_vld4_lane_bf16(ptr %ptr, [4 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
385 ; CHECK-LABEL: test_vld4_lane_bf16:
386 ; CHECK: // %bb.0: // %entry
387 ; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
390 %src.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 0
391 %src.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 1
392 %src.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 2
393 %src.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 3
394 %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, ptr %ptr)
395 %vld4_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 0
396 %vld4_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 1
397 %vld4_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 2
398 %vld4_lane.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 3
399 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4_lane.fca.0.extract, 0, 0
400 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4_lane.fca.1.extract, 0, 1
401 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4_lane.fca.2.extract, 0, 2
402 %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4_lane.fca.3.extract, 0, 3
403 ret %struct.bfloat16x4x4_t %.fca.0.3.insert
406 ; Function Attrs: argmemonly nounwind readonly
407 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, ptr) nounwind
; Splits %src.coerce into its four <8 x bfloat> vectors, calls the ld4lane
; intrinsic with lane index 7, and repacks the results into
; %struct.bfloat16x8x4_t; the expected codegen contains
; `ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]`.
409 define %struct.bfloat16x8x4_t @test_vld4q_lane_bf16(ptr %ptr, [4 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
410 ; CHECK-LABEL: test_vld4q_lane_bf16:
411 ; CHECK: // %bb.0: // %entry
412 ; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
415 %src.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 0
416 %src.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 1
417 %src.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 2
418 %src.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 3
419 %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, ptr %ptr)
420 %vld4_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 0
421 %vld4_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 1
422 %vld4_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 2
423 %vld4_lane.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 3
424 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4_lane.fca.0.extract, 0, 0
425 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4_lane.fca.1.extract, 0, 1
426 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4_lane.fca.2.extract, 0, 2
427 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4_lane.fca.3.extract, 0, 3
428 ret %struct.bfloat16x8x4_t %.fca.0.3.insert
431 ; Function Attrs: argmemonly nounwind readonly
432 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, ptr) nounwind
; Calls the ld2r intrinsic for <4 x bfloat> and repacks the two returned
; vectors into %struct.bfloat16x4x2_t; the expected codegen is
; `ld2r { v0.4h, v1.4h }, [x0]`.
434 define %struct.bfloat16x4x2_t @test_vld2_dup_bf16(ptr %ptr) local_unnamed_addr nounwind {
435 ; CHECK-LABEL: test_vld2_dup_bf16:
436 ; CHECK: // %bb.0: // %entry
437 ; CHECK-NEXT: ld2r { v0.4h, v1.4h }, [x0]
440 %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0(ptr %ptr)
441 %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0
442 %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1
443 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0
444 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1
445 ret %struct.bfloat16x4x2_t %.fca.0.1.insert
448 ; Function Attrs: argmemonly nounwind readonly
449 declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0(ptr) nounwind
; Calls the ld2r intrinsic for <8 x bfloat> and repacks the two returned
; vectors into %struct.bfloat16x8x2_t; the expected codegen is
; `ld2r { v0.8h, v1.8h }, [x0]`.
451 define %struct.bfloat16x8x2_t @test_vld2q_dup_bf16(ptr %ptr) local_unnamed_addr nounwind {
452 ; CHECK-LABEL: test_vld2q_dup_bf16:
453 ; CHECK: // %bb.0: // %entry
454 ; CHECK-NEXT: ld2r { v0.8h, v1.8h }, [x0]
457 %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0(ptr %ptr)
458 %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0
459 %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1
460 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0
461 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1
462 ret %struct.bfloat16x8x2_t %.fca.0.1.insert
465 ; Function Attrs: argmemonly nounwind readonly
466 declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0(ptr) nounwind
; Calls the ld3r intrinsic for <4 x bfloat> and repacks the three returned
; vectors into %struct.bfloat16x4x3_t; the expected codegen is
; `ld3r { v0.4h, v1.4h, v2.4h }, [x0]`.
468 define %struct.bfloat16x4x3_t @test_vld3_dup_bf16(ptr %ptr) local_unnamed_addr nounwind {
469 ; CHECK-LABEL: test_vld3_dup_bf16:
470 ; CHECK: // %bb.0: // %entry
471 ; CHECK-NEXT: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
474 %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0(ptr %ptr)
475 %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0
476 %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1
477 %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2
478 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0
479 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1
480 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2
481 ret %struct.bfloat16x4x3_t %.fca.0.2.insert
484 ; Function Attrs: argmemonly nounwind readonly
485 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0(ptr) nounwind
; Calls the ld3r intrinsic for <8 x bfloat> and repacks the three returned
; vectors into %struct.bfloat16x8x3_t; the expected codegen is
; `ld3r { v0.8h, v1.8h, v2.8h }, [x0]`.
487 define %struct.bfloat16x8x3_t @test_vld3q_dup_bf16(ptr %ptr) local_unnamed_addr nounwind {
488 ; CHECK-LABEL: test_vld3q_dup_bf16:
489 ; CHECK: // %bb.0: // %entry
490 ; CHECK-NEXT: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
493 %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0(ptr %ptr)
494 %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0
495 %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1
496 %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2
497 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0
498 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1
499 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2
500 ret %struct.bfloat16x8x3_t %.fca.0.2.insert
503 ; Function Attrs: argmemonly nounwind readonly
504 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0(ptr) nounwind
; Calls the ld4r intrinsic for <4 x bfloat> and repacks the four returned
; vectors into %struct.bfloat16x4x4_t; the expected codegen is
; `ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]`.
506 define %struct.bfloat16x4x4_t @test_vld4_dup_bf16(ptr %ptr) local_unnamed_addr nounwind {
507 ; CHECK-LABEL: test_vld4_dup_bf16:
508 ; CHECK: // %bb.0: // %entry
509 ; CHECK-NEXT: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
512 %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0(ptr %ptr)
513 %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0
514 %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1
515 %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2
516 %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3
517 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0
518 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1
519 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2
520 %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3
521 ret %struct.bfloat16x4x4_t %.fca.0.3.insert
524 ; Function Attrs: argmemonly nounwind readonly
525 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0(ptr) nounwind
; Calls the ld4r intrinsic for <8 x bfloat> and repacks the four returned
; vectors into %struct.bfloat16x8x4_t; the expected codegen is
; `ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]`.
527 define %struct.bfloat16x8x4_t @test_vld4q_dup_bf16(ptr %ptr) local_unnamed_addr nounwind {
528 ; CHECK-LABEL: test_vld4q_dup_bf16:
529 ; CHECK: // %bb.0: // %entry
530 ; CHECK-NEXT: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
533 %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0(ptr %ptr)
534 %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0
535 %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1
536 %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2
537 %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3
538 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0
539 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1
540 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2
541 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3
542 ret %struct.bfloat16x8x4_t %.fca.0.3.insert
545 ; Function Attrs: argmemonly nounwind readonly
546 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0(ptr) nounwind
; test_vst1_bf16: stores the whole <4 x bfloat> argument to %ptr with a plain
; store (align 8); no intrinsic is involved. FileCheck expects a single
; "str d0, [x0]" on the line right after the entry-block marker.
548 define void @test_vst1_bf16(ptr nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
549 ; CHECK-LABEL: test_vst1_bf16:
550 ; CHECK: // %bb.0: // %entry
551 ; CHECK-NEXT: str d0, [x0]
554 store <4 x bfloat> %val, ptr %ptr, align 8
; test_vst1q_bf16: stores the whole <8 x bfloat> argument to %ptr with a plain
; store (align 16); no intrinsic is involved. FileCheck expects a single
; "str q0, [x0]" on the line right after the entry-block marker.
558 define void @test_vst1q_bf16(ptr nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
559 ; CHECK-LABEL: test_vst1q_bf16:
560 ; CHECK: // %bb.0: // %entry
561 ; CHECK-NEXT: str q0, [x0]
564 store <8 x bfloat> %val, ptr %ptr, align 16
; test_vst1_lane_bf16: extracts element 1 of the <4 x bfloat> argument and
; stores that single bfloat to %ptr (align 2). FileCheck expects a lane store
; "st1 { v0.h }[1], [x0]". The instruction directive is the plain (non-NEXT)
; form, so it may match on any line after the entry-block marker.
568 define void @test_vst1_lane_bf16(ptr nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
569 ; CHECK-LABEL: test_vst1_lane_bf16:
570 ; CHECK: // %bb.0: // %entry
571 ; CHECK: st1 { v0.h }[1], [x0]
574 %0 = extractelement <4 x bfloat> %val, i32 1
575 store bfloat %0, ptr %ptr, align 2
; test_vst1q_lane_bf16: extracts element 7 (the last lane) of the
; <8 x bfloat> argument and stores that single bfloat to %ptr (align 2).
; FileCheck expects "st1 { v0.h }[7], [x0]" on the line right after the
; entry-block marker.
579 define void @test_vst1q_lane_bf16(ptr nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
580 ; CHECK-LABEL: test_vst1q_lane_bf16:
581 ; CHECK: // %bb.0: // %entry
582 ; CHECK-NEXT: st1 { v0.h }[7], [x0]
585 %0 = extractelement <8 x bfloat> %val, i32 7
586 store bfloat %0, ptr %ptr, align 2
; test_vst1_bf16_x2: splits the [2 x <4 x bfloat>] argument into its two
; vectors and passes them, followed by %ptr, to the st1x2 intrinsic.
; FileCheck expects "st1 { v0.4h, v1.4h }, [x0]". The instruction directive is
; the plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
590 define void @test_vst1_bf16_x2(ptr nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
591 ; CHECK-LABEL: test_vst1_bf16_x2:
592 ; CHECK: // %bb.0: // %entry
593 ; CHECK: st1 { v0.4h, v1.4h }, [x0]
596 %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
597 %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
598 tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
602 ; Function Attrs: argmemonly nounwind
603 declare void @llvm.aarch64.neon.st1x2.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, ptr nocapture) nounwind
; test_vst1q_bf16_x2: splits the [2 x <8 x bfloat>] argument into its two
; vectors and passes them, followed by %ptr, to the st1x2 intrinsic.
; FileCheck expects "st1 { v0.8h, v1.8h }, [x0]". The instruction directive is
; the plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
605 define void @test_vst1q_bf16_x2(ptr nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
606 ; CHECK-LABEL: test_vst1q_bf16_x2:
607 ; CHECK: // %bb.0: // %entry
608 ; CHECK: st1 { v0.8h, v1.8h }, [x0]
611 %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
612 %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
613 tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
617 ; Function Attrs: argmemonly nounwind
618 declare void @llvm.aarch64.neon.st1x2.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, ptr nocapture) nounwind
; test_vst1_bf16_x3: splits the [3 x <4 x bfloat>] argument into its three
; vectors and passes them, followed by %ptr, to the st1x3 intrinsic.
; FileCheck expects "st1 { v0.4h, v1.4h, v2.4h }, [x0]". The instruction
; directive is the plain (non-NEXT) form, so it may match on any line after
; the entry-block marker.
620 define void @test_vst1_bf16_x3(ptr nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
621 ; CHECK-LABEL: test_vst1_bf16_x3:
622 ; CHECK: // %bb.0: // %entry
623 ; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
626 %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
627 %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
628 %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
629 tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
633 ; Function Attrs: argmemonly nounwind
634 declare void @llvm.aarch64.neon.st1x3.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, ptr nocapture) nounwind
; test_vst1q_bf16_x3: splits the [3 x <8 x bfloat>] argument into its three
; vectors and passes them, followed by %ptr, to the st1x3 intrinsic.
; FileCheck expects "st1 { v0.8h, v1.8h, v2.8h }, [x0]". The instruction
; directive is the plain (non-NEXT) form, so it may match on any line after
; the entry-block marker.
636 define void @test_vst1q_bf16_x3(ptr nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
637 ; CHECK-LABEL: test_vst1q_bf16_x3:
638 ; CHECK: // %bb.0: // %entry
639 ; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
642 %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
643 %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
644 %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
645 tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
649 ; Function Attrs: argmemonly nounwind
650 declare void @llvm.aarch64.neon.st1x3.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, ptr nocapture) nounwind
; test_vst1_bf16_x4: splits the [4 x <4 x bfloat>] argument into its four
; vectors and passes them, followed by %ptr, to the st1x4 intrinsic.
; FileCheck expects "st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]". The
; instruction directive is the plain (non-NEXT) form, so it may match on any
; line after the entry-block marker.
652 ; Function Attrs: nounwind
653 define void @test_vst1_bf16_x4(ptr nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
654 ; CHECK-LABEL: test_vst1_bf16_x4:
655 ; CHECK: // %bb.0: // %entry
656 ; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
659 %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
660 %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
661 %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
662 %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
663 tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
667 ; Function Attrs: argmemonly nounwind
668 declare void @llvm.aarch64.neon.st1x4.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, ptr nocapture) nounwind
; test_vst1q_bf16_x4: splits the [4 x <8 x bfloat>] argument into its four
; vectors and passes them, followed by %ptr, to the st1x4 intrinsic.
; FileCheck expects "st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]". The
; instruction directive is the plain (non-NEXT) form, so it may match on any
; line after the entry-block marker.
670 define void @test_vst1q_bf16_x4(ptr nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
671 ; CHECK-LABEL: test_vst1q_bf16_x4:
672 ; CHECK: // %bb.0: // %entry
673 ; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
676 %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
677 %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
678 %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
679 %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
680 tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
684 ; Function Attrs: argmemonly nounwind
685 declare void @llvm.aarch64.neon.st1x4.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, ptr nocapture) nounwind
; test_vst2_bf16: splits the [2 x <4 x bfloat>] argument into its two vectors
; and passes them, followed by %ptr, to the st2 intrinsic. FileCheck expects
; "st2 { v0.4h, v1.4h }, [x0]". The instruction directive is the plain
; (non-NEXT) form, so it may match on any line after the entry-block marker.
687 define void @test_vst2_bf16(ptr nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
688 ; CHECK-LABEL: test_vst2_bf16:
689 ; CHECK: // %bb.0: // %entry
690 ; CHECK: st2 { v0.4h, v1.4h }, [x0]
693 %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
694 %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
695 tail call void @llvm.aarch64.neon.st2.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
699 ; Function Attrs: argmemonly nounwind
700 declare void @llvm.aarch64.neon.st2.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, ptr nocapture) nounwind
; test_vst2q_bf16: splits the [2 x <8 x bfloat>] argument into its two
; vectors and passes them, followed by %ptr, to the st2 intrinsic. FileCheck
; expects "st2 { v0.8h, v1.8h }, [x0]". The instruction directive is the
; plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
702 define void @test_vst2q_bf16(ptr nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
703 ; CHECK-LABEL: test_vst2q_bf16:
704 ; CHECK: // %bb.0: // %entry
705 ; CHECK: st2 { v0.8h, v1.8h }, [x0]
708 %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
709 %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
710 tail call void @llvm.aarch64.neon.st2.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
714 ; Function Attrs: argmemonly nounwind
715 declare void @llvm.aarch64.neon.st2.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, ptr nocapture) nounwind
; test_vst2_lane_bf16: splits the [2 x <4 x bfloat>] argument into its two
; vectors and passes them to the st2lane intrinsic together with the constant
; lane index 1 (the i64 operand) and %ptr. FileCheck expects
; "st2 { v0.h, v1.h }[1], [x0]". The instruction directive is the plain
; (non-NEXT) form, so it may match on any line after the entry-block marker.
717 define void @test_vst2_lane_bf16(ptr nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
718 ; CHECK-LABEL: test_vst2_lane_bf16:
719 ; CHECK: // %bb.0: // %entry
720 ; CHECK: st2 { v0.h, v1.h }[1], [x0]
723 %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
724 %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
725 tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
729 ; Function Attrs: argmemonly nounwind
730 declare void @llvm.aarch64.neon.st2lane.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, i64, ptr nocapture) nounwind
; test_vst2q_lane_bf16: splits the [2 x <8 x bfloat>] argument into its two
; vectors and passes them to the st2lane intrinsic together with the constant
; lane index 7 (the i64 operand) and %ptr. FileCheck expects
; "st2 { v0.h, v1.h }[7], [x0]". The instruction directive is the plain
; (non-NEXT) form, so it may match on any line after the entry-block marker.
732 ; Function Attrs: nounwind
733 define void @test_vst2q_lane_bf16(ptr nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
734 ; CHECK-LABEL: test_vst2q_lane_bf16:
735 ; CHECK: // %bb.0: // %entry
736 ; CHECK: st2 { v0.h, v1.h }[7], [x0]
739 %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
740 %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
741 tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
745 ; Function Attrs: argmemonly nounwind
746 declare void @llvm.aarch64.neon.st2lane.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, i64, ptr nocapture) nounwind
; test_vst3_bf16: splits the [3 x <4 x bfloat>] argument into its three
; vectors and passes them, followed by %ptr, to the st3 intrinsic. FileCheck
; expects "st3 { v0.4h, v1.4h, v2.4h }, [x0]". The instruction directive is
; the plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
748 ; Function Attrs: nounwind
749 define void @test_vst3_bf16(ptr nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
750 ; CHECK-LABEL: test_vst3_bf16:
751 ; CHECK: // %bb.0: // %entry
752 ; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
755 %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
756 %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
757 %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
758 tail call void @llvm.aarch64.neon.st3.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
762 ; Function Attrs: argmemonly nounwind
763 declare void @llvm.aarch64.neon.st3.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, ptr nocapture) nounwind
; test_vst3q_bf16: splits the [3 x <8 x bfloat>] argument into its three
; vectors and passes them, followed by %ptr, to the st3 intrinsic. FileCheck
; expects "st3 { v0.8h, v1.8h, v2.8h }, [x0]". The instruction directive is
; the plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
765 ; Function Attrs: nounwind
766 define void @test_vst3q_bf16(ptr nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
767 ; CHECK-LABEL: test_vst3q_bf16:
768 ; CHECK: // %bb.0: // %entry
769 ; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
772 %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
773 %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
774 %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
775 tail call void @llvm.aarch64.neon.st3.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
779 ; Function Attrs: argmemonly nounwind
780 declare void @llvm.aarch64.neon.st3.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, ptr nocapture) nounwind
; test_vst3_lane_bf16: splits the [3 x <4 x bfloat>] argument into its three
; vectors and passes them to the st3lane intrinsic together with the constant
; lane index 1 (the i64 operand) and %ptr. FileCheck expects
; "st3 { v0.h, v1.h, v2.h }[1], [x0]". The instruction directive is the plain
; (non-NEXT) form, so it may match on any line after the entry-block marker.
782 ; Function Attrs: nounwind
783 define void @test_vst3_lane_bf16(ptr nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
784 ; CHECK-LABEL: test_vst3_lane_bf16:
785 ; CHECK: // %bb.0: // %entry
786 ; CHECK: st3 { v0.h, v1.h, v2.h }[1], [x0]
789 %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
790 %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
791 %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
792 tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
796 ; Function Attrs: argmemonly nounwind
797 declare void @llvm.aarch64.neon.st3lane.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, ptr nocapture) nounwind
; test_vst3q_lane_bf16: splits the [3 x <8 x bfloat>] argument into its three
; vectors and passes them to the st3lane intrinsic together with the constant
; lane index 7 (the i64 operand) and %ptr. FileCheck expects
; "st3 { v0.h, v1.h, v2.h }[7], [x0]". The instruction directive is the plain
; (non-NEXT) form, so it may match on any line after the entry-block marker.
799 ; Function Attrs: nounwind
800 define void @test_vst3q_lane_bf16(ptr nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
801 ; CHECK-LABEL: test_vst3q_lane_bf16:
802 ; CHECK: // %bb.0: // %entry
803 ; CHECK: st3 { v0.h, v1.h, v2.h }[7], [x0]
806 %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
807 %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
808 %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
809 tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
813 ; Function Attrs: argmemonly nounwind
814 declare void @llvm.aarch64.neon.st3lane.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, ptr nocapture) nounwind
; test_vst4_bf16: splits the [4 x <4 x bfloat>] argument into its four
; vectors and passes them, followed by %ptr, to the st4 intrinsic. FileCheck
; expects "st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]". The instruction
; directive is the plain (non-NEXT) form, so it may match on any line after
; the entry-block marker.
816 ; Function Attrs: nounwind
817 define void @test_vst4_bf16(ptr nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
818 ; CHECK-LABEL: test_vst4_bf16:
819 ; CHECK: // %bb.0: // %entry
820 ; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
823 %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
824 %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
825 %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
826 %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
827 tail call void @llvm.aarch64.neon.st4.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
831 ; Function Attrs: argmemonly nounwind
832 declare void @llvm.aarch64.neon.st4.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, ptr nocapture) nounwind
; test_vst4q_bf16: splits the [4 x <8 x bfloat>] argument into its four
; vectors and passes them, followed by %ptr, to the st4 intrinsic. FileCheck
; expects "st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]". The instruction
; directive is the plain (non-NEXT) form, so it may match on any line after
; the entry-block marker.
834 ; Function Attrs: nounwind
835 define void @test_vst4q_bf16(ptr nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
836 ; CHECK-LABEL: test_vst4q_bf16:
837 ; CHECK: // %bb.0: // %entry
838 ; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
841 %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
842 %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
843 %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
844 %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
845 tail call void @llvm.aarch64.neon.st4.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
849 ; Function Attrs: argmemonly nounwind
850 declare void @llvm.aarch64.neon.st4.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, ptr nocapture) nounwind
; test_vst4_lane_bf16: splits the [4 x <4 x bfloat>] argument into its four
; vectors and passes them to the st4lane intrinsic together with the constant
; lane index 1 (the i64 operand) and %ptr. FileCheck expects
; "st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]". The instruction directive is the
; plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
852 ; Function Attrs: nounwind
853 define void @test_vst4_lane_bf16(ptr nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
854 ; CHECK-LABEL: test_vst4_lane_bf16:
855 ; CHECK: // %bb.0: // %entry
856 ; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
859 %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
860 %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
861 %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
862 %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
863 tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
867 ; Function Attrs: argmemonly nounwind
868 declare void @llvm.aarch64.neon.st4lane.v4bf16.p0(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, ptr nocapture) nounwind
; test_vst4q_lane_bf16: splits the [4 x <8 x bfloat>] argument into its four
; vectors and passes them to the st4lane intrinsic together with the constant
; lane index 7 (the i64 operand) and %ptr. FileCheck expects
; "st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]". The instruction directive is the
; plain (non-NEXT) form, so it may match on any line after the entry-block
; marker.
870 ; Function Attrs: nounwind
871 define void @test_vst4q_lane_bf16(ptr nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
872 ; CHECK-LABEL: test_vst4q_lane_bf16:
873 ; CHECK: // %bb.0: // %entry
874 ; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
877 %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
878 %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
879 %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
880 %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
881 tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, ptr %ptr)
; Declaration of the intrinsic called by the test above.
; NOTE(review): the attribute comment on the next line lists argmemonly, but
; the only function attribute on the declaration itself is nounwind.
885 ; Function Attrs: argmemonly nounwind
886 declare void @llvm.aarch64.neon.st4lane.v8bf16.p0(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, ptr nocapture) nounwind