1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
3 ; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops depending on it.
; Plain 64-bit vector load: a <4 x bfloat> load with 2-byte alignment is
; expected to select a single vld1.16 into d0 with no address alignment hint.
6 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(ptr nocapture readonly %ptr) {
7 ; CHECK-LABEL: test_vld1_bf16:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: vld1.16 {d0}, [r0]
12 %0 = load <4 x bfloat>, ptr %ptr, align 2
; Plain 128-bit vector load: an <8 x bfloat> load with 2-byte alignment is
; expected to select a single vld1.16 into the register pair d0, d1.
16 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(ptr nocapture readonly %ptr) {
17 ; CHECK-LABEL: test_vld1q_bf16:
18 ; CHECK: @ %bb.0: @ %entry
19 ; CHECK-NEXT: vld1.16 {d0, d1}, [r0]
22 %0 = load <8 x bfloat>, ptr %ptr, align 2
; Single-lane load: a scalar bfloat load inserted into lane 0 of %src is
; expected to fold into one vld1.16 to d0[0] with a 16-bit alignment hint.
26 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(ptr nocapture readonly %ptr, <4 x bfloat> %src) {
27 ; CHECK-LABEL: test_vld1_lane_bf16:
28 ; CHECK: @ %bb.0: @ %entry
29 ; CHECK-NEXT: vld1.16 {d0[0]}, [r0:16]
32 %0 = load bfloat, ptr %ptr, align 2
33 %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
34 ret <4 x bfloat> %vld1_lane
; Single-lane load into a 128-bit vector: a scalar bfloat load inserted into
; lane 7 of %src is expected to fold into one vld1.16 that targets d1[3].
37 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(ptr nocapture readonly %ptr, <8 x bfloat> %src) {
38 ; CHECK-LABEL: test_vld1q_lane_bf16:
39 ; CHECK: @ %bb.0: @ %entry
40 ; CHECK-NEXT: vld1.16 {d1[3]}, [r0:16]
43 %0 = load bfloat, ptr %ptr, align 2
44 %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
45 ret <8 x bfloat> %vld1_lane
; Load-and-duplicate: a scalar bfloat load splatted across a <4 x bfloat>
; (insertelement into lane 0 followed by a zero-mask shufflevector) is
; expected to fold into one all-lanes vld1.16 to d0[].
48 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(ptr nocapture readonly %ptr) {
49 ; CHECK-LABEL: test_vld1_dup_bf16:
50 ; CHECK: @ %bb.0: @ %entry
51 ; CHECK-NEXT: vld1.16 {d0[]}, [r0:16]
54 %0 = load bfloat, ptr %ptr, align 2
55 %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
56 %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
57 ret <4 x bfloat> %lane
; Two-register contiguous load via @llvm.arm.neon.vld1x2.v4bf16.p0. The two
; <4 x bfloat> results are bitcast to <2 x i32> and returned as a
; [2 x <2 x i32>] aggregate. Expected codegen is a single vld1.16 of d0, d1
; with a :64 alignment hint.
60 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(ptr %ptr) {
61 ; CHECK-LABEL: test_vld1_bf16_x2:
62 ; CHECK: @ %bb.0: @ %entry
63 ; CHECK-NEXT: vld1.16 {d0, d1}, [r0:64]
66 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr %ptr)
67 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
68 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
69 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
70 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
71 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
72 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
73 ret [2 x <2 x i32>] %.fca.1.insert
; Two-register contiguous load of 128-bit vectors via
; @llvm.arm.neon.vld1x2.v8bf16.p0. The two <8 x bfloat> results are bitcast
; to <4 x i32> and returned as a [2 x <4 x i32>] aggregate. Expected codegen
; is a single vld1.16 of d0-d3 with a :256 alignment hint.
76 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(ptr %ptr) {
77 ; CHECK-LABEL: test_vld1q_bf16_x2:
78 ; CHECK: @ %bb.0: @ %entry
79 ; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]
82 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr %ptr)
83 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
84 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
85 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
86 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
87 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
88 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
89 ret [2 x <4 x i32>] %.fca.1.insert
; Three-register contiguous load via @llvm.arm.neon.vld1x3.v4bf16.p0. The
; three <4 x bfloat> results are bitcast to <2 x i32> and returned as a
; [3 x <2 x i32>] aggregate. Expected codegen is a single vld1.16 of d0-d2
; with a :64 alignment hint.
92 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(ptr %ptr) {
93 ; CHECK-LABEL: test_vld1_bf16_x3:
94 ; CHECK: @ %bb.0: @ %entry
95 ; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64]
98 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr %ptr)
99 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
100 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
101 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
102 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
103 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
104 %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
105 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
106 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
107 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
108 ret [3 x <2 x i32>] %.fca.2.insert
; Three-register contiguous load of 128-bit vectors via
; @llvm.arm.neon.vld1x3.v8bf16.p0. The results are bitcast to <4 x i32> and
; returned as a [3 x <4 x i32>] aggregate. Expected codegen is two
; three-register vld1.16 instructions (d0-d2, then d3-d5); the first one
; uses the writeback form ("!") of the r0 address operand.
111 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(ptr %ptr) {
112 ; CHECK-LABEL: test_vld1q_bf16_x3:
113 ; CHECK: @ %bb.0: @ %entry
114 ; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64]!
115 ; CHECK-NEXT: vld1.16 {d3, d4, d5}, [r0:64]
118 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr %ptr)
119 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
120 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
121 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
122 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
123 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
124 %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
125 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
126 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
127 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
128 ret [3 x <4 x i32>] %.fca.2.insert
; Four-register contiguous load via @llvm.arm.neon.vld1x4.v4bf16.p0. The
; four <4 x bfloat> results are bitcast to <2 x i32> and returned as a
; [4 x <2 x i32>] aggregate. Expected codegen is a single vld1.16 of d0-d3
; with a :256 alignment hint.
131 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(ptr %ptr) {
132 ; CHECK-LABEL: test_vld1_bf16_x4:
133 ; CHECK: @ %bb.0: @ %entry
134 ; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]
137 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr %ptr)
138 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
139 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
140 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
141 %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
142 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
143 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
144 %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
145 %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
146 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
147 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
148 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
149 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
150 ret [4 x <2 x i32>] %.fca.3.insert
; Four-register contiguous load of 128-bit vectors via
; @llvm.arm.neon.vld1x4.v8bf16.p0. The results are bitcast to <4 x i32> and
; returned as a [4 x <4 x i32>] aggregate. Expected codegen is two
; four-register vld1.16 instructions (d0-d3, then d4-d7); the first one uses
; the writeback form ("!") of the r0 address operand.
153 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(ptr %ptr) {
154 ; CHECK-LABEL: test_vld1q_bf16_x4:
155 ; CHECK: @ %bb.0: @ %entry
156 ; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]!
157 ; CHECK-NEXT: vld1.16 {d4, d5, d6, d7}, [r0:256]
160 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr %ptr)
161 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
162 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
163 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
164 %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
165 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
166 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
167 %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
168 %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
169 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
170 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
171 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
172 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
173 ret [4 x <4 x i32>] %.fca.3.insert
; Load-and-duplicate into a 128-bit vector: a scalar bfloat load splatted
; across an <8 x bfloat> (insertelement into lane 0 followed by a zero-mask
; shufflevector) is expected to fold into one all-lanes vld1.16 to d0[], d1[].
176 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(ptr nocapture readonly %ptr) {
177 ; CHECK-LABEL: test_vld1q_dup_bf16:
178 ; CHECK: @ %bb.0: @ %entry
179 ; CHECK-NEXT: vld1.16 {d0[], d1[]}, [r0:16]
182 %0 = load bfloat, ptr %ptr, align 2
183 %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
184 %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
185 ret <8 x bfloat> %lane
; Two-element structure load via @llvm.arm.neon.vld2.v4bf16.p0 (trailing
; i32 2 operand is presumably the alignment - confirm against the intrinsic
; definition). The two <4 x bfloat> results are bitcast to <2 x i32> and
; returned as a [2 x <2 x i32>] aggregate. Expected codegen is a single
; vld2.16 of d0, d1.
188 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(ptr %ptr) {
189 ; CHECK-LABEL: test_vld2_bf16:
190 ; CHECK: @ %bb.0: @ %entry
191 ; CHECK-NEXT: vld2.16 {d0, d1}, [r0]
194 %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr %ptr, i32 2)
195 %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
196 %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
197 %0 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
198 %1 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
199 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
200 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
201 ret [2 x <2 x i32>] %.fca.1.insert
; Two-element structure load of 128-bit vectors via
; @llvm.arm.neon.vld2.v8bf16.p0 (trailing i32 2 operand is presumably the
; alignment - confirm against the intrinsic definition). The results are
; bitcast to <4 x i32> and returned as a [2 x <4 x i32>] aggregate. Expected
; codegen is a single vld2.16 of d0-d3.
204 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(ptr %ptr) {
205 ; CHECK-LABEL: test_vld2q_bf16:
206 ; CHECK: @ %bb.0: @ %entry
207 ; CHECK-NEXT: vld2.16 {d0, d1, d2, d3}, [r0]
210 %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr %ptr, i32 2)
211 %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
212 %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
213 %0 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
214 %1 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
215 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
216 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
217 ret [2 x <4 x i32>] %.fca.1.insert
; Two-element single-lane structure load. The incoming [2 x <2 x i32>]
; aggregate is unpacked and bitcast to <4 x bfloat> vectors, passed to
; @llvm.arm.neon.vld2lane.v4bf16.p0 with lane index 1, and the results are
; repacked as [2 x <2 x i32>]. Expected codegen is one vld2.16 to
; d0[1], d1[1], preceded by the register-kill annotations for d1 and d0.
220 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(ptr %ptr, [2 x <2 x i32>] %src.coerce) {
221 ; CHECK-LABEL: test_vld2_lane_bf16:
222 ; CHECK: @ %bb.0: @ %entry
223 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
224 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
225 ; CHECK-NEXT: vld2.16 {d0[1], d1[1]}, [r0]
228 %src.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %src.coerce, 0
229 %src.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %src.coerce, 1
230 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
231 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
232 %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
233 %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
234 %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
235 %2 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
236 %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
237 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %2, 0
238 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %3, 1
239 ret [2 x <2 x i32>] %.fca.1.insert
; Two-element single-lane structure load on 128-bit vectors. The incoming
; [2 x <4 x i32>] aggregate is unpacked and bitcast to <8 x bfloat> vectors,
; passed to @llvm.arm.neon.vld2lane.v8bf16.p0 with lane index 7, and the
; results are repacked as [2 x <4 x i32>]. Expected codegen is one vld2.16
; to d1[3], d3[3], preceded by the register-kill annotations for q1 and q0.
242 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(ptr %ptr, [2 x <4 x i32>] %src.coerce) {
243 ; CHECK-LABEL: test_vld2q_lane_bf16:
244 ; CHECK: @ %bb.0: @ %entry
245 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
246 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
247 ; CHECK-NEXT: vld2.16 {d1[3], d3[3]}, [r0]
250 %src.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %src.coerce, 0
251 %src.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %src.coerce, 1
252 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
253 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
254 %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
255 %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
256 %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
257 %2 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
258 %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
259 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %2, 0
260 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %3, 1
261 ret [2 x <4 x i32>] %.fca.1.insert
; Three-element structure load via @llvm.arm.neon.vld3.v4bf16.p0 (trailing
; i32 2 operand is presumably the alignment - confirm against the intrinsic
; definition). The three <4 x bfloat> results are bitcast to <2 x i32> and
; returned as a [3 x <2 x i32>] aggregate. Expected codegen is a single
; vld3.16 of d0-d2.
264 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(ptr %ptr) {
265 ; CHECK-LABEL: test_vld3_bf16:
266 ; CHECK: @ %bb.0: @ %entry
267 ; CHECK-NEXT: vld3.16 {d0, d1, d2}, [r0]
270 %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr %ptr, i32 2)
271 %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
272 %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
273 %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
274 %0 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
275 %1 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
276 %2 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
277 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
278 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
279 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
280 ret [3 x <2 x i32>] %.fca.2.insert
; Three-element structure load of 128-bit vectors via
; @llvm.arm.neon.vld3.v8bf16.p0. The results are bitcast to <4 x i32> and
; returned as a [3 x <4 x i32>] aggregate. Expected codegen is two vld3.16
; instructions: the even D registers (d0, d2, d4) with address writeback
; ("!"), then the odd D registers (d1, d3, d5).
283 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(ptr %ptr) {
284 ; CHECK-LABEL: test_vld3q_bf16:
285 ; CHECK: @ %bb.0: @ %entry
286 ; CHECK-NEXT: vld3.16 {d0, d2, d4}, [r0]!
287 ; CHECK-NEXT: vld3.16 {d1, d3, d5}, [r0]
290 %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr %ptr, i32 2)
291 %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
292 %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
293 %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
294 %0 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
295 %1 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
296 %2 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
297 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
298 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
299 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
300 ret [3 x <4 x i32>] %.fca.2.insert
; Three-element single-lane structure load. The incoming [3 x <2 x i32>]
; aggregate is unpacked and bitcast to <4 x bfloat> vectors, passed to
; @llvm.arm.neon.vld3lane.v4bf16.p0 with lane index 1, and the results are
; repacked as [3 x <2 x i32>]. Expected codegen is one vld3.16 to
; d0[1], d1[1], d2[1], preceded by register-kill annotations for d2, d1, d0.
303 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(ptr %ptr, [3 x <2 x i32>] %src.coerce) {
304 ; CHECK-LABEL: test_vld3_lane_bf16:
305 ; CHECK: @ %bb.0: @ %entry
306 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
307 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
308 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
309 ; CHECK-NEXT: vld3.16 {d0[1], d1[1], d2[1]}, [r0]
312 %src.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %src.coerce, 0
313 %src.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %src.coerce, 1
314 %src.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %src.coerce, 2
315 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
316 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
317 %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
318 %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
319 %vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 0
320 %vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 1
321 %vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 2
322 %3 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
323 %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
324 %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
325 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %3, 0
326 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %4, 1
327 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %5, 2
328 ret [3 x <2 x i32>] %.fca.2.insert
; Three-element single-lane structure load on 128-bit vectors. The incoming
; [3 x <4 x i32>] aggregate is unpacked and bitcast to <8 x bfloat> vectors,
; passed to @llvm.arm.neon.vld3lane.v8bf16.p0 with lane index 7, and the
; results are repacked as [3 x <4 x i32>]. Expected codegen is one vld3.16
; to d1[3], d3[3], d5[3], preceded by register-kill annotations for q2-q0.
331 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(ptr %ptr, [3 x <4 x i32>] %src.coerce) {
332 ; CHECK-LABEL: test_vld3q_lane_bf16:
333 ; CHECK: @ %bb.0: @ %entry
334 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
335 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
336 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
337 ; CHECK-NEXT: vld3.16 {d1[3], d3[3], d5[3]}, [r0]
340 %src.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %src.coerce, 0
341 %src.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %src.coerce, 1
342 %src.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %src.coerce, 2
343 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
344 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
345 %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
346 %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
347 %vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 0
348 %vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 1
349 %vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 2
350 %3 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
351 %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
352 %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
353 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %3, 0
354 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %4, 1
355 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %5, 2
356 ret [3 x <4 x i32>] %.fca.2.insert
; Four-element structure load via @llvm.arm.neon.vld4.v4bf16.p0 (trailing
; i32 2 operand is presumably the alignment - confirm against the intrinsic
; definition). The four <4 x bfloat> results are bitcast to <2 x i32> and
; returned as a [4 x <2 x i32>] aggregate. Expected codegen is a single
; vld4.16 of d0-d3.
359 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(ptr %ptr) {
360 ; CHECK-LABEL: test_vld4_bf16:
361 ; CHECK: @ %bb.0: @ %entry
362 ; CHECK-NEXT: vld4.16 {d0, d1, d2, d3}, [r0]
365 %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr %ptr, i32 2)
366 %vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 0
367 %vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 1
368 %vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 2
369 %vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 3
370 %0 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
371 %1 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
372 %2 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
373 %3 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
374 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
375 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
376 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
377 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
378 ret [4 x <2 x i32>] %.fca.3.insert
; Four-element structure load of 128-bit vectors via
; @llvm.arm.neon.vld4.v8bf16.p0. The results are bitcast to <4 x i32> and
; returned as a [4 x <4 x i32>] aggregate. Expected codegen is two vld4.16
; instructions: the even D registers (d0, d2, d4, d6) with address writeback
; ("!"), then the odd D registers (d1, d3, d5, d7).
381 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(ptr %ptr) {
382 ; CHECK-LABEL: test_vld4q_bf16:
383 ; CHECK: @ %bb.0: @ %entry
384 ; CHECK-NEXT: vld4.16 {d0, d2, d4, d6}, [r0]!
385 ; CHECK-NEXT: vld4.16 {d1, d3, d5, d7}, [r0]
388 %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr %ptr, i32 2)
389 %vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 0
390 %vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 1
391 %vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 2
392 %vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 3
393 %0 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
394 %1 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
395 %2 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
396 %3 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
397 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
398 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
399 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
400 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
401 ret [4 x <4 x i32>] %.fca.3.insert
; Four-element single-lane structure load. The incoming [4 x <2 x i32>]
; aggregate is unpacked and bitcast to <4 x bfloat> vectors, passed to
; @llvm.arm.neon.vld4lane.v4bf16.p0 with lane index 1, and the results are
; repacked as [4 x <2 x i32>]. Expected codegen is one vld4.16 to
; d0[1]-d3[1], preceded by register-kill annotations for d3 down to d0.
404 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(ptr %ptr, [4 x <2 x i32>] %src.coerce) {
405 ; CHECK-LABEL: test_vld4_lane_bf16:
406 ; CHECK: @ %bb.0: @ %entry
407 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
408 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
409 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
410 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
411 ; CHECK-NEXT: vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
414 %src.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %src.coerce, 0
415 %src.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %src.coerce, 1
416 %src.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %src.coerce, 2
417 %src.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %src.coerce, 3
418 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
419 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
420 %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
421 %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
422 %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
423 %vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 0
424 %vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 1
425 %vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 2
426 %vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 3
427 %4 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
428 %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
429 %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
430 %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
431 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %4, 0
432 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %5, 1
433 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %6, 2
434 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %7, 3
435 ret [4 x <2 x i32>] %.fca.3.insert
; Four-element single-lane structure load on 128-bit vectors. The incoming
; [4 x <4 x i32>] aggregate is unpacked and bitcast to <8 x bfloat> vectors,
; passed to @llvm.arm.neon.vld4lane.v8bf16.p0 with lane index 7, and the
; results are repacked as [4 x <4 x i32>]. Expected codegen is one vld4.16
; to d1[3], d3[3], d5[3], d7[3], preceded by register-kill annotations for
; q3 down to q0.
438 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(ptr %ptr, [4 x <4 x i32>] %src.coerce) {
439 ; CHECK-LABEL: test_vld4q_lane_bf16:
440 ; CHECK: @ %bb.0: @ %entry
441 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
442 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
443 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
444 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
445 ; CHECK-NEXT: vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
448 %src.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %src.coerce, 0
449 %src.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %src.coerce, 1
450 %src.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %src.coerce, 2
451 %src.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %src.coerce, 3
452 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
453 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
454 %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
455 %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
456 %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
457 %vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 0
458 %vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 1
459 %vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 2
460 %vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 3
461 %4 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
462 %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
463 %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
464 %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
465 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %4, 0
466 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %5, 1
467 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %6, 2
468 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %7, 3
469 ret [4 x <4 x i32>] %.fca.3.insert
; Two-element load-and-duplicate via @llvm.arm.neon.vld2dup.v4bf16.p0. The
; two <4 x bfloat> results are bitcast to <2 x i32> and returned as a
; [2 x <2 x i32>] aggregate. Expected codegen is a single all-lanes vld2.16
; to d0[], d1[].
472 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(ptr %ptr) {
473 ; CHECK-LABEL: test_vld2_dup_bf16:
474 ; CHECK: @ %bb.0: @ %entry
475 ; CHECK-NEXT: vld2.16 {d0[], d1[]}, [r0]
478 %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr %ptr, i32 2)
479 %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
480 %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
481 %0 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
482 %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
483 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
484 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
485 ret [2 x <2 x i32>] %.fca.1.insert
; Two-element load-and-duplicate into 128-bit vectors via
; @llvm.arm.neon.vld2dup.v8bf16.p0. The results are bitcast to <4 x i32> and
; returned as a [2 x <4 x i32>] aggregate. Expected codegen is two all-lanes
; vld2.16 instructions from the same [r0] address (no writeback): first the
; even D registers (d0, d2), then the odd D registers (d1, d3).
488 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(ptr %ptr) {
489 ; CHECK-LABEL: test_vld2q_dup_bf16:
490 ; CHECK: @ %bb.0: @ %entry
491 ; CHECK-NEXT: vld2.16 {d0[], d2[]}, [r0]
492 ; CHECK-NEXT: vld2.16 {d1[], d3[]}, [r0]
495 %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr %ptr, i32 2)
496 %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
497 %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
498 %0 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
499 %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
500 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
501 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
502 ret [2 x <4 x i32>] %.fca.1.insert
; Three-element load-and-duplicate via @llvm.arm.neon.vld3dup.v4bf16.p0. The
; three <4 x bfloat> results are bitcast to <2 x i32> and returned as a
; [3 x <2 x i32>] aggregate. Expected codegen is a single all-lanes vld3.16
; to d0[], d1[], d2[].
505 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(ptr %ptr) {
506 ; CHECK-LABEL: test_vld3_dup_bf16:
507 ; CHECK: @ %bb.0: @ %entry
508 ; CHECK-NEXT: vld3.16 {d0[], d1[], d2[]}, [r0]
511 %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr %ptr, i32 2)
512 %vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 0
513 %vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 1
514 %vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 2
515 %0 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
516 %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
517 %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
518 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
519 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
520 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
521 ret [3 x <2 x i32>] %.fca.2.insert
; Three-element load-and-duplicate into 128-bit vectors via
; @llvm.arm.neon.vld3dup.v8bf16.p0. The results are bitcast to <4 x i32> and
; returned as a [3 x <4 x i32>] aggregate. Expected codegen is two all-lanes
; vld3.16 instructions from the same [r0] address (no writeback): first the
; even D registers (d0, d2, d4), then the odd D registers (d1, d3, d5).
524 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(ptr %ptr) {
525 ; CHECK-LABEL: test_vld3q_dup_bf16:
526 ; CHECK: @ %bb.0: @ %entry
527 ; CHECK-NEXT: vld3.16 {d0[], d2[], d4[]}, [r0]
528 ; CHECK-NEXT: vld3.16 {d1[], d3[], d5[]}, [r0]
531 %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr %ptr, i32 2)
532 %vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 0
533 %vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 1
534 %vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 2
535 %0 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
536 %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
537 %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
538 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
539 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
540 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
541 ret [3 x <4 x i32>] %.fca.2.insert
; Four-element load-and-duplicate via @llvm.arm.neon.vld4dup.v4bf16.p0. The
; four <4 x bfloat> results are bitcast to <2 x i32> and returned as a
; [4 x <2 x i32>] aggregate. Expected codegen is a single all-lanes vld4.16
; to d0[], d1[], d2[], d3[].
544 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(ptr %ptr) {
545 ; CHECK-LABEL: test_vld4_dup_bf16:
546 ; CHECK: @ %bb.0: @ %entry
547 ; CHECK-NEXT: vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
550 %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr %ptr, i32 2)
551 %vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 0
552 %vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 1
553 %vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 2
554 %vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 3
555 %0 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
556 %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
557 %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
558 %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
559 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
560 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
561 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
562 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
563 ret [4 x <2 x i32>] %.fca.3.insert
; Calls llvm.arm.neon.vld4dup.v8bf16 on %ptr, bitcasts each of the four
; <8 x bfloat> results to <4 x i32> and returns them packed in a 4-entry array.
; Expected selection is two all-lanes vld4.16 instructions from the same
; address: one filling d0/d2/d4/d6 and one filling d1/d3/d5/d7.
; The result aggregate is seeded with poison instead of the deprecated undef;
; all four elements are overwritten by the insertvalue chain below, so the seed
; never reaches the returned value.
566 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(ptr %ptr) {
567 ; CHECK-LABEL: test_vld4q_dup_bf16:
568 ; CHECK: @ %bb.0: @ %entry
569 ; CHECK-NEXT: vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
570 ; CHECK-NEXT: vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
573 %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr %ptr, i32 2)
574 %vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 0
575 %vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 1
576 %vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 2
577 %vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 3
578 %0 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
579 %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
580 %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
581 %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
582 %.fca.0.insert = insertvalue [4 x <4 x i32>] poison, <4 x i32> %0, 0
583 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
584 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
585 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
586 ret [4 x <4 x i32>] %.fca.3.insert
; Stores a <4 x bfloat> argument through llvm.arm.neon.vst1.
; Expected selection is a single vst1.16 of d0 to [r0] with no alignment
; qualifier (the trailing i32 2 is presumably the alignment operand - confirm
; against the intrinsic definition).
589 define arm_aapcs_vfpcc void @test_vst1_bf16(ptr %ptr, <4 x bfloat> %val) {
590 ; CHECK-LABEL: test_vst1_bf16:
591 ; CHECK: @ %bb.0: @ %entry
592 ; CHECK-NEXT: vst1.16 {d0}, [r0]
595 tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr %ptr, <4 x bfloat> %val, i32 2)
; Stores an <8 x bfloat> argument through llvm.arm.neon.vst1.
; Expected selection is a single vst1.16 of the d0/d1 pair to [r0].
599 define arm_aapcs_vfpcc void @test_vst1q_bf16(ptr %ptr, <8 x bfloat> %val) {
600 ; CHECK-LABEL: test_vst1q_bf16:
601 ; CHECK: @ %bb.0: @ %entry
602 ; CHECK-NEXT: vst1.16 {d0, d1}, [r0]
605 tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr %ptr, <8 x bfloat> %val, i32 2)
; Single-lane store written as plain IR: extracts element 1 of the
; <4 x bfloat> argument and stores it as a scalar bfloat with align 2.
; The expected output is a vmovx.f16 on s0 followed by a scalar vstr.16,
; not a NEON lane store.
609 define arm_aapcs_vfpcc void @test_vst1_lane_bf16(ptr nocapture %ptr, <4 x bfloat> %val) {
610 ; CHECK-LABEL: test_vst1_lane_bf16:
611 ; CHECK: @ %bb.0: @ %entry
612 ; CHECK-NEXT: vmovx.f16 s0, s0
613 ; CHECK-NEXT: vstr.16 s0, [r0]
616 %0 = extractelement <4 x bfloat> %val, i32 1
617 store bfloat %0, ptr %ptr, align 2
; Single-lane store written as plain IR: extracts element 7 of the
; <8 x bfloat> argument and stores it as a scalar bfloat with align 2.
; The expected output is a vmovx.f16 from s3 into s0 followed by a scalar
; vstr.16, not a NEON lane store.
621 define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(ptr nocapture %ptr, <8 x bfloat> %val) {
622 ; CHECK-LABEL: test_vst1q_lane_bf16:
623 ; CHECK: @ %bb.0: @ %entry
624 ; CHECK-NEXT: vmovx.f16 s0, s3
625 ; CHECK-NEXT: vstr.16 s0, [r0]
628 %0 = extractelement <8 x bfloat> %val, i32 7
629 store bfloat %0, ptr %ptr, align 2
; Splits the [2 x <2 x i32>] argument into its two elements, bitcasts each to
; <4 x bfloat> and stores both with llvm.arm.neon.vst1x2.
; Expected selection is one vst1.16 of d0-d1 with a :64 address qualifier.
; The "@ kill" lines are part of the expected assembly text and are matched
; verbatim.
633 define arm_aapcs_vfpcc void @test_vst1_bf16_x2(ptr nocapture %ptr, [2 x <2 x i32>] %val.coerce) {
634 ; CHECK-LABEL: test_vst1_bf16_x2:
635 ; CHECK: @ %bb.0: @ %entry
636 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
637 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
638 ; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64]
641 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
642 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
643 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
644 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
645 tail call void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1)
; Splits the [2 x <4 x i32>] argument into its two elements, bitcasts each to
; <8 x bfloat> and stores both with llvm.arm.neon.vst1x2.
; Expected selection is one vst1.16 of d0-d3 with a :256 address qualifier.
649 define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(ptr nocapture %ptr, [2 x <4 x i32>] %val.coerce) {
650 ; CHECK-LABEL: test_vst1q_bf16_x2:
651 ; CHECK: @ %bb.0: @ %entry
652 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
653 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
654 ; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]
657 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
658 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
659 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
660 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
661 tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1)
; Splits the [3 x <2 x i32>] argument into its three elements, bitcasts each
; to <4 x bfloat> and stores them with llvm.arm.neon.vst1x3.
; Expected selection is one vst1.16 of d0-d2 with a :64 address qualifier.
665 define arm_aapcs_vfpcc void @test_vst1_bf16_x3(ptr nocapture %ptr, [3 x <2 x i32>] %val.coerce) {
666 ; CHECK-LABEL: test_vst1_bf16_x3:
667 ; CHECK: @ %bb.0: @ %entry
668 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
669 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
670 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
671 ; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]
674 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
675 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
676 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
677 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
678 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
679 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
680 tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)
; Splits the [3 x <4 x i32>] argument into its three elements, bitcasts each
; to <8 x bfloat> and stores them with llvm.arm.neon.vst1x3.
; Expected selection is two vst1.16 instructions: d0-d2 with post-increment
; writeback on r0 ("!"), then d3-d5, both with a :64 address qualifier.
684 define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(ptr nocapture %ptr, [3 x <4 x i32>] %val.coerce) {
685 ; CHECK-LABEL: test_vst1q_bf16_x3:
686 ; CHECK: @ %bb.0: @ %entry
687 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
688 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
689 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
690 ; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]!
691 ; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64]
694 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
695 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
696 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
697 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
698 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
699 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
700 tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)
; Splits the [4 x <2 x i32>] argument into its four elements, bitcasts each to
; <4 x bfloat> and stores them with llvm.arm.neon.vst1x4.
; Expected selection is one vst1.16 of d0-d3 with a :256 address qualifier.
704 define arm_aapcs_vfpcc void @test_vst1_bf16_x4(ptr nocapture %ptr, [4 x <2 x i32>] %val.coerce) {
705 ; CHECK-LABEL: test_vst1_bf16_x4:
706 ; CHECK: @ %bb.0: @ %entry
707 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
708 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
709 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
710 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
711 ; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]
714 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
715 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
716 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
717 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
718 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
719 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
720 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
721 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
722 tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)
; Splits the [4 x <4 x i32>] argument into its four elements, bitcasts each to
; <8 x bfloat> and stores them with llvm.arm.neon.vst1x4.
; Expected selection is two vst1.16 instructions: d0-d3 with post-increment
; writeback on r0 ("!"), then d4-d7, both with a :256 address qualifier.
726 define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(ptr nocapture %ptr, [4 x <4 x i32>] %val.coerce) {
727 ; CHECK-LABEL: test_vst1q_bf16_x4:
728 ; CHECK: @ %bb.0: @ %entry
729 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
730 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
731 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
732 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
733 ; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]!
734 ; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256]
737 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
738 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
739 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
740 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
741 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
742 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
743 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
744 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
745 tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)
; Splits the [2 x <2 x i32>] argument into its two elements, bitcasts each to
; <4 x bfloat> and stores them with llvm.arm.neon.vst2.
; Expected selection is one vst2.16 of d0-d1 to [r0] with no alignment
; qualifier.
749 define arm_aapcs_vfpcc void @test_vst2_bf16(ptr %ptr, [2 x <2 x i32>] %val.coerce) {
750 ; CHECK-LABEL: test_vst2_bf16:
751 ; CHECK: @ %bb.0: @ %entry
752 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
753 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
754 ; CHECK-NEXT: vst2.16 {d0, d1}, [r0]
757 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
758 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
759 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
760 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
761 tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)
; Splits the [2 x <4 x i32>] argument into its two elements, bitcasts each to
; <8 x bfloat> and stores them with llvm.arm.neon.vst2.
; Expected selection is one vst2.16 of d0-d3 to [r0].
765 define arm_aapcs_vfpcc void @test_vst2q_bf16(ptr %ptr, [2 x <4 x i32>] %val.coerce) {
766 ; CHECK-LABEL: test_vst2q_bf16:
767 ; CHECK: @ %bb.0: @ %entry
768 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
769 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
770 ; CHECK-NEXT: vst2.16 {d0, d1, d2, d3}, [r0]
773 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
774 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
775 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
776 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
777 tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)
; Splits the [2 x <2 x i32>] argument into two <4 x bfloat> vectors and calls
; llvm.arm.neon.vst2lane with i32 1 as the first trailing operand.
; Expected selection is one vst2.16 storing element [1] of d0 and d1.
781 define arm_aapcs_vfpcc void @test_vst2_lane_bf16(ptr %ptr, [2 x <2 x i32>] %val.coerce) {
782 ; CHECK-LABEL: test_vst2_lane_bf16:
783 ; CHECK: @ %bb.0: @ %entry
784 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
785 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
786 ; CHECK-NEXT: vst2.16 {d0[1], d1[1]}, [r0]
789 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
790 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
791 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
792 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
793 tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
; Splits the [2 x <4 x i32>] argument into two <8 x bfloat> vectors and calls
; llvm.arm.neon.vst2lane with i32 7 as the first trailing operand.
; Expected selection is one vst2.16 storing element [3] of d1 and d3, i.e.
; element 7 of each 8-element vector lands in the odd D register.
797 define arm_aapcs_vfpcc void @test_vst2q_lane_bf16(ptr %ptr, [2 x <4 x i32>] %val.coerce) {
798 ; CHECK-LABEL: test_vst2q_lane_bf16:
799 ; CHECK: @ %bb.0: @ %entry
800 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
801 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
802 ; CHECK-NEXT: vst2.16 {d1[3], d3[3]}, [r0]
805 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
806 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
807 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
808 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
809 tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
; Splits the [3 x <2 x i32>] argument into its three elements, bitcasts each
; to <4 x bfloat> and stores them with llvm.arm.neon.vst3.
; Expected selection is one vst3.16 of d0-d2 to [r0].
813 define arm_aapcs_vfpcc void @test_vst3_bf16(ptr %ptr, [3 x <2 x i32>] %val.coerce) {
814 ; CHECK-LABEL: test_vst3_bf16:
815 ; CHECK: @ %bb.0: @ %entry
816 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
817 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
818 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
819 ; CHECK-NEXT: vst3.16 {d0, d1, d2}, [r0]
822 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
823 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
824 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
825 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
826 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
827 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
828 tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)
; Splits the [3 x <4 x i32>] argument into its three elements, bitcasts each
; to <8 x bfloat> and stores them with llvm.arm.neon.vst3.
; Expected selection is two vst3.16 instructions: the even registers
; d0/d2/d4 with post-increment writeback on r0 ("!"), then the odd registers
; d1/d3/d5.
832 define arm_aapcs_vfpcc void @test_vst3q_bf16(ptr %ptr, [3 x <4 x i32>] %val.coerce) {
833 ; CHECK-LABEL: test_vst3q_bf16:
834 ; CHECK: @ %bb.0: @ %entry
835 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
836 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
837 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
838 ; CHECK-NEXT: vst3.16 {d0, d2, d4}, [r0]!
839 ; CHECK-NEXT: vst3.16 {d1, d3, d5}, [r0]
842 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
843 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
844 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
845 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
846 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
847 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
848 tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)
; Splits the [3 x <2 x i32>] argument into three <4 x bfloat> vectors and
; calls llvm.arm.neon.vst3lane with i32 1 as the first trailing operand.
; Expected selection is one vst3.16 storing element [1] of d0, d1 and d2.
852 define arm_aapcs_vfpcc void @test_vst3_lane_bf16(ptr %ptr, [3 x <2 x i32>] %val.coerce) {
853 ; CHECK-LABEL: test_vst3_lane_bf16:
854 ; CHECK: @ %bb.0: @ %entry
855 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
856 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
857 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
858 ; CHECK-NEXT: vst3.16 {d0[1], d1[1], d2[1]}, [r0]
861 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
862 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
863 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
864 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
865 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
866 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
867 tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
; Splits the [3 x <4 x i32>] argument into three <8 x bfloat> vectors and
; calls llvm.arm.neon.vst3lane with i32 7 as the first trailing operand.
; Expected selection is one vst3.16 storing element [3] of the odd registers
; d1, d3 and d5.
871 define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(ptr %ptr, [3 x <4 x i32>] %val.coerce) {
872 ; CHECK-LABEL: test_vst3q_lane_bf16:
873 ; CHECK: @ %bb.0: @ %entry
874 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
875 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
876 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
877 ; CHECK-NEXT: vst3.16 {d1[3], d3[3], d5[3]}, [r0]
880 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
881 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
882 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
883 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
884 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
885 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
886 tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
; Splits the [4 x <2 x i32>] argument into its four elements, bitcasts each to
; <4 x bfloat> and stores them with llvm.arm.neon.vst4.
; Expected selection is one vst4.16 of d0-d3 to [r0].
890 define arm_aapcs_vfpcc void @test_vst4_bf16(ptr %ptr, [4 x <2 x i32>] %val.coerce) {
891 ; CHECK-LABEL: test_vst4_bf16:
892 ; CHECK: @ %bb.0: @ %entry
893 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
894 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
895 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
896 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
897 ; CHECK-NEXT: vst4.16 {d0, d1, d2, d3}, [r0]
900 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
901 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
902 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
903 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
904 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
905 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
906 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
907 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
908 tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)
; Splits the [4 x <4 x i32>] argument into its four elements, bitcasts each to
; <8 x bfloat> and stores them with llvm.arm.neon.vst4.
; Expected selection is two vst4.16 instructions: the even registers
; d0/d2/d4/d6 with post-increment writeback on r0 ("!"), then the odd
; registers d1/d3/d5/d7.
912 define arm_aapcs_vfpcc void @test_vst4q_bf16(ptr %ptr, [4 x <4 x i32>] %val.coerce) {
913 ; CHECK-LABEL: test_vst4q_bf16:
914 ; CHECK: @ %bb.0: @ %entry
915 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
916 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
917 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
918 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
919 ; CHECK-NEXT: vst4.16 {d0, d2, d4, d6}, [r0]!
920 ; CHECK-NEXT: vst4.16 {d1, d3, d5, d7}, [r0]
923 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
924 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
925 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
926 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
927 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
928 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
929 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
930 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
931 tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)
; Splits the [4 x <2 x i32>] argument into four <4 x bfloat> vectors and calls
; llvm.arm.neon.vst4lane with i32 1 as the first trailing operand.
; Expected selection is one vst4.16 storing element [1] of d0, d1, d2 and d3.
935 define arm_aapcs_vfpcc void @test_vst4_lane_bf16(ptr %ptr, [4 x <2 x i32>] %val.coerce) {
936 ; CHECK-LABEL: test_vst4_lane_bf16:
937 ; CHECK: @ %bb.0: @ %entry
938 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
939 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
940 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
941 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
942 ; CHECK-NEXT: vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
945 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
946 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
947 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
948 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
949 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
950 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
951 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
952 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
953 tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
; Splits the [4 x <4 x i32>] argument into four <8 x bfloat> vectors and calls
; llvm.arm.neon.vst4lane with i32 7 as the first trailing operand.
; Expected selection is one vst4.16 storing element [3] of the odd registers
; d1, d3, d5 and d7.
957 define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(ptr %ptr, [4 x <4 x i32>] %val.coerce) {
958 ; CHECK-LABEL: test_vst4q_lane_bf16:
959 ; CHECK: @ %bb.0: @ %entry
960 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
961 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
962 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
963 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
964 ; CHECK-NEXT: vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
967 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
968 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
969 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
970 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
971 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
972 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
973 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
974 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
975 tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
; Declarations of the llvm.arm.neon load/store intrinsics, each in a
; <4 x bfloat> (v4bf16) and an <8 x bfloat> (v8bf16) flavour.

; Structured loads vld2/vld3/vld4: pointer plus one trailing i32 operand.
979 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr, i32)
980 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr, i32)
981 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr, i32)
982 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr, i32)
983 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr, i32)
984 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr, i32)
; Load-and-duplicate variants vld2dup/vld3dup/vld4dup: same operand shape.
986 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr, i32)
987 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr, i32)
988 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr, i32)
989 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr, i32)
990 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr, i32)
991 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr, i32)
; Multi-register vld1 (x2/x3/x4): pointer only, no trailing i32 operand.
993 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr)
994 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr)
995 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr)
996 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr)
997 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr)
998 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr)
; Lane loads vld2lane/vld3lane/vld4lane: pointer, the incoming vectors, then
; two trailing i32 operands.
1000 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, i32, i32)
1001 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, i32, i32)
1002 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1003 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1004 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1005 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
; Stores vst1-vst4: pointer, the vectors to store, then one trailing i32.
1007 declare void @llvm.arm.neon.vst1.p0.v4bf16(ptr, <4 x bfloat>, i32)
1008 declare void @llvm.arm.neon.vst1.p0.v8bf16(ptr, <8 x bfloat>, i32)
1009 declare void @llvm.arm.neon.vst2.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, i32)
1010 declare void @llvm.arm.neon.vst2.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, i32)
1011 declare void @llvm.arm.neon.vst3.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1012 declare void @llvm.arm.neon.vst3.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1013 declare void @llvm.arm.neon.vst4.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1014 declare void @llvm.arm.neon.vst4.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
; Multi-register vst1 (x2/x3/x4): nocapture pointer and the vectors only, no
; trailing i32 operand.
1016 declare void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>)
1017 declare void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>)
1018 declare void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1019 declare void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
1020 declare void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1021 declare void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
; Lane stores vst2lane/vst3lane/vst4lane: pointer, the vectors, then two
; trailing i32 operands.
1023 declare void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, i32, i32)
1024 declare void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, i32, i32)
1025 declare void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1026 declare void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1027 declare void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1028 declare void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)