1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
3 ; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
; Whole-vector load of <4 x bfloat>: %ptr is bitcast to a vector pointer and
; loaded with 2-byte alignment. The assertions expect a single vld1.16 into d0
; with no alignment qualifier on the address.
6 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) {
7 ; CHECK-LABEL: test_vld1_bf16:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: vld1.16 {d0}, [r0]
12 %0 = bitcast bfloat* %ptr to <4 x bfloat>*
13 %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
; Whole-vector load of <8 x bfloat>: %ptr is bitcast to a vector pointer and
; loaded with 2-byte alignment. The assertions expect a single vld1.16 that
; fills the d0/d1 register pair.
17 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) {
18 ; CHECK-LABEL: test_vld1q_bf16:
19 ; CHECK: @ %bb.0: @ %entry
20 ; CHECK-NEXT: vld1.16 {d0, d1}, [r0]
23 %0 = bitcast bfloat* %ptr to <8 x bfloat>*
24 %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
; Single-lane load into a <4 x bfloat>: one bfloat is loaded from %ptr and
; inserted into element 0 of %src. The assertions expect this to be selected as
; one lane-indexed vld1.16 on d0[0] with a :16 address alignment.
28 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) {
29 ; CHECK-LABEL: test_vld1_lane_bf16:
30 ; CHECK: @ %bb.0: @ %entry
31 ; CHECK-NEXT: vld1.16 {d0[0]}, [r0:16]
34 %0 = load bfloat, bfloat* %ptr, align 2
35 %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
36 ret <4 x bfloat> %vld1_lane
; Single-lane load into an <8 x bfloat>: one bfloat is loaded from %ptr and
; inserted into element 7 of %src. The assertions expect one lane-indexed
; vld1.16 on d1[3], i.e. element 7 of the vector held in d0/d1.
39 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) {
40 ; CHECK-LABEL: test_vld1q_lane_bf16:
41 ; CHECK: @ %bb.0: @ %entry
42 ; CHECK-NEXT: vld1.16 {d1[3]}, [r0:16]
45 %0 = load bfloat, bfloat* %ptr, align 2
46 %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
47 ret <8 x bfloat> %vld1_lane
; Load-and-duplicate into a <4 x bfloat>: a scalar bfloat is loaded, inserted
; into element 0 of an undef vector, then splatted with a zeroinitializer
; shuffle mask. The assertions expect the all-lanes form vld1.16 {d0[]}.
50 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) {
51 ; CHECK-LABEL: test_vld1_dup_bf16:
52 ; CHECK: @ %bb.0: @ %entry
53 ; CHECK-NEXT: vld1.16 {d0[]}, [r0:16]
56 %0 = load bfloat, bfloat* %ptr, align 2
57 %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
58 %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
59 ret <4 x bfloat> %lane
; Two-vector vld1 of <4 x bfloat>: calls llvm.arm.neon.vld1x2 on %ptr, bitcasts
; each result to <2 x i32> and packs them into a [2 x <2 x i32>] return value.
; The assertions expect one vld1.16 filling d0 and d1.
62 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(bfloat* %ptr) {
63 ; CHECK-LABEL: test_vld1_bf16_x2:
64 ; CHECK: @ %bb.0: @ %entry
65 ; CHECK-NEXT: vld1.16 {d0, d1}, [r0:64]
68 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)
69 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
70 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
71 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
72 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
73 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
74 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
75 ret [2 x <2 x i32>] %.fca.1.insert
; Two-vector vld1 of <8 x bfloat>: calls llvm.arm.neon.vld1x2 on %ptr, bitcasts
; each result to <4 x i32> and packs them into a [2 x <4 x i32>] return value.
; The assertions expect one vld1.16 filling d0 through d3.
78 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(bfloat* %ptr) {
79 ; CHECK-LABEL: test_vld1q_bf16_x2:
80 ; CHECK: @ %bb.0: @ %entry
81 ; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]
84 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)
85 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
86 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
87 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
88 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
89 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
90 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
91 ret [2 x <4 x i32>] %.fca.1.insert
; Three-vector vld1 of <4 x bfloat>: calls llvm.arm.neon.vld1x3 on %ptr,
; bitcasts each result to <2 x i32> and packs them into a [3 x <2 x i32>]
; return value. The assertions expect one vld1.16 filling d0, d1 and d2.
94 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(bfloat* %ptr) {
95 ; CHECK-LABEL: test_vld1_bf16_x3:
96 ; CHECK: @ %bb.0: @ %entry
97 ; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64]
100 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)
101 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
102 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
103 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
104 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
105 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
106 %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
107 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
108 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
109 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
110 ret [3 x <2 x i32>] %.fca.2.insert
; Three-vector vld1 of <8 x bfloat>: calls llvm.arm.neon.vld1x3 on %ptr,
; bitcasts each result to <4 x i32> and packs them into a [3 x <4 x i32>]
; return value. The assertions expect two three-register vld1.16 loads
; (d0-d2, then d3-d5); the first address carries the '!' marker.
113 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(bfloat* %ptr) {
114 ; CHECK-LABEL: test_vld1q_bf16_x3:
115 ; CHECK: @ %bb.0: @ %entry
116 ; CHECK-NEXT: vld1.16 {d0, d1, d2}, [r0:64]!
117 ; CHECK-NEXT: vld1.16 {d3, d4, d5}, [r0:64]
120 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)
121 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
122 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
123 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
124 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
125 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
126 %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
127 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
128 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
129 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
130 ret [3 x <4 x i32>] %.fca.2.insert
; Four-vector vld1 of <4 x bfloat>: calls llvm.arm.neon.vld1x4 on %ptr,
; bitcasts each result to <2 x i32> and packs them into a [4 x <2 x i32>]
; return value. The assertions expect one vld1.16 filling d0 through d3.
133 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(bfloat* %ptr) {
134 ; CHECK-LABEL: test_vld1_bf16_x4:
135 ; CHECK: @ %bb.0: @ %entry
136 ; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]
139 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)
140 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
141 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
142 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
143 %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
144 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
145 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
146 %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
147 %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
148 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
149 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
150 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
151 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
152 ret [4 x <2 x i32>] %.fca.3.insert
; Four-vector vld1 of <8 x bfloat>: calls llvm.arm.neon.vld1x4 on %ptr,
; bitcasts each result to <4 x i32> and packs them into a [4 x <4 x i32>]
; return value. The assertions expect two four-register vld1.16 loads
; (d0-d3, then d4-d7); the first address carries the '!' marker.
155 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(bfloat* %ptr) {
156 ; CHECK-LABEL: test_vld1q_bf16_x4:
157 ; CHECK: @ %bb.0: @ %entry
158 ; CHECK-NEXT: vld1.16 {d0, d1, d2, d3}, [r0:256]!
159 ; CHECK-NEXT: vld1.16 {d4, d5, d6, d7}, [r0:256]
162 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)
163 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
164 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
165 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
166 %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
167 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
168 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
169 %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
170 %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
171 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
172 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
173 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
174 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
175 ret [4 x <4 x i32>] %.fca.3.insert
; Load-and-duplicate into an <8 x bfloat>: a scalar bfloat is loaded, inserted
; into element 0 of an undef vector, then splatted with a zeroinitializer
; shuffle mask. The assertions expect the all-lanes form on both d0 and d1.
178 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) {
179 ; CHECK-LABEL: test_vld1q_dup_bf16:
180 ; CHECK: @ %bb.0: @ %entry
181 ; CHECK-NEXT: vld1.16 {d0[], d1[]}, [r0:16]
184 %0 = load bfloat, bfloat* %ptr, align 2
185 %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
186 %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
187 ret <8 x bfloat> %lane
; vld2 of <4 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld2 (the trailing i32 2 is presumably the alignment operand -
; confirm against the intrinsic definition). Both results are bitcast to
; <2 x i32> and packed into a [2 x <2 x i32>] return value. The assertions
; expect one vld2.16 filling d0 and d1.
190 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(bfloat* %ptr) {
191 ; CHECK-LABEL: test_vld2_bf16:
192 ; CHECK: @ %bb.0: @ %entry
193 ; CHECK-NEXT: vld2.16 {d0, d1}, [r0]
196 %0 = bitcast bfloat* %ptr to i8*
197 %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)
198 %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
199 %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
200 %1 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
201 %2 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
202 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %1, 0
203 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
204 ret [2 x <2 x i32>] %.fca.1.insert
; vld2 of <8 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld2 (the trailing i32 2 is presumably the alignment operand -
; confirm against the intrinsic definition). Both results are bitcast to
; <4 x i32> and packed into a [2 x <4 x i32>] return value. The assertions
; expect one vld2.16 filling d0 through d3.
207 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(bfloat* %ptr) {
208 ; CHECK-LABEL: test_vld2q_bf16:
209 ; CHECK: @ %bb.0: @ %entry
210 ; CHECK-NEXT: vld2.16 {d0, d1, d2, d3}, [r0]
213 %0 = bitcast bfloat* %ptr to i8*
214 %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)
215 %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
216 %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
217 %1 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
218 %2 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
219 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %1, 0
220 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
221 ret [2 x <4 x i32>] %.fca.1.insert
; Lane-wise vld2 on <4 x bfloat>: the two <2 x i32> members of %src.coerce are
; bitcast to <4 x bfloat> and passed, with the i8* form of %ptr, to
; llvm.arm.neon.vld2lane; the i32 1 operand matches the [1] lane index in the
; expected instruction. Results are bitcast back to <2 x i32> and repacked.
; The expected output also lists the '@ kill:' register annotations that
; precede the load.
224 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %src.coerce) {
225 ; CHECK-LABEL: test_vld2_lane_bf16:
226 ; CHECK: @ %bb.0: @ %entry
227 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
228 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
229 ; CHECK-NEXT: vld2.16 {d0[1], d1[1]}, [r0]
232 %src.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %src.coerce, 0
233 %src.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %src.coerce, 1
234 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
235 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
236 %2 = bitcast bfloat* %ptr to i8*
237 %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
238 %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
239 %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
240 %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
241 %4 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
242 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %3, 0
243 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %4, 1
244 ret [2 x <2 x i32>] %.fca.1.insert
; Lane-wise vld2 on <8 x bfloat>: the two <4 x i32> members of %src.coerce are
; bitcast to <8 x bfloat> and passed, with the i8* form of %ptr, to
; llvm.arm.neon.vld2lane with lane operand i32 7. The assertions expect that
; lane to appear as index [3] of the odd registers d1 and d3. Results are
; bitcast back to <4 x i32> and repacked.
247 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %src.coerce) {
248 ; CHECK-LABEL: test_vld2q_lane_bf16:
249 ; CHECK: @ %bb.0: @ %entry
250 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
251 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
252 ; CHECK-NEXT: vld2.16 {d1[3], d3[3]}, [r0]
255 %src.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %src.coerce, 0
256 %src.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %src.coerce, 1
257 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
258 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
259 %2 = bitcast bfloat* %ptr to i8*
260 %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
261 %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
262 %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
263 %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
264 %4 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
265 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %3, 0
266 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %4, 1
267 ret [2 x <4 x i32>] %.fca.1.insert
; vld3 of <4 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld3; the three results are bitcast to <2 x i32> and packed
; into a [3 x <2 x i32>] return value. The assertions expect one vld3.16
; filling d0, d1 and d2.
270 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(bfloat* %ptr) {
271 ; CHECK-LABEL: test_vld3_bf16:
272 ; CHECK: @ %bb.0: @ %entry
273 ; CHECK-NEXT: vld3.16 {d0, d1, d2}, [r0]
276 %0 = bitcast bfloat* %ptr to i8*
277 %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)
278 %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
279 %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
280 %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
281 %1 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
282 %2 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
283 %3 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
284 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %1, 0
285 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
286 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
287 ret [3 x <2 x i32>] %.fca.2.insert
; vld3 of <8 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld3; the three results are bitcast to <4 x i32> and packed
; into a [3 x <4 x i32>] return value. The assertions expect two vld3.16
; loads: the even registers d0/d2/d4 first (address marked with '!'), then the
; odd registers d1/d3/d5.
290 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(bfloat* %ptr) {
291 ; CHECK-LABEL: test_vld3q_bf16:
292 ; CHECK: @ %bb.0: @ %entry
293 ; CHECK-NEXT: vld3.16 {d0, d2, d4}, [r0]!
294 ; CHECK-NEXT: vld3.16 {d1, d3, d5}, [r0]
297 %0 = bitcast bfloat* %ptr to i8*
298 %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)
299 %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
300 %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
301 %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
302 %1 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
303 %2 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
304 %3 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
305 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %1, 0
306 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
307 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
308 ret [3 x <4 x i32>] %.fca.2.insert
; Lane-wise vld3 on <4 x bfloat>: the three <2 x i32> members of %src.coerce
; are bitcast to <4 x bfloat> and passed, with the i8* form of %ptr, to
; llvm.arm.neon.vld3lane; the i32 1 operand matches the [1] lane index in the
; expected instruction. Results are bitcast back to <2 x i32> and repacked.
; The expected output also lists the '@ kill:' register annotations that
; precede the load.
311 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %src.coerce) {
312 ; CHECK-LABEL: test_vld3_lane_bf16:
313 ; CHECK: @ %bb.0: @ %entry
314 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
315 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
316 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
317 ; CHECK-NEXT: vld3.16 {d0[1], d1[1], d2[1]}, [r0]
320 %src.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %src.coerce, 0
321 %src.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %src.coerce, 1
322 %src.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %src.coerce, 2
323 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
324 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
325 %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
326 %3 = bitcast bfloat* %ptr to i8*
327 %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
328 %vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 0
329 %vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 1
330 %vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 2
331 %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
332 %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
333 %6 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
334 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %4, 0
335 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %5, 1
336 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %6, 2
337 ret [3 x <2 x i32>] %.fca.2.insert
; Lane-wise vld3 on <8 x bfloat>: the three <4 x i32> members of %src.coerce
; are bitcast to <8 x bfloat> and passed, with the i8* form of %ptr, to
; llvm.arm.neon.vld3lane with lane operand i32 7. The assertions expect that
; lane to appear as index [3] of the odd registers d1, d3 and d5. Results are
; bitcast back to <4 x i32> and repacked.
340 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %src.coerce) {
341 ; CHECK-LABEL: test_vld3q_lane_bf16:
342 ; CHECK: @ %bb.0: @ %entry
343 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
344 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
345 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
346 ; CHECK-NEXT: vld3.16 {d1[3], d3[3], d5[3]}, [r0]
349 %src.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %src.coerce, 0
350 %src.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %src.coerce, 1
351 %src.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %src.coerce, 2
352 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
353 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
354 %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
355 %3 = bitcast bfloat* %ptr to i8*
356 %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
357 %vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 0
358 %vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 1
359 %vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 2
360 %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
361 %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
362 %6 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
363 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %4, 0
364 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %5, 1
365 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %6, 2
366 ret [3 x <4 x i32>] %.fca.2.insert
; vld4 of <4 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld4; the four results are bitcast to <2 x i32> and packed
; into a [4 x <2 x i32>] return value. The assertions expect one vld4.16
; filling d0 through d3.
369 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(bfloat* %ptr) {
370 ; CHECK-LABEL: test_vld4_bf16:
371 ; CHECK: @ %bb.0: @ %entry
372 ; CHECK-NEXT: vld4.16 {d0, d1, d2, d3}, [r0]
375 %0 = bitcast bfloat* %ptr to i8*
376 %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)
377 %vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 0
378 %vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 1
379 %vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 2
380 %vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 3
381 %1 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
382 %2 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
383 %3 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
384 %4 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
385 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %1, 0
386 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
387 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
388 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %4, 3
389 ret [4 x <2 x i32>] %.fca.3.insert
; vld4 of <8 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld4; the four results are bitcast to <4 x i32> and packed
; into a [4 x <4 x i32>] return value. The assertions expect two vld4.16
; loads: the even registers d0/d2/d4/d6 first (address marked with '!'), then
; the odd registers d1/d3/d5/d7.
392 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(bfloat* %ptr) {
393 ; CHECK-LABEL: test_vld4q_bf16:
394 ; CHECK: @ %bb.0: @ %entry
395 ; CHECK-NEXT: vld4.16 {d0, d2, d4, d6}, [r0]!
396 ; CHECK-NEXT: vld4.16 {d1, d3, d5, d7}, [r0]
399 %0 = bitcast bfloat* %ptr to i8*
400 %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)
401 %vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 0
402 %vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 1
403 %vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 2
404 %vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 3
405 %1 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
406 %2 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
407 %3 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
408 %4 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
409 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %1, 0
410 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
411 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
412 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %4, 3
413 ret [4 x <4 x i32>] %.fca.3.insert
; Lane-wise vld4 on <4 x bfloat>: the four <2 x i32> members of %src.coerce
; are bitcast to <4 x bfloat> and passed, with the i8* form of %ptr, to
; llvm.arm.neon.vld4lane; the i32 1 operand matches the [1] lane index in the
; expected instruction. Results are bitcast back to <2 x i32> and repacked.
; The expected output also lists the '@ kill:' register annotations that
; precede the load.
416 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %src.coerce) {
417 ; CHECK-LABEL: test_vld4_lane_bf16:
418 ; CHECK: @ %bb.0: @ %entry
419 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
420 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
421 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
422 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
423 ; CHECK-NEXT: vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
426 %src.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %src.coerce, 0
427 %src.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %src.coerce, 1
428 %src.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %src.coerce, 2
429 %src.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %src.coerce, 3
430 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
431 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
432 %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
433 %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
434 %4 = bitcast bfloat* %ptr to i8*
435 %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
436 %vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 0
437 %vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 1
438 %vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 2
439 %vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 3
440 %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
441 %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
442 %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
443 %8 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
444 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %5, 0
445 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %6, 1
446 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %7, 2
447 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %8, 3
448 ret [4 x <2 x i32>] %.fca.3.insert
; Lane-wise vld4 on <8 x bfloat>: the four <4 x i32> members of %src.coerce
; are bitcast to <8 x bfloat> and passed, with the i8* form of %ptr, to
; llvm.arm.neon.vld4lane with lane operand i32 7. The assertions expect that
; lane to appear as index [3] of the odd registers d1, d3, d5 and d7. Results
; are bitcast back to <4 x i32> and repacked.
451 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %src.coerce) {
452 ; CHECK-LABEL: test_vld4q_lane_bf16:
453 ; CHECK: @ %bb.0: @ %entry
454 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
455 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
456 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
457 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
458 ; CHECK-NEXT: vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
461 %src.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %src.coerce, 0
462 %src.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %src.coerce, 1
463 %src.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %src.coerce, 2
464 %src.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %src.coerce, 3
465 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
466 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
467 %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
468 %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
469 %4 = bitcast bfloat* %ptr to i8*
470 %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
471 %vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 0
472 %vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 1
473 %vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 2
474 %vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 3
475 %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
476 %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
477 %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
478 %8 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
479 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %5, 0
480 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %6, 1
481 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %7, 2
482 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %8, 3
483 ret [4 x <4 x i32>] %.fca.3.insert
; Duplicating vld2 of <4 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld2dup; both results are bitcast to <2 x i32> and packed into
; a [2 x <2 x i32>] return value. The assertions expect one all-lanes vld2.16
; on d0 and d1.
486 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(bfloat* %ptr) {
487 ; CHECK-LABEL: test_vld2_dup_bf16:
488 ; CHECK: @ %bb.0: @ %entry
489 ; CHECK-NEXT: vld2.16 {d0[], d1[]}, [r0]
492 %0 = bitcast bfloat* %ptr to i8*
493 %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)
494 %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
495 %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
496 %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
497 %2 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
498 %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %1, 0
499 %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
500 ret [2 x <2 x i32>] %.fca.1.insert
; Duplicating vld2 of <8 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld2dup; both results are bitcast to <4 x i32> and packed into
; a [2 x <4 x i32>] return value. The assertions expect two all-lanes vld2.16
; loads from the same address.
; NOTE(review): the first expected load targets d16/d18, whereas
; test_vld3q_dup_bf16 and test_vld4q_dup_bf16 expect the even halves in
; d0/d2/... . These assertions are autogenerated from llc output, so this
; records what the backend emits; confirm whether leaving d0/d2 unwritten here
; is intended codegen before relying on this expectation.
503 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(bfloat* %ptr) {
504 ; CHECK-LABEL: test_vld2q_dup_bf16:
505 ; CHECK: @ %bb.0: @ %entry
506 ; CHECK-NEXT: vld2.16 {d16[], d18[]}, [r0]
507 ; CHECK-NEXT: vld2.16 {d1[], d3[]}, [r0]
510 %0 = bitcast bfloat* %ptr to i8*
511 %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)
512 %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
513 %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
514 %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
515 %2 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
516 %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %1, 0
517 %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
518 ret [2 x <4 x i32>] %.fca.1.insert
; Duplicating vld3 of <4 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld3dup; the three results are bitcast to <2 x i32> and packed
; into a [3 x <2 x i32>] return value. The assertions expect one all-lanes
; vld3.16 on d0, d1 and d2.
521 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(bfloat* %ptr) {
522 ; CHECK-LABEL: test_vld3_dup_bf16:
523 ; CHECK: @ %bb.0: @ %entry
524 ; CHECK-NEXT: vld3.16 {d0[], d1[], d2[]}, [r0]
527 %0 = bitcast bfloat* %ptr to i8*
528 %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)
529 %vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 0
530 %vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 1
531 %vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 2
532 %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
533 %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
534 %3 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
535 %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %1, 0
536 %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
537 %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
538 ret [3 x <2 x i32>] %.fca.2.insert
; Duplicating vld3 of <8 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld3dup; the three results are bitcast to <4 x i32> and packed
; into a [3 x <4 x i32>] return value. The assertions expect two all-lanes
; vld3.16 loads from the same address: even registers d0/d2/d4, then odd
; registers d1/d3/d5.
541 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(bfloat* %ptr) {
542 ; CHECK-LABEL: test_vld3q_dup_bf16:
543 ; CHECK: @ %bb.0: @ %entry
544 ; CHECK-NEXT: vld3.16 {d0[], d2[], d4[]}, [r0]
545 ; CHECK-NEXT: vld3.16 {d1[], d3[], d5[]}, [r0]
548 %0 = bitcast bfloat* %ptr to i8*
549 %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)
550 %vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 0
551 %vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 1
552 %vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 2
553 %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
554 %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
555 %3 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
556 %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %1, 0
557 %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
558 %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
559 ret [3 x <4 x i32>] %.fca.2.insert
; Duplicating vld4 of <4 x bfloat>: %ptr is bitcast to i8* and passed to
; llvm.arm.neon.vld4dup; the four results are bitcast to <2 x i32> and packed
; into a [4 x <2 x i32>] return value. The assertions expect one all-lanes
; vld4.16 on d0 through d3.
562 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(bfloat* %ptr) {
563 ; CHECK-LABEL: test_vld4_dup_bf16:
564 ; CHECK: @ %bb.0: @ %entry
565 ; CHECK-NEXT: vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
568 %0 = bitcast bfloat* %ptr to i8*
569 %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)
570 %vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 0
571 %vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 1
572 %vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 2
573 %vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 3
574 %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
575 %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
576 %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
577 %4 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
578 %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %1, 0
579 %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
580 %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
581 %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %4, 3
582 ret [4 x <2 x i32>] %.fca.3.insert
; test_vld4q_dup_bf16: calls @llvm.arm.neon.vld4dup.v8bf16.p0i8 on %ptr (cast to
; i8*, trailing i32 operand 2) and returns the four <8 x bfloat> results
; bitcast to <4 x i32> and packed into a [4 x <4 x i32>] aggregate.
; The assertions expect two vld4.16 instructions using the "[]" register form,
; the first on d0/d2/d4/d6 and the second on d1/d3/d5/d7, both addressing [r0].
585 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(bfloat* %ptr) {
586 ; CHECK-LABEL: test_vld4q_dup_bf16:
587 ; CHECK: @ %bb.0: @ %entry
588 ; CHECK-NEXT: vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
589 ; CHECK-NEXT: vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
592 %0 = bitcast bfloat* %ptr to i8*
593 %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)
594 %vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 0
595 %vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 1
596 %vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 2
597 %vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 3
598 %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
599 %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
600 %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
601 %4 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
602 %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %1, 0
603 %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
604 %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
605 %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %4, 3
606 ret [4 x <4 x i32>] %.fca.3.insert
; test_vst1_bf16: stores the <4 x bfloat> argument %val through
; @llvm.arm.neon.vst1.p0i8.v4bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect a single vst1.16 {d0}, [r0].
609 define arm_aapcs_vfpcc void @test_vst1_bf16(bfloat* %ptr, <4 x bfloat> %val) {
610 ; CHECK-LABEL: test_vst1_bf16:
611 ; CHECK: @ %bb.0: @ %entry
612 ; CHECK-NEXT: vst1.16 {d0}, [r0]
615 %0 = bitcast bfloat* %ptr to i8*
616 tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)
; test_vst1q_bf16: stores the <8 x bfloat> argument %val through
; @llvm.arm.neon.vst1.p0i8.v8bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect a single vst1.16 {d0, d1}, [r0].
620 define arm_aapcs_vfpcc void @test_vst1q_bf16(bfloat* %ptr, <8 x bfloat> %val) {
621 ; CHECK-LABEL: test_vst1q_bf16:
622 ; CHECK: @ %bb.0: @ %entry
623 ; CHECK-NEXT: vst1.16 {d0, d1}, [r0]
626 %0 = bitcast bfloat* %ptr to i8*
627 tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)
; test_vst1_lane_bf16: extracts element 1 of the <4 x bfloat> argument and
; writes it with an ordinary scalar store (align 2); no NEON intrinsic is used.
; The assertions expect vmovx.f16 s0, s0 followed by vstr.16 s0, [r0].
631 define arm_aapcs_vfpcc void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) {
632 ; CHECK-LABEL: test_vst1_lane_bf16:
633 ; CHECK: @ %bb.0: @ %entry
634 ; CHECK-NEXT: vmovx.f16 s0, s0
635 ; CHECK-NEXT: vstr.16 s0, [r0]
638 %0 = extractelement <4 x bfloat> %val, i32 1
639 store bfloat %0, bfloat* %ptr, align 2
; test_vst1q_lane_bf16: extracts element 7 of the <8 x bfloat> argument and
; writes it with an ordinary scalar store (align 2); no NEON intrinsic is used.
; The assertions expect vmovx.f16 s0, s3 followed by vstr.16 s0, [r0].
643 define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) {
644 ; CHECK-LABEL: test_vst1q_lane_bf16:
645 ; CHECK: @ %bb.0: @ %entry
646 ; CHECK-NEXT: vmovx.f16 s0, s3
647 ; CHECK-NEXT: vstr.16 s0, [r0]
650 %0 = extractelement <8 x bfloat> %val, i32 7
651 store bfloat %0, bfloat* %ptr, align 2
; test_vst1_bf16_x2: splits the [2 x <2 x i32>] argument into its two members,
; bitcasts each to <4 x bfloat> and passes them with %ptr to
; @llvm.arm.neon.vst1x2.p0bf16.v4bf16.
; The assertions expect two "@ kill:" register comments (d1, d0) and then a
; single vst1.16 {d0, d1}, [r0:64].
655 define arm_aapcs_vfpcc void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <2 x i32>] %val.coerce) {
656 ; CHECK-LABEL: test_vst1_bf16_x2:
657 ; CHECK: @ %bb.0: @ %entry
658 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
659 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
660 ; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64]
663 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
664 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
665 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
666 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
667 tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)
; test_vst1q_bf16_x2: splits the [2 x <4 x i32>] argument into its two members,
; bitcasts each to <8 x bfloat> and passes them with %ptr to
; @llvm.arm.neon.vst1x2.p0bf16.v8bf16.
; The assertions expect two "@ kill:" register comments (q1, q0) and then a
; single vst1.16 {d0, d1, d2, d3}, [r0:256].
671 define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x i32>] %val.coerce) {
672 ; CHECK-LABEL: test_vst1q_bf16_x2:
673 ; CHECK: @ %bb.0: @ %entry
674 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
675 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
676 ; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]
679 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
680 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
681 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
682 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
683 tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)
; test_vst1_bf16_x3: splits the [3 x <2 x i32>] argument into its three members,
; bitcasts each to <4 x bfloat> and passes them with %ptr to
; @llvm.arm.neon.vst1x3.p0bf16.v4bf16.
; The assertions expect three "@ kill:" register comments (d2, d1, d0) and then
; a single vst1.16 {d0, d1, d2}, [r0:64].
687 define arm_aapcs_vfpcc void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <2 x i32>] %val.coerce) {
688 ; CHECK-LABEL: test_vst1_bf16_x3:
689 ; CHECK: @ %bb.0: @ %entry
690 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
691 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
692 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
693 ; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]
696 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
697 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
698 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
699 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
700 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
701 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
702 tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)
; test_vst1q_bf16_x3: splits the [3 x <4 x i32>] argument into its three
; members, bitcasts each to <8 x bfloat> and passes them with %ptr to
; @llvm.arm.neon.vst1x3.p0bf16.v8bf16.
; The assertions expect three "@ kill:" register comments (q2, q1, q0) and then
; two vst1.16 instructions: {d0, d1, d2} with the "!" address form on [r0:64],
; followed by {d3, d4, d5} on [r0:64].
706 define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x i32>] %val.coerce) {
707 ; CHECK-LABEL: test_vst1q_bf16_x3:
708 ; CHECK: @ %bb.0: @ %entry
709 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
710 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
711 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
712 ; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]!
713 ; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64]
716 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
717 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
718 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
719 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
720 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
721 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
722 tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)
; test_vst1_bf16_x4: splits the [4 x <2 x i32>] argument into its four members,
; bitcasts each to <4 x bfloat> and passes them with %ptr to
; @llvm.arm.neon.vst1x4.p0bf16.v4bf16.
; The assertions expect four "@ kill:" register comments (d3..d0) and then a
; single vst1.16 {d0, d1, d2, d3}, [r0:256].
726 define arm_aapcs_vfpcc void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <2 x i32>] %val.coerce) {
727 ; CHECK-LABEL: test_vst1_bf16_x4:
728 ; CHECK: @ %bb.0: @ %entry
729 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
730 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
731 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
732 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
733 ; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]
736 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
737 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
738 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
739 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
740 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
741 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
742 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
743 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
744 tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)
; test_vst1q_bf16_x4: splits the [4 x <4 x i32>] argument into its four members,
; bitcasts each to <8 x bfloat> and passes them with %ptr to
; @llvm.arm.neon.vst1x4.p0bf16.v8bf16.
; The assertions expect four "@ kill:" register comments (q3..q0) and then two
; vst1.16 instructions: {d0, d1, d2, d3} with the "!" address form on [r0:256],
; followed by {d4, d5, d6, d7} on [r0:256].
748 define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x i32>] %val.coerce) {
749 ; CHECK-LABEL: test_vst1q_bf16_x4:
750 ; CHECK: @ %bb.0: @ %entry
751 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
752 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
753 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
754 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
755 ; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]!
756 ; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256]
759 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
760 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
761 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
762 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
763 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
764 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
765 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
766 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
767 tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)
; test_vst2_bf16: splits the [2 x <2 x i32>] argument into its two members,
; bitcasts each to <4 x bfloat> and stores them through
; @llvm.arm.neon.vst2.p0i8.v4bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect two "@ kill:" register comments (d1, d0) and then a
; single vst2.16 {d0, d1}, [r0].
771 define arm_aapcs_vfpcc void @test_vst2_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) {
772 ; CHECK-LABEL: test_vst2_bf16:
773 ; CHECK: @ %bb.0: @ %entry
774 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
775 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
776 ; CHECK-NEXT: vst2.16 {d0, d1}, [r0]
779 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
780 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
781 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
782 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
783 %2 = bitcast bfloat* %ptr to i8*
784 tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)
; test_vst2q_bf16: splits the [2 x <4 x i32>] argument into its two members,
; bitcasts each to <8 x bfloat> and stores them through
; @llvm.arm.neon.vst2.p0i8.v8bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect two "@ kill:" register comments (q1, q0) and then a
; single vst2.16 {d0, d1, d2, d3}, [r0].
788 define arm_aapcs_vfpcc void @test_vst2q_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) {
789 ; CHECK-LABEL: test_vst2q_bf16:
790 ; CHECK: @ %bb.0: @ %entry
791 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
792 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
793 ; CHECK-NEXT: vst2.16 {d0, d1, d2, d3}, [r0]
796 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
797 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
798 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
799 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
800 %2 = bitcast bfloat* %ptr to i8*
801 tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)
; test_vst2_lane_bf16: splits the [2 x <2 x i32>] argument into its two members,
; bitcasts each to <4 x bfloat> and calls @llvm.arm.neon.vst2lane.p0i8.v4bf16
; with the two trailing i32 operands 1 and 2 (pointer cast to i8*).
; The assertions expect two "@ kill:" register comments (d1, d0) and then
; vst2.16 {d0[1], d1[1]}, [r0], i.e. lane index 1 of each register.
805 define arm_aapcs_vfpcc void @test_vst2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) {
806 ; CHECK-LABEL: test_vst2_lane_bf16:
807 ; CHECK: @ %bb.0: @ %entry
808 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0
809 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0
810 ; CHECK-NEXT: vst2.16 {d0[1], d1[1]}, [r0]
813 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0
814 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1
815 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
816 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
817 %2 = bitcast bfloat* %ptr to i8*
818 tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
; test_vst2q_lane_bf16: splits the [2 x <4 x i32>] argument into its two
; members, bitcasts each to <8 x bfloat> and calls
; @llvm.arm.neon.vst2lane.p0i8.v8bf16 with the two trailing i32 operands 7 and 2
; (pointer cast to i8*).
; The assertions expect two "@ kill:" register comments (q1, q0) and then
; vst2.16 {d1[3], d3[3]}, [r0].
822 define arm_aapcs_vfpcc void @test_vst2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) {
823 ; CHECK-LABEL: test_vst2q_lane_bf16:
824 ; CHECK: @ %bb.0: @ %entry
825 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
826 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
827 ; CHECK-NEXT: vst2.16 {d1[3], d3[3]}, [r0]
830 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0
831 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1
832 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
833 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
834 %2 = bitcast bfloat* %ptr to i8*
835 tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
; test_vst3_bf16: splits the [3 x <2 x i32>] argument into its three members,
; bitcasts each to <4 x bfloat> and stores them through
; @llvm.arm.neon.vst3.p0i8.v4bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect three "@ kill:" register comments (d2, d1, d0) and then
; a single vst3.16 {d0, d1, d2}, [r0].
839 define arm_aapcs_vfpcc void @test_vst3_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) {
840 ; CHECK-LABEL: test_vst3_bf16:
841 ; CHECK: @ %bb.0: @ %entry
842 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
843 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
844 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
845 ; CHECK-NEXT: vst3.16 {d0, d1, d2}, [r0]
848 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
849 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
850 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
851 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
852 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
853 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
854 %3 = bitcast bfloat* %ptr to i8*
855 tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)
; test_vst3q_bf16: splits the [3 x <4 x i32>] argument into its three members,
; bitcasts each to <8 x bfloat> and stores them through
; @llvm.arm.neon.vst3.p0i8.v8bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect three "@ kill:" register comments (q2, q1, q0) and then
; two vst3.16 instructions: {d0, d2, d4} with the "!" address form on [r0],
; followed by {d1, d3, d5} on [r0].
859 define arm_aapcs_vfpcc void @test_vst3q_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) {
860 ; CHECK-LABEL: test_vst3q_bf16:
861 ; CHECK: @ %bb.0: @ %entry
862 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
863 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
864 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
865 ; CHECK-NEXT: vst3.16 {d0, d2, d4}, [r0]!
866 ; CHECK-NEXT: vst3.16 {d1, d3, d5}, [r0]
869 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
870 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
871 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
872 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
873 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
874 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
875 %3 = bitcast bfloat* %ptr to i8*
876 tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)
; test_vst3_lane_bf16: splits the [3 x <2 x i32>] argument into its three
; members, bitcasts each to <4 x bfloat> and calls
; @llvm.arm.neon.vst3lane.p0i8.v4bf16 with the two trailing i32 operands 1 and 2
; (pointer cast to i8*).
; The assertions expect three "@ kill:" register comments (d2, d1, d0) and then
; vst3.16 {d0[1], d1[1], d2[1]}, [r0].
880 define arm_aapcs_vfpcc void @test_vst3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) {
881 ; CHECK-LABEL: test_vst3_lane_bf16:
882 ; CHECK: @ %bb.0: @ %entry
883 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
884 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
885 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
886 ; CHECK-NEXT: vst3.16 {d0[1], d1[1], d2[1]}, [r0]
889 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0
890 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1
891 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2
892 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
893 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
894 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
895 %3 = bitcast bfloat* %ptr to i8*
896 tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
; test_vst3q_lane_bf16: splits the [3 x <4 x i32>] argument into its three
; members, bitcasts each to <8 x bfloat> and calls
; @llvm.arm.neon.vst3lane.p0i8.v8bf16 with the two trailing i32 operands 7 and 2
; (pointer cast to i8*).
; The assertions expect three "@ kill:" register comments (q2, q1, q0) and then
; vst3.16 {d1[3], d3[3], d5[3]}, [r0].
900 define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) {
901 ; CHECK-LABEL: test_vst3q_lane_bf16:
902 ; CHECK: @ %bb.0: @ %entry
903 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
904 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
905 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
906 ; CHECK-NEXT: vst3.16 {d1[3], d3[3], d5[3]}, [r0]
909 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0
910 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1
911 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2
912 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
913 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
914 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
915 %3 = bitcast bfloat* %ptr to i8*
916 tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
; test_vst4_bf16: splits the [4 x <2 x i32>] argument into its four members,
; bitcasts each to <4 x bfloat> and stores them through
; @llvm.arm.neon.vst4.p0i8.v4bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect four "@ kill:" register comments (d3..d0) and then a
; single vst4.16 {d0, d1, d2, d3}, [r0].
920 define arm_aapcs_vfpcc void @test_vst4_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) {
921 ; CHECK-LABEL: test_vst4_bf16:
922 ; CHECK: @ %bb.0: @ %entry
923 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
924 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
925 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
926 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
927 ; CHECK-NEXT: vst4.16 {d0, d1, d2, d3}, [r0]
930 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
931 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
932 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
933 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
934 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
935 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
936 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
937 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
938 %4 = bitcast bfloat* %ptr to i8*
939 tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)
; test_vst4q_bf16: splits the [4 x <4 x i32>] argument into its four members,
; bitcasts each to <8 x bfloat> and stores them through
; @llvm.arm.neon.vst4.p0i8.v8bf16 (pointer cast to i8*, trailing i32 operand 2).
; The assertions expect four "@ kill:" register comments (q3..q0) and then two
; vst4.16 instructions: {d0, d2, d4, d6} with the "!" address form on [r0],
; followed by {d1, d3, d5, d7} on [r0].
943 define arm_aapcs_vfpcc void @test_vst4q_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) {
944 ; CHECK-LABEL: test_vst4q_bf16:
945 ; CHECK: @ %bb.0: @ %entry
946 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
947 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
948 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
949 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
950 ; CHECK-NEXT: vst4.16 {d0, d2, d4, d6}, [r0]!
951 ; CHECK-NEXT: vst4.16 {d1, d3, d5, d7}, [r0]
954 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
955 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
956 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
957 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
958 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
959 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
960 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
961 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
962 %4 = bitcast bfloat* %ptr to i8*
963 tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)
; test_vst4_lane_bf16: splits the [4 x <2 x i32>] argument into its four
; members, bitcasts each to <4 x bfloat> and calls
; @llvm.arm.neon.vst4lane.p0i8.v4bf16 with the two trailing i32 operands 1 and 2
; (pointer cast to i8*).
; The assertions expect four "@ kill:" register comments (d3..d0) and then
; vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0].
967 define arm_aapcs_vfpcc void @test_vst4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) {
968 ; CHECK-LABEL: test_vst4_lane_bf16:
969 ; CHECK: @ %bb.0: @ %entry
970 ; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
971 ; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
972 ; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
973 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
974 ; CHECK-NEXT: vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
977 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0
978 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1
979 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2
980 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3
981 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
982 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
983 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
984 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
985 %4 = bitcast bfloat* %ptr to i8*
986 tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
; test_vst4q_lane_bf16: splits the [4 x <4 x i32>] argument into its four
; members, bitcasts each to <8 x bfloat> and calls
; @llvm.arm.neon.vst4lane.p0i8.v8bf16 with the two trailing i32 operands 7 and 2
; (pointer cast to i8*).
; The assertions expect four "@ kill:" register comments (q3..q0) and then
; vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0].
990 define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) {
991 ; CHECK-LABEL: test_vst4q_lane_bf16:
992 ; CHECK: @ %bb.0: @ %entry
993 ; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
994 ; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
995 ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
996 ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
997 ; CHECK-NEXT: vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
1000 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0
1001 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1
1002 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2
1003 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3
1004 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
1005 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
1006 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
1007 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
1008 %4 = bitcast bfloat* %ptr to i8*
1009 tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
; Declarations of the ARM NEON intrinsics, each in a <4 x bfloat> and an
; <8 x bfloat> variant.

; vld2/vld3/vld4: take an i8* and an i32, return a struct of 2/3/4 vectors.
1013 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8*, i32)
1014 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8*, i32)
1015 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8*, i32)
1016 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8*, i32)
1017 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8*, i32)
1018 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8*, i32)
; vld2dup/vld3dup/vld4dup: same signatures as the vld2/vld3/vld4 group above.
1020 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8*, i32)
1021 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8*, i32)
1022 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8*, i32)
1023 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8*, i32)
1024 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8*, i32)
1025 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8*, i32)
; vld1x2/vld1x3/vld1x4: take only a bfloat* (no trailing i32 operand).
1027 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat*)
1028 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat*)
1029 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat*)
1030 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat*)
1031 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat*)
1032 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat*)
; vld2lane/vld3lane/vld4lane: take an i8*, the 2/3/4 input vectors, and two
; trailing i32 operands.
1034 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
1035 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
1036 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1037 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1038 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1039 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
; vst1/vst2/vst3/vst4: take an i8*, the 1/2/3/4 vectors to store, and one
; trailing i32 operand.
1041 declare void @llvm.arm.neon.vst1.p0i8.v4bf16(i8*, <4 x bfloat>, i32)
1042 declare void @llvm.arm.neon.vst1.p0i8.v8bf16(i8*, <8 x bfloat>, i32)
1043 declare void @llvm.arm.neon.vst2.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32)
1044 declare void @llvm.arm.neon.vst2.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32)
1045 declare void @llvm.arm.neon.vst3.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1046 declare void @llvm.arm.neon.vst3.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1047 declare void @llvm.arm.neon.vst4.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1048 declare void @llvm.arm.neon.vst4.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
; vst1x2/vst1x3/vst1x4: take a nocapture bfloat* and the 2/3/4 vectors to store
; (no trailing i32 operand).
1050 declare void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>)
1051 declare void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>)
1052 declare void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1053 declare void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
1054 declare void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
1055 declare void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
; vst2lane/vst3lane/vst4lane: take an i8*, the 2/3/4 vectors, and two trailing
; i32 operands.
1057 declare void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
1058 declare void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
1059 declare void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1060 declare void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
1061 declare void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
1062 declare void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)