1 ; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
; Simple load of v4f16: loads a whole <4 x half> vector from %a (8-byte aligned).
4 define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
5 ; CHECK-LABEL: load_64:
8 %0 = load <4 x half>, <4 x half>* %a, align 8
12 ; Simple load of v8f16
13 define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
14 ; CHECK-LABEL: load_128:
17 %0 = load <8 x half>, <8 x half>* %a, align 16
21 ; Duplicating load to v4f16
22 define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
23 ; CHECK-LABEL: load_dup_64:
24 ; CHECK: ld1r { v0.4h }, [x0]
26 %0 = load half, half* %a, align 2
27 %1 = insertelement <4 x half> undef, half %0, i32 0
28 %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
32 ; Duplicating load to v8f16
33 define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
34 ; CHECK-LABEL: load_dup_128:
35 ; CHECK: ld1r { v0.8h }, [x0]
37 %0 = load half, half* %a, align 2
38 %1 = insertelement <8 x half> undef, half %0, i32 0
39 %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
43 ; Load to one lane of v4f16
44 define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
45 ; CHECK-LABEL: load_lane_64:
46 ; CHECK: ld1 { v0.h }[2], [x0]
48 %0 = load half, half* %a, align 2
49 %1 = insertelement <4 x half> %b, half %0, i32 2
53 ; Load to one lane of v8f16
54 define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
55 ; CHECK-LABEL: load_lane_128:
56 ; CHECK: ld1 { v0.h }[5], [x0]
58 %0 = load half, half* %a, align 2
59 %1 = insertelement <8 x half> %b, half %0, i32 5
63 ; Simple store of v4f16
64 define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
65 ; CHECK-LABEL: store_64:
68 store <4 x half> %b, <4 x half>* %a, align 8
72 ; Simple store of v8f16
73 define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
74 ; CHECK-LABEL: store_128:
77 store <8 x half> %b, <8 x half>* %a, align 16
81 ; Store from one lane of v4f16
82 define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
83 ; CHECK-LABEL: store_lane_64:
84 ; CHECK: st1 { v0.h }[2], [x0]
86 %0 = extractelement <4 x half> %b, i32 2
87 store half %0, half* %a, align 2
; Store from lane 0 of v4f16: extracts element 0 of %b and stores it
; directly at %a (no address offset).
91 define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
92 ; CHECK-LABEL: store_lane0_64:
95 %0 = extractelement <4 x half> %b, i32 0
96 store half %0, half* %a, align 2
; Store from lane 0 of v4f16 at a negative offset: the destination is one
; half element before %a (i.e. -2 bytes), and the expected instruction is the
; unscaled-offset form `stur` with an immediate of #-2.
100 define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
101 ; CHECK-LABEL: storeu_lane0_64:
102 ; CHECK: stur h0, [x{{[0-9]+}}, #-2]
104 %0 = getelementptr half, half* %a, i64 -1
105 %1 = extractelement <4 x half> %b, i32 0
106 store half %1, half* %0, align 2
; Store from lane 2 of v4f16 at a register offset: the destination is
; %a + %c half elements. The expected instruction is a single-lane st1 whose
; address is one base register (the pattern accepts any x register).
110 define void @storero_lane_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
111 ; CHECK-LABEL: storero_lane_64:
112 ; CHECK: st1 { v0.h }[2], [x{{[0-9]+}}]
114 %0 = getelementptr half, half* %a, i64 %c
115 %1 = extractelement <4 x half> %b, i32 2
116 store half %1, half* %0, align 2
; Store from lane 0 of v4f16 at a register offset: the destination is
; %a + %c half elements. The expected instruction is a scalar `str h0` using
; register-offset addressing with the index shifted left by 1 (2-byte elements).
120 define void @storero_lane0_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
121 ; CHECK-LABEL: storero_lane0_64:
122 ; CHECK: str h0, [x0, x1, lsl #1]
124 %0 = getelementptr half, half* %a, i64 %c
125 %1 = extractelement <4 x half> %b, i32 0
126 store half %1, half* %0, align 2
130 ; Store from one lane of v8f16
131 define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
132 ; CHECK-LABEL: store_lane_128:
133 ; CHECK: st1 { v0.h }[5], [x0]
135 %0 = extractelement <8 x half> %b, i32 5
136 store half %0, half* %a, align 2
; Store from lane 0 of v8f16: extracts element 0 of %b and stores it directly
; at %a. The expected instruction is a plain scalar `str h0` with no offset.
140 define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
141 ; CHECK-LABEL: store_lane0_128:
142 ; CHECK: str h0, [x0]
144 %0 = extractelement <8 x half> %b, i32 0
145 store half %0, half* %a, align 2
; Store from lane 0 of v8f16 at a negative offset: the destination is one
; half element before %a (i.e. -2 bytes), and the expected instruction is the
; unscaled-offset form `stur` with an immediate of #-2.
149 define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
150 ; CHECK-LABEL: storeu_lane0_128:
151 ; CHECK: stur h0, [x{{[0-9]+}}, #-2]
153 %0 = getelementptr half, half* %a, i64 -1
154 %1 = extractelement <8 x half> %b, i32 0
155 store half %1, half* %0, align 2
; Store from lane 4 of v8f16 at a register offset: the destination is
; %a + %c half elements. The expected instruction is a single-lane st1 whose
; address is one base register (the pattern accepts any x register).
159 define void @storero_lane_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
160 ; CHECK-LABEL: storero_lane_128:
161 ; CHECK: st1 { v0.h }[4], [x{{[0-9]+}}]
163 %0 = getelementptr half, half* %a, i64 %c
164 %1 = extractelement <8 x half> %b, i32 4
165 store half %1, half* %0, align 2
; Store from lane 0 of v8f16 at a register offset: the destination is
; %a + %c half elements. The expected instruction is a scalar `str h0` using
; register-offset addressing with the index shifted left by 1 (2-byte elements).
169 define void @storero_lane0_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
170 ; CHECK-LABEL: storero_lane0_128:
171 ; CHECK: str h0, [x0, x1, lsl #1]
173 %0 = getelementptr half, half* %a, i64 %c
174 %1 = extractelement <8 x half> %b, i32 0
175 store half %1, half* %0, align 2
179 ; NEON intrinsics - (de-)interleaving loads and stores
180 declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
181 declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
182 declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
183 declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
184 declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
185 declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
186 declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
187 declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
188 declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
189 declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
190 declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
191 declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
193 ; Load 2 x v4f16 with de-interleaving
194 define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
195 ; CHECK-LABEL: load_interleave_64_2:
196 ; CHECK: ld2 { v0.4h, v1.4h }, [x0]
198 %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
199 ret { <4 x half>, <4 x half> } %0
202 ; Load 3 x v4f16 with de-interleaving
203 define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
204 ; CHECK-LABEL: load_interleave_64_3:
205 ; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
207 %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
208 ret { <4 x half>, <4 x half>, <4 x half> } %0
211 ; Load 4 x v4f16 with de-interleaving
212 define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
213 ; CHECK-LABEL: load_interleave_64_4:
214 ; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
216 %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
217 ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
220 ; Store 2 x v4f16 with interleaving
221 define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
222 ; CHECK-LABEL: store_interleave_64_2:
223 ; CHECK: st2 { v0.4h, v1.4h }, [x0]
225 tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
229 ; Store 3 x v4f16 with interleaving
230 define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
231 ; CHECK-LABEL: store_interleave_64_3:
232 ; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
234 tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
238 ; Store 4 x v4f16 with interleaving
239 define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
240 ; CHECK-LABEL: store_interleave_64_4:
241 ; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
243 tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
247 ; Load 2 x v8f16 with de-interleaving
248 define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
249 ; CHECK-LABEL: load_interleave_128_2:
250 ; CHECK: ld2 { v0.8h, v1.8h }, [x0]
252 %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
253 ret { <8 x half>, <8 x half> } %0
256 ; Load 3 x v8f16 with de-interleaving
257 define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
258 ; CHECK-LABEL: load_interleave_128_3:
259 ; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
261 %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
262 ret { <8 x half>, <8 x half>, <8 x half> } %0
265 ; Load 4 x v8f16 with de-interleaving
266 define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
267 ; CHECK-LABEL: load_interleave_128_4:
268 ; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
270 %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
271 ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
274 ; Store 2 x v8f16 with interleaving
275 define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
276 ; CHECK-LABEL: store_interleave_128_2:
277 ; CHECK: st2 { v0.8h, v1.8h }, [x0]
279 tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
283 ; Store 3 x v8f16 with interleaving
284 define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
285 ; CHECK-LABEL: store_interleave_128_3:
286 ; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
288 tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
292 ; Store 4 x v8f16 with interleaving
293 define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
294 ; CHECK-LABEL: store_interleave_128_4:
295 ; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
297 tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
301 ; NEON intrinsics - duplicating loads
302 declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
303 declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
304 declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
305 declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
306 declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
307 declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
309 ; Load 2 x v4f16 with duplication
310 define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
311 ; CHECK-LABEL: load_dup_64_2:
312 ; CHECK: ld2r { v0.4h, v1.4h }, [x0]
314 %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
315 ret { <4 x half>, <4 x half> } %0
318 ; Load 3 x v4f16 with duplication
319 define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
320 ; CHECK-LABEL: load_dup_64_3:
321 ; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
323 %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
324 ret { <4 x half>, <4 x half>, <4 x half> } %0
327 ; Load 4 x v4f16 with duplication
328 define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
329 ; CHECK-LABEL: load_dup_64_4:
330 ; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
332 %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
333 ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
336 ; Load 2 x v8f16 with duplication
337 define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
338 ; CHECK-LABEL: load_dup_128_2:
339 ; CHECK: ld2r { v0.8h, v1.8h }, [x0]
341 %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
342 ret { <8 x half>, <8 x half> } %0
345 ; Load 3 x v8f16 with duplication
346 define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
347 ; CHECK-LABEL: load_dup_128_3:
348 ; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
350 %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
351 ret { <8 x half>, <8 x half>, <8 x half> } %0
354 ; Load 4 x v8f16 with duplication
355 define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
356 ; CHECK-LABEL: load_dup_128_4:
357 ; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
359 %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
360 ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
364 ; NEON intrinsics - loads and stores to/from one lane
365 declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
366 declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
367 declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
368 declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
369 declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
370 declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
371 declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
372 declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
373 declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
374 declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
375 declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
376 declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
378 ; Load one lane of 2 x v4f16
379 define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
380 ; CHECK-LABEL: load_lane_64_2:
381 ; CHECK: ld2 { v0.h, v1.h }[2], [x0]
383 %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
384 ret { <4 x half>, <4 x half> } %0
387 ; Load one lane of 3 x v4f16
388 define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
389 ; CHECK-LABEL: load_lane_64_3:
390 ; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
392 %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
393 ret { <4 x half>, <4 x half>, <4 x half> } %0
396 ; Load one lane of 4 x v4f16
397 define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
398 ; CHECK-LABEL: load_lane_64_4:
399 ; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
401 %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
402 ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
405 ; Store one lane of 2 x v4f16
406 define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
407 ; CHECK-LABEL: store_lane_64_2:
408 ; CHECK: st2 { v0.h, v1.h }[2], [x0]
410 tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
414 ; Store one lane of 3 x v4f16
415 define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
416 ; CHECK-LABEL: store_lane_64_3:
417 ; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
419 tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
423 ; Store one lane of 4 x v4f16
424 define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
425 ; CHECK-LABEL: store_lane_64_4:
426 ; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
428 tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
432 ; Load one lane of 2 x v8f16
433 define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
434 ; CHECK-LABEL: load_lane_128_2:
435 ; CHECK: ld2 { v0.h, v1.h }[2], [x0]
437 %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
438 ret { <8 x half>, <8 x half> } %0
441 ; Load one lane of 3 x v8f16
442 define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
443 ; CHECK-LABEL: load_lane_128_3:
444 ; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
446 %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
447 ret { <8 x half>, <8 x half>, <8 x half> } %0
450 ; Load one lane of 4 x v8f16
451 define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
452 ; CHECK-LABEL: load_lane_128_4:
453 ; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
455 %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
456 ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
459 ; Store one lane of 2 x v8f16
460 define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
461 ; CHECK-LABEL: store_lane_128_2:
462 ; CHECK: st2 { v0.h, v1.h }[2], [x0]
464 tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
468 ; Store one lane of 3 x v8f16
469 define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
470 ; CHECK-LABEL: store_lane_128_3:
471 ; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
473 tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
477 ; Store one lane of 4 x v8f16
478 define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
479 ; CHECK-LABEL: store_lane_128_4:
480 ; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
482 tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
486 ; NEON intrinsics - load/store without interleaving
487 declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
488 declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
489 declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
490 declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
491 declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
492 declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
493 declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
494 declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
495 declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
496 declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
497 declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
498 declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
500 ; Load 2 x v4f16 without de-interleaving
501 define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
502 ; CHECK-LABEL: load_64_2:
503 ; CHECK: ld1 { v0.4h, v1.4h }, [x0]
505 %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
506 ret { <4 x half>, <4 x half> } %0
509 ; Load 3 x v4f16 without de-interleaving
510 define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
511 ; CHECK-LABEL: load_64_3:
512 ; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
514 %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
515 ret { <4 x half>, <4 x half>, <4 x half> } %0
518 ; Load 4 x v4f16 without de-interleaving
519 define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
520 ; CHECK-LABEL: load_64_4:
521 ; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
523 %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
524 ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
527 ; Store 2 x v4f16 without interleaving
528 define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
529 ; CHECK-LABEL: store_64_2:
530 ; CHECK: st1 { v0.4h, v1.4h }, [x0]
532 tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
536 ; Store 3 x v4f16 without interleaving
537 define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
538 ; CHECK-LABEL: store_64_3:
539 ; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
541 tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
545 ; Store 4 x v4f16 without interleaving
546 define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
547 ; CHECK-LABEL: store_64_4:
548 ; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
550 tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
554 ; Load 2 x v8f16 without de-interleaving
555 define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
556 ; CHECK-LABEL: load_128_2:
557 ; CHECK: ld1 { v0.8h, v1.8h }, [x0]
559 %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
560 ret { <8 x half>, <8 x half> } %0
563 ; Load 3 x v8f16 without de-interleaving
564 define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
565 ; CHECK-LABEL: load_128_3:
566 ; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
568 %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
569 ret { <8 x half>, <8 x half>, <8 x half> } %0
572 ; Load 4 x v8f16 without de-interleaving
573 define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
574 ; CHECK-LABEL: load_128_4:
575 ; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
577 %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
578 ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
581 ; Store 2 x v8f16 without interleaving
582 define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
583 ; CHECK-LABEL: store_128_2:
584 ; CHECK: st1 { v0.8h, v1.8h }, [x0]
586 tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
590 ; Store 3 x v8f16 without interleaving
591 define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
592 ; CHECK-LABEL: store_128_3:
593 ; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
595 tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
599 ; Store 4 x v8f16 without interleaving
600 define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
601 ; CHECK-LABEL: store_128_4:
602 ; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
604 tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)