1 ; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NEON
2 ; RUN: opt < %s -mattr=-neon -interleaved-access -S | FileCheck %s -check-prefix=NO_NEON
4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5 target triple = "aarch64--linux-gnu"
7 define void @load_factor2(<16 x i8>* %ptr) {
8 ; NEON-LABEL: @load_factor2(
9 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to <8 x i8>*
10 ; NEON-NEXT: [[LDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
11 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 1
12 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 0
14 ; NO_NEON-LABEL: @load_factor2(
15 ; NO_NEON-NOT: @llvm.aarch64.neon
18 %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
19 %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20 %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
24 define void @load_factor3(<12 x i32>* %ptr) {
25 ; NEON-LABEL: @load_factor3(
26 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
27 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
28 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
29 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
30 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
32 ; NO_NEON-LABEL: @load_factor3(
33 ; NO_NEON-NOT: @llvm.aarch64.neon
36 %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
37 %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
38 %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
39 %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
43 define void @load_factor4(<16 x i32>* %ptr) {
44 ; NEON-LABEL: @load_factor4(
45 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
46 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
47 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
48 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
49 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
50 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
52 ; NO_NEON-LABEL: @load_factor4(
53 ; NO_NEON-NOT: @llvm.aarch64.neon
56 %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
57 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
58 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
59 %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
60 %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
64 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
65 ; NEON-LABEL: @store_factor2(
66 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
67 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
68 ; NEON-NEXT: [[TMP3:%.*]] = bitcast <16 x i8>* %ptr to <8 x i8>*
69 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8>* [[TMP3]])
71 ; NO_NEON-LABEL: @store_factor2(
72 ; NO_NEON-NOT: @llvm.aarch64.neon
75 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
76 store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
80 define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
81 ; NEON-LABEL: @store_factor3(
82 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
83 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
84 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
85 ; NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
86 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
88 ; NO_NEON-LABEL: @store_factor3(
89 ; NO_NEON-NOT: @llvm.aarch64.neon
92 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
93 %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
94 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
95 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
99 define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
100 ; NEON-LABEL: @store_factor4(
101 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
102 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
103 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
104 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
105 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
106 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
107 ; NEON-NEXT: ret void
108 ; NO_NEON-LABEL: @store_factor4(
109 ; NO_NEON-NOT: @llvm.aarch64.neon
112 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
113 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
114 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
115 store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
119 define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
120 ; NEON-LABEL: @load_ptrvec_factor2(
121 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to <2 x i64>*
122 ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
123 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
124 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
125 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
126 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
127 ; NEON-NEXT: ret void
128 ; NO_NEON-LABEL: @load_ptrvec_factor2(
129 ; NO_NEON-NOT: @llvm.aarch64.neon
132 %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
133 %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
134 %v1 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 1, i32 3>
138 define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
139 ; NEON-LABEL: @load_ptrvec_factor3(
140 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to <2 x i64>*
141 ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
142 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
143 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
144 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
145 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
146 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
147 ; NEON-NEXT: [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x i32*>
148 ; NEON-NEXT: ret void
149 ; NO_NEON-LABEL: @load_ptrvec_factor3(
150 ; NO_NEON-NOT: @llvm.aarch64.neon
153 %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
154 %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
155 %v1 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
156 %v2 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
160 define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
161 ; NEON-LABEL: @load_ptrvec_factor4(
162 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to <2 x i64>*
163 ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP1]])
164 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 3
165 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x i32*>
166 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
167 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x i32*>
168 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
169 ; NEON-NEXT: [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x i32*>
170 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
171 ; NEON-NEXT: [[TMP9:%.*]] = inttoptr <2 x i64> [[TMP8]] to <2 x i32*>
172 ; NEON-NEXT: ret void
173 ; NO_NEON-LABEL: @load_ptrvec_factor4(
174 ; NO_NEON-NOT: @llvm.aarch64.neon
177 %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
178 %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
179 %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
180 %v2 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
181 %v3 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
185 define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
186 ; NEON-LABEL: @store_ptrvec_factor2(
187 ; NEON-NEXT: [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i64>
188 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i64>
189 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
190 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
191 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <4 x i32*>* %ptr to <2 x i64>*
192 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]])
193 ; NEON-NEXT: ret void
194 ; NO_NEON-LABEL: @store_ptrvec_factor2(
195 ; NO_NEON-NOT: @llvm.aarch64.neon
198 %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
199 store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
203 define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
204 ; NEON-LABEL: @store_ptrvec_factor3(
205 ; NEON: [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i64>
206 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i64>
207 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
208 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
209 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
210 ; NEON-NEXT: [[TMP6:%.*]] = bitcast <6 x i32*>* %ptr to <2 x i64>*
211 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]])
212 ; NEON-NEXT: ret void
213 ; NO_NEON-LABEL: @store_ptrvec_factor3(
214 ; NO_NEON-NOT: @llvm.aarch64.neon
217 %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
218 %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
219 %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
220 store <6 x i32*> %interleaved.vec, <6 x i32*>* %ptr, align 4
224 define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
225 ; NEON-LABEL: @store_ptrvec_factor4(
226 ; NEON: [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i64>
227 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i64>
228 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
229 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
230 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
231 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 6, i32 7>
232 ; NEON-NEXT: [[TMP7:%.*]] = bitcast <8 x i32*>* %ptr to <2 x i64>*
233 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]])
234 ; NEON-NEXT: ret void
235 ; NO_NEON-LABEL: @store_ptrvec_factor4(
236 ; NO_NEON-NOT: @llvm.aarch64.neon
239 %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
240 %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
241 %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
242 store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4
246 define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
247 ; NEON-LABEL: @load_undef_mask_factor2(
248 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
249 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
250 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
251 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
252 ; NEON-NEXT: ret void
253 ; NO_NEON-LABEL: @load_undef_mask_factor2(
254 ; NO_NEON-NOT: @llvm.aarch64.neon
257 %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
258 %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
259 %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
263 define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
264 ; NEON-LABEL: @load_undef_mask_factor3(
265 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
266 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
267 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
268 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
269 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
270 ; NEON-NEXT: ret void
271 ; NO_NEON-LABEL: @load_undef_mask_factor3(
272 ; NO_NEON-NOT: @llvm.aarch64.neon
275 %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
276 %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
277 %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
278 %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
282 define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
283 ; NEON-LABEL: @load_undef_mask_factor4(
284 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
285 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP1]])
286 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
287 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
288 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
289 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
290 ; NEON-NEXT: ret void
291 ; NO_NEON-LABEL: @load_undef_mask_factor4(
292 ; NO_NEON-NOT: @llvm.aarch64.neon
295 %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
296 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
297 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
298 %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
299 %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
303 define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
304 ; NEON-LABEL: @store_undef_mask_factor2(
305 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
306 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
307 ; NEON-NEXT: [[TMP3:%.*]] = bitcast <8 x i32>* %ptr to <4 x i32>*
308 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]])
309 ; NEON-NEXT: ret void
310 ; NO_NEON-LABEL: @store_undef_mask_factor2(
311 ; NO_NEON-NOT: @llvm.aarch64.neon
314 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
315 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
319 define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
320 ; NEON-LABEL: @store_undef_mask_factor3(
321 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
322 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
323 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
324 ; NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
325 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
326 ; NEON-NEXT: ret void
327 ; NO_NEON-LABEL: @store_undef_mask_factor3(
328 ; NO_NEON-NOT: @llvm.aarch64.neon
331 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
332 %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
333 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
334 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
338 define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
339 ; NEON-LABEL: @store_undef_mask_factor4(
340 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
341 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
342 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
343 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
344 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* %ptr to <4 x i32>*
345 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
346 ; NEON-NEXT: ret void
347 ; NO_NEON-LABEL: @store_undef_mask_factor4(
348 ; NO_NEON-NOT: @llvm.aarch64.neon
351 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
352 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
353 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
354 store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
358 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
359 ; NEON-LABEL: @load_illegal_factor2(
360 ; NEON-NOT: @llvm.aarch64.neon
362 ; NO_NEON-LABEL: @load_illegal_factor2(
363 ; NO_NEON-NOT: @llvm.aarch64.neon
366 %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
367 %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
371 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
372 ; NEON-LABEL: @store_illegal_factor2(
373 ; NEON-NOT: @llvm.aarch64.neon
375 ; NO_NEON-LABEL: @store_illegal_factor2(
376 ; NO_NEON-NOT: @llvm.aarch64.neon
379 %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
380 store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
384 define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
385 ; NEON-LABEL: @store_general_mask_factor4(
386 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
387 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
388 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
389 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
390 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
391 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
392 ; NEON-NEXT: ret void
393 ; NO_NEON-LABEL: @store_general_mask_factor4(
394 ; NO_NEON-NOT: @llvm.aarch64.neon
397 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
398 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
402 define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
403 ; NEON-LABEL: @store_general_mask_factor4_undefbeg(
404 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
405 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
406 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
407 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
408 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
409 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
410 ; NEON-NEXT: ret void
411 ; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
412 ; NO_NEON-NOT: @llvm.aarch64.neon
415 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
416 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
420 define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
421 ; NEON-LABEL: @store_general_mask_factor4_undefend(
422 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
423 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
424 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
425 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
426 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
427 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
428 ; NEON-NEXT: ret void
429 ; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
430 ; NO_NEON-NOT: @llvm.aarch64.neon
433 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
434 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
438 define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
439 ; NEON-LABEL: @store_general_mask_factor4_undefmid(
440 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
441 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
442 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
443 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
444 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
445 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
446 ; NEON-NEXT: ret void
447 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
448 ; NO_NEON-NOT: @llvm.aarch64.neon
451 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
452 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
456 define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
457 ; NEON-LABEL: @store_general_mask_factor4_undefmulti(
458 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
459 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
460 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
461 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
462 ; NEON-NEXT: [[TMP5:%.*]] = bitcast <8 x i32>* %ptr to <2 x i32>*
463 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]])
464 ; NEON-NEXT: ret void
465 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
466 ; NO_NEON-NOT: @llvm.aarch64.neon
469 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
470 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
474 define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
475 ; NEON-LABEL: @store_general_mask_factor3(
476 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
477 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
478 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
479 ; NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
480 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
481 ; NEON-NEXT: ret void
482 ; NO_NEON-LABEL: @store_general_mask_factor3(
483 ; NO_NEON-NOT: @llvm.aarch64.neon
486 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
487 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
491 define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
492 ; NEON-LABEL: @store_general_mask_factor3_undefmultimid(
493 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
494 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
495 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
496 ; NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
497 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
498 ; NEON-NEXT: ret void
499 ; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
500 ; NO_NEON-NOT: @llvm.aarch64.neon
503 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
504 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
508 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
509 ; NEON-LABEL: @store_general_mask_factor3_undef_fail(
510 ; NEON-NOT: @llvm.aarch64.neon
512 ; NO_NEON-LABEL: @store_general_mask_factor3_undef_fail(
513 ; NO_NEON-NOT: @llvm.aarch64.neon
516 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
517 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
521 define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
522 ; NEON-LABEL: @store_general_mask_factor3_undeflane(
523 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
524 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
525 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
526 ; NEON-NEXT: [[TMP4:%.*]] = bitcast <12 x i32>* %ptr to <4 x i32>*
527 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
528 ; NEON-NEXT: ret void
529 ; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
530 ; NO_NEON-NOT: @llvm.aarch64.neon
533 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
534 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
538 define void @store_general_mask_factor3_negativestart(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
539 ; NEON-LABEL: @store_general_mask_factor3_negativestart(
540 ; NEON-NOT: @llvm.aarch64.neon
542 ; NO_NEON-LABEL: @store_general_mask_factor3_negativestart(
543 ; NO_NEON-NOT: @llvm.aarch64.neon
546 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
547 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
551 @g = external global <4 x float>
553 ; The following does not give a valid interleaved store
554 ; NEON-LABEL: define void @no_interleave
555 ; NEON-NOT: call void @llvm.aarch64.neon.st2
556 ; NEON: shufflevector
559 ; NO_NEON-LABEL: define void @no_interleave
560 ; NO_NEON: shufflevector
563 define void @no_interleave(<4 x float> %a0) {
564 %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 3, i32 7, i32 undef>
565 store <4 x float> %v0, <4 x float>* @g, align 16
569 define void @load_factor2_wide2(<16 x i32>* %ptr) {
570 ; NEON-LABEL: @load_factor2_wide2(
571 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
572 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
573 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
574 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
575 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
576 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
577 ; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
578 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
579 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
580 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
581 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
582 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
583 ; NEON-NEXT: ret void
584 ; NO_NEON-LABEL: @load_factor2_wide2(
585 ; NO_NEON-NOT: @llvm.aarch64.neon
588 %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
589 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
590 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
594 define void @load_factor2_wide3(<24 x i32>* %ptr) {
595 ; NEON-LABEL: @load_factor2_wide3(
596 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
597 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
598 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
599 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
600 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
601 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
602 ; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
603 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
604 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
605 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
606 ; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
607 ; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
608 ; NEON-NEXT: [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]])
609 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1
610 ; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0
611 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
612 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
613 ; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
614 ; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
615 ; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
616 ; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
617 ; NEON-NEXT: ret void
618 ; NO_NEON-LABEL: @load_factor2_wide3(
619 ; NO_NEON-NOT: @llvm.aarch64.neon
622 %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
623 %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
624 %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
628 define void @load_factor3_wide(<24 x i32>* %ptr) {
629 ; NEON-LABEL: @load_factor3_wide(
630 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
631 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
632 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
633 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
634 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
635 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
636 ; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
637 ; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
638 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]])
639 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
640 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
641 ; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
642 ; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
643 ; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
644 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
645 ; NEON-NEXT: ret void
646 ; NO_NEON-LABEL: @load_factor3_wide(
647 ; NO_NEON-NOT: @llvm.aarch64.neon
650 %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
651 %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
652 %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
653 %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
657 define void @load_factor4_wide(<32 x i32>* %ptr) {
658 ; NEON-LABEL: @load_factor4_wide(
659 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
660 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
661 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
662 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
663 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
664 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
665 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
666 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
667 ; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
668 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]])
669 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
670 ; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
671 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
672 ; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
673 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
674 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
675 ; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
676 ; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
677 ; NEON-NEXT: ret void
678 ; NO_NEON-LABEL: @load_factor4_wide(
679 ; NO_NEON-NOT: @llvm.aarch64.neon
682 %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
683 %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
684 %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
685 %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
686 %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
690 define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
691 ; NEON-LABEL: @store_factor2_wide(
692 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
693 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
694 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
695 ; NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
696 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
697 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
698 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
699 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
700 ; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
701 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]])
702 ; NEON-NEXT: ret void
703 ; NO_NEON-LABEL: @store_factor2_wide(
706 %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
707 store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
711 define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
712 ; NEON-LABEL: @store_factor3_wide(
713 ; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
714 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
715 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
716 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
717 ; NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
718 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
719 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
720 ; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
721 ; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
722 ; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
723 ; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
724 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]])
725 ; NEON-NEXT: ret void
726 ; NO_NEON-LABEL: @store_factor3_wide(
729 %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
730 %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
731 %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
732 store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
736 define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
737 ; NEON-LABEL: @store_factor4_wide(
738 ; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
739 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
740 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
741 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
742 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
743 ; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
744 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]])
745 ; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
746 ; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
747 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
748 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
749 ; NEON-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
750 ; NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
751 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]])
752 ; NEON-NEXT: ret void
753 ; NO_NEON-LABEL: @store_factor4_wide(
754 ; NO_NEON-NOT: @llvm.aarch64.neon
757 %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
758 %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
759 %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
760 store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
764 define void @load_factor2_fp128(<4 x fp128>* %ptr) {
765 ; NEON-LABEL: @load_factor2_fp128(
766 ; NEON-NOT: @llvm.aarch64.neon
768 ; NO_NEON-LABEL: @load_factor2_fp128(
769 ; NO_NEON-NOT: @llvm.aarch64.neon
772 %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
773 %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
774 %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
778 define <4 x i1> @load_large_vector(<12 x i64 *>* %p) {
779 ; NEON-LABEL: @load_large_vector(
780 ; NEON: [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>*
781 ; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
782 ; NEON-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x i64*>
783 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
784 ; NEON-NEXT: [[TMP4:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x i64*>
785 ; NEON: [[LDN1:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>*
786 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 1
787 ; NEON-NEXT: [[TMP6:%.*]] = inttoptr <2 x i64> [[TMP5]] to <2 x i64*>
788 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 0
789 ; NEON-NEXT: [[TMP8:%.*]] = inttoptr <2 x i64> [[TMP7]] to <2 x i64*>
790 ; NEON-NEXT: shufflevector <2 x i64*> [[TMP2]], <2 x i64*> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
791 ; NEON-NEXT: shufflevector <2 x i64*> [[TMP4]], <2 x i64*> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
792 ; NO_NEON-LABEL: @load_large_vector(
793 ; NO_NEON-NOT: @llvm.aarch64.neon
796 %l = load <12 x i64 *>, <12 x i64 *>* %p
797 %s1 = shufflevector <12 x i64 *> %l, <12 x i64 *> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
798 %s2 = shufflevector <12 x i64 *> %l, <12 x i64 *> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
799 %ret = icmp ne <4 x i64 *> %s1, %s2