1 ; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NEON
2 ; RUN: opt < %s -mattr=-neon -interleaved-access -S | FileCheck %s -check-prefix=NO_NEON
3 ; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s -check-prefix=NEON
4 ; RUN: opt < %s -mattr=-neon -passes=interleaved-access -S | FileCheck %s -check-prefix=NO_NEON
6 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
7 target triple = "aarch64--linux-gnu"
9 define void @load_factor2(ptr %ptr) {
10 ; NEON-LABEL: @load_factor2(
11 ; NEON-NEXT: [[LDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %ptr)
12 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 1
13 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[LDN]], 0
15 ; NO_NEON-LABEL: @load_factor2(
16 ; NO_NEON-NOT: @llvm.aarch64.neon
19 %interleaved.vec = load <16 x i8>, ptr %ptr, align 4
20 %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
21 %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
25 define void @load_factor3(ptr %ptr) {
26 ; NEON-LABEL: @load_factor3(
27 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %ptr)
28 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
29 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
30 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
32 ; NO_NEON-LABEL: @load_factor3(
33 ; NO_NEON-NOT: @llvm.aarch64.neon
36 %interleaved.vec = load <12 x i32>, ptr %ptr, align 4
37 %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
38 %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
39 %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
43 define void @load_factor4(ptr %ptr) {
44 ; NEON-LABEL: @load_factor4(
45 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %ptr)
46 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
47 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
48 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
49 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
51 ; NO_NEON-LABEL: @load_factor4(
52 ; NO_NEON-NOT: @llvm.aarch64.neon
55 %interleaved.vec = load <16 x i32>, ptr %ptr, align 4
56 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
57 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
58 %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
59 %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
63 define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) {
64 ; NEON-LABEL: @store_factor2(
65 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
66 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
67 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr %ptr)
69 ; NO_NEON-LABEL: @store_factor2(
70 ; NO_NEON-NOT: @llvm.aarch64.neon
73 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
74 store <16 x i8> %interleaved.vec, ptr %ptr, align 4
78 define void @store_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
79 ; NEON-LABEL: @store_factor3(
80 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
81 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
82 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
83 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr %ptr)
85 ; NO_NEON-LABEL: @store_factor3(
86 ; NO_NEON-NOT: @llvm.aarch64.neon
89 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
90 %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
91 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
92 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
96 define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
97 ; NEON-LABEL: @store_factor4(
98 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
99 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
100 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
101 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
102 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr %ptr)
103 ; NEON-NEXT: ret void
104 ; NO_NEON-LABEL: @store_factor4(
105 ; NO_NEON-NOT: @llvm.aarch64.neon
108 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
109 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
110 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
111 store <16 x i32> %interleaved.vec, ptr %ptr, align 4
115 define void @load_ptrvec_factor2(ptr %ptr) {
116 ; NEON-LABEL: @load_ptrvec_factor2(
117 ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %ptr)
118 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
119 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr>
120 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
121 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x ptr>
122 ; NEON-NEXT: ret void
123 ; NO_NEON-LABEL: @load_ptrvec_factor2(
124 ; NO_NEON-NOT: @llvm.aarch64.neon
127 %interleaved.vec = load <4 x ptr>, ptr %ptr, align 4
128 %v0 = shufflevector <4 x ptr> %interleaved.vec, <4 x ptr> poison, <2 x i32> <i32 0, i32 2>
129 %v1 = shufflevector <4 x ptr> %interleaved.vec, <4 x ptr> poison, <2 x i32> <i32 1, i32 3>
133 define void @load_ptrvec_factor3(ptr %ptr) {
134 ; NEON-LABEL: @load_ptrvec_factor3(
135 ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %ptr)
136 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
137 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr>
138 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
139 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x ptr>
140 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
141 ; NEON-NEXT: [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr>
142 ; NEON-NEXT: ret void
143 ; NO_NEON-LABEL: @load_ptrvec_factor3(
144 ; NO_NEON-NOT: @llvm.aarch64.neon
147 %interleaved.vec = load <6 x ptr>, ptr %ptr, align 4
148 %v0 = shufflevector <6 x ptr> %interleaved.vec, <6 x ptr> poison, <2 x i32> <i32 0, i32 3>
149 %v1 = shufflevector <6 x ptr> %interleaved.vec, <6 x ptr> poison, <2 x i32> <i32 1, i32 4>
150 %v2 = shufflevector <6 x ptr> %interleaved.vec, <6 x ptr> poison, <2 x i32> <i32 2, i32 5>
154 define void @load_ptrvec_factor4(ptr %ptr) {
155 ; NEON-LABEL: @load_ptrvec_factor4(
156 ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %ptr)
157 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 3
158 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr>
159 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 2
160 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i64> [[TMP4]] to <2 x ptr>
161 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
162 ; NEON-NEXT: [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr>
163 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
164 ; NEON-NEXT: [[TMP9:%.*]] = inttoptr <2 x i64> [[TMP8]] to <2 x ptr>
165 ; NEON-NEXT: ret void
166 ; NO_NEON-LABEL: @load_ptrvec_factor4(
167 ; NO_NEON-NOT: @llvm.aarch64.neon
170 %interleaved.vec = load <8 x ptr>, ptr %ptr, align 4
171 %v0 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <2 x i32> <i32 0, i32 4>
172 %v1 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <2 x i32> <i32 1, i32 5>
173 %v2 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <2 x i32> <i32 2, i32 6>
174 %v3 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <2 x i32> <i32 3, i32 7>
178 define void @store_ptrvec_factor2(ptr %ptr, <2 x ptr> %v0, <2 x ptr> %v1) {
179 ; NEON-LABEL: @store_ptrvec_factor2(
180 ; NEON-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> %v0 to <2 x i64>
181 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <2 x ptr> %v1 to <2 x i64>
182 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
183 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
184 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr %ptr)
185 ; NEON-NEXT: ret void
186 ; NO_NEON-LABEL: @store_ptrvec_factor2(
187 ; NO_NEON-NOT: @llvm.aarch64.neon
190 %interleaved.vec = shufflevector <2 x ptr> %v0, <2 x ptr> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
191 store <4 x ptr> %interleaved.vec, ptr %ptr, align 4
195 define void @store_ptrvec_factor3(ptr %ptr, <2 x ptr> %v0, <2 x ptr> %v1, <2 x ptr> %v2) {
196 ; NEON-LABEL: @store_ptrvec_factor3(
197 ; NEON: [[TMP1:%.*]] = ptrtoint <4 x ptr> %s0 to <4 x i64>
198 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x ptr> %s1 to <4 x i64>
199 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
200 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
201 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
202 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr %ptr)
203 ; NEON-NEXT: ret void
204 ; NO_NEON-LABEL: @store_ptrvec_factor3(
205 ; NO_NEON-NOT: @llvm.aarch64.neon
208 %s0 = shufflevector <2 x ptr> %v0, <2 x ptr> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
209 %s1 = shufflevector <2 x ptr> %v2, <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
210 %interleaved.vec = shufflevector <4 x ptr> %s0, <4 x ptr> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
211 store <6 x ptr> %interleaved.vec, ptr %ptr, align 4
215 define void @store_ptrvec_factor4(ptr %ptr, <2 x ptr> %v0, <2 x ptr> %v1, <2 x ptr> %v2, <2 x ptr> %v3) {
216 ; NEON-LABEL: @store_ptrvec_factor4(
217 ; NEON: [[TMP1:%.*]] = ptrtoint <4 x ptr> %s0 to <4 x i64>
218 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x ptr> %s1 to <4 x i64>
219 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
220 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 2, i32 3>
221 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 4, i32 5>
222 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 6, i32 7>
223 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], ptr %ptr)
224 ; NEON-NEXT: ret void
225 ; NO_NEON-LABEL: @store_ptrvec_factor4(
226 ; NO_NEON-NOT: @llvm.aarch64.neon
229 %s0 = shufflevector <2 x ptr> %v0, <2 x ptr> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
230 %s1 = shufflevector <2 x ptr> %v2, <2 x ptr> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
231 %interleaved.vec = shufflevector <4 x ptr> %s0, <4 x ptr> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
232 store <8 x ptr> %interleaved.vec, ptr %ptr, align 4
236 define void @load_undef_mask_factor2(ptr %ptr) {
237 ; NEON-LABEL: @load_undef_mask_factor2(
238 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %ptr)
239 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
240 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
241 ; NEON-NEXT: ret void
242 ; NO_NEON-LABEL: @load_undef_mask_factor2(
243 ; NO_NEON-NOT: @llvm.aarch64.neon
246 %interleaved.vec = load <8 x i32>, ptr %ptr, align 4
247 %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
248 %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
252 define void @load_undef_mask_factor3(ptr %ptr) {
253 ; NEON-LABEL: @load_undef_mask_factor3(
254 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %ptr)
255 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
256 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
257 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
258 ; NEON-NEXT: ret void
259 ; NO_NEON-LABEL: @load_undef_mask_factor3(
260 ; NO_NEON-NOT: @llvm.aarch64.neon
263 %interleaved.vec = load <12 x i32>, ptr %ptr, align 4
264 %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
265 %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
266 %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
270 define void @load_undef_mask_factor4(ptr %ptr) {
271 ; NEON-LABEL: @load_undef_mask_factor4(
272 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %ptr)
273 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
274 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
275 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
276 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
277 ; NEON-NEXT: ret void
278 ; NO_NEON-LABEL: @load_undef_mask_factor4(
279 ; NO_NEON-NOT: @llvm.aarch64.neon
282 %interleaved.vec = load <16 x i32>, ptr %ptr, align 4
283 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
284 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
285 %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
286 %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
290 define void @store_undef_mask_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
291 ; NEON-LABEL: @store_undef_mask_factor2(
292 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
293 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
294 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], ptr %ptr)
295 ; NEON-NEXT: ret void
296 ; NO_NEON-LABEL: @store_undef_mask_factor2(
297 ; NO_NEON-NOT: @llvm.aarch64.neon
300 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
301 store <8 x i32> %interleaved.vec, ptr %ptr, align 4
305 define void @store_undef_mask_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
306 ; NEON-LABEL: @store_undef_mask_factor3(
307 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
308 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
309 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
310 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr %ptr)
311 ; NEON-NEXT: ret void
312 ; NO_NEON-LABEL: @store_undef_mask_factor3(
313 ; NO_NEON-NOT: @llvm.aarch64.neon
316 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
317 %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
318 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
319 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
323 define void @store_undef_mask_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
324 ; NEON-LABEL: @store_undef_mask_factor4(
325 ; NEON: [[TMP1:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
326 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
327 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
328 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
329 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr %ptr)
330 ; NEON-NEXT: ret void
331 ; NO_NEON-LABEL: @store_undef_mask_factor4(
332 ; NO_NEON-NOT: @llvm.aarch64.neon
335 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
336 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
337 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
338 store <16 x i32> %interleaved.vec, ptr %ptr, align 4
342 define void @load_illegal_factor2(ptr %ptr) nounwind {
343 ; NEON-LABEL: @load_illegal_factor2(
344 ; NEON-NOT: @llvm.aarch64.neon
346 ; NO_NEON-LABEL: @load_illegal_factor2(
347 ; NO_NEON-NOT: @llvm.aarch64.neon
350 %interleaved.vec = load <3 x float>, ptr %ptr, align 16
351 %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
355 define void @store_illegal_factor2(ptr %ptr, <3 x float> %v0) nounwind {
356 ; NEON-LABEL: @store_illegal_factor2(
357 ; NEON-NOT: @llvm.aarch64.neon
359 ; NO_NEON-LABEL: @store_illegal_factor2(
360 ; NO_NEON-NOT: @llvm.aarch64.neon
363 %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> poison, <3 x i32> <i32 0, i32 2, i32 undef>
364 store <3 x float> %interleaved.vec, ptr %ptr, align 16
368 define void @store_general_mask_factor4(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
369 ; NEON-LABEL: @store_general_mask_factor4(
370 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
371 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
372 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
373 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
374 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr %ptr)
375 ; NEON-NEXT: ret void
376 ; NO_NEON-LABEL: @store_general_mask_factor4(
377 ; NO_NEON-NOT: @llvm.aarch64.neon
380 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
381 store <8 x i32> %interleaved.vec, ptr %ptr, align 4
385 define void @store_general_mask_factor4_undefbeg(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
386 ; NEON-LABEL: @store_general_mask_factor4_undefbeg(
387 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
388 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
389 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
390 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
391 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr %ptr)
392 ; NEON-NEXT: ret void
393 ; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
394 ; NO_NEON-NOT: @llvm.aarch64.neon
397 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
398 store <8 x i32> %interleaved.vec, ptr %ptr, align 4
402 define void @store_general_mask_factor4_undefend(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
403 ; NEON-LABEL: @store_general_mask_factor4_undefend(
404 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
405 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
406 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
407 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
408 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr %ptr)
409 ; NEON-NEXT: ret void
410 ; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
411 ; NO_NEON-NOT: @llvm.aarch64.neon
414 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
415 store <8 x i32> %interleaved.vec, ptr %ptr, align 4
419 define void @store_general_mask_factor4_undefmid(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
420 ; NEON-LABEL: @store_general_mask_factor4_undefmid(
421 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
422 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
423 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
424 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
425 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr %ptr)
426 ; NEON-NEXT: ret void
427 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
428 ; NO_NEON-NOT: @llvm.aarch64.neon
431 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
432 store <8 x i32> %interleaved.vec, ptr %ptr, align 4
436 define void @store_general_mask_factor4_undefmulti(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
437 ; NEON-LABEL: @store_general_mask_factor4_undefmulti(
438 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
439 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
440 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
441 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
442 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr %ptr)
443 ; NEON-NEXT: ret void
444 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
445 ; NO_NEON-NOT: @llvm.aarch64.neon
448 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
449 store <8 x i32> %interleaved.vec, ptr %ptr, align 4
453 define void @store_general_mask_factor3(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
454 ; NEON-LABEL: @store_general_mask_factor3(
455 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
456 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
457 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
458 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr %ptr)
459 ; NEON-NEXT: ret void
460 ; NO_NEON-LABEL: @store_general_mask_factor3(
461 ; NO_NEON-NOT: @llvm.aarch64.neon
464 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
465 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
469 define void @store_general_mask_factor3_undefmultimid(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
470 ; NEON-LABEL: @store_general_mask_factor3_undefmultimid(
471 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
472 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
473 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
474 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr %ptr)
475 ; NEON-NEXT: ret void
476 ; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
477 ; NO_NEON-NOT: @llvm.aarch64.neon
480 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
481 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
485 define void @store_general_mask_factor3_undef_fail(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
486 ; NEON-LABEL: @store_general_mask_factor3_undef_fail(
487 ; NEON-NOT: @llvm.aarch64.neon
489 ; NO_NEON-LABEL: @store_general_mask_factor3_undef_fail(
490 ; NO_NEON-NOT: @llvm.aarch64.neon
493 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
494 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
498 define void @store_general_mask_factor3_undeflane(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
499 ; NEON-LABEL: @store_general_mask_factor3_undeflane(
500 ; NEON-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
501 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
502 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
503 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr %ptr)
504 ; NEON-NEXT: ret void
505 ; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
506 ; NO_NEON-NOT: @llvm.aarch64.neon
509 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
510 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
514 define void @store_general_mask_factor3_negativestart(ptr %ptr, <32 x i32> %v0, <32 x i32> %v1) {
515 ; NEON-LABEL: @store_general_mask_factor3_negativestart(
516 ; NEON-NOT: @llvm.aarch64.neon
518 ; NO_NEON-LABEL: @store_general_mask_factor3_negativestart(
519 ; NO_NEON-NOT: @llvm.aarch64.neon
522 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
523 store <12 x i32> %interleaved.vec, ptr %ptr, align 4
527 @g = external global <4 x float>
529 ; The following does not give a valid interleaved store
530 ; NEON-LABEL: define void @no_interleave
531 ; NEON-NOT: call void @llvm.aarch64.neon.st2
532 ; NEON: shufflevector
535 ; NO_NEON-LABEL: define void @no_interleave
536 ; NO_NEON: shufflevector
539 define void @no_interleave(<4 x float> %a0) {
540 %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 3, i32 7, i32 undef>
541 store <4 x float> %v0, ptr @g, align 16
545 define void @load_factor2_wide2(ptr %ptr) {
546 ; NEON-LABEL: @load_factor2_wide2(
547 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %ptr)
548 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
549 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
550 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr %ptr, i32 8
551 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP5]])
552 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
553 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
554 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
555 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
556 ; NEON-NEXT: ret void
557 ; NO_NEON-LABEL: @load_factor2_wide2(
558 ; NO_NEON-NOT: @llvm.aarch64.neon
561 %interleaved.vec = load <16 x i32>, ptr %ptr, align 4
562 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
563 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
567 define void @load_factor2_wide3(ptr %ptr) {
568 ; NEON-LABEL: @load_factor2_wide3(
569 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR:%.*]])
570 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
571 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
572 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR]], i32 8
573 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP5]])
574 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
575 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
576 ; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP5]], i32 8
577 ; NEON-NEXT: [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[TMP9]])
578 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1
579 ; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0
580 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
581 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
582 ; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
583 ; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
584 ; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
585 ; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
586 ; NEON-NEXT: ret void
587 ; NO_NEON-LABEL: @load_factor2_wide3(
588 ; NO_NEON-NOT: @llvm.aarch64.neon
591 %interleaved.vec = load <24 x i32>, ptr %ptr, align 4
592 %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
593 %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
597 define void @load_factor3_wide(ptr %ptr) {
598 ; NEON-LABEL: @load_factor3_wide(
599 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %ptr)
600 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
601 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
602 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
603 ; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr %ptr, i32 12
604 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[TMP6]])
605 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
606 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
607 ; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
608 ; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
609 ; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
610 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
611 ; NEON-NEXT: ret void
612 ; NO_NEON-LABEL: @load_factor3_wide(
613 ; NO_NEON-NOT: @llvm.aarch64.neon
616 %interleaved.vec = load <24 x i32>, ptr %ptr, align 4
617 %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
618 %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
619 %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
623 define void @load_factor4_wide(ptr %ptr) {
624 ; NEON-LABEL: @load_factor4_wide(
625 ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %ptr)
626 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
627 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
628 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
629 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
630 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr %ptr, i32 16
631 ; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[TMP7]])
632 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
633 ; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
634 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
635 ; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
636 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
637 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
638 ; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
639 ; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
640 ; NEON-NEXT: ret void
641 ; NO_NEON-LABEL: @load_factor4_wide(
642 ; NO_NEON-NOT: @llvm.aarch64.neon
645 %interleaved.vec = load <32 x i32>, ptr %ptr, align 4
646 %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
647 %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
648 %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
649 %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
653 define void @store_factor2_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1) {
654 ; NEON-LABEL: @store_factor2_wide(
655 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
656 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
657 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr %ptr)
658 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
659 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
660 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr %ptr, i32 8
661 ; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], ptr [[TMP7]])
662 ; NEON-NEXT: ret void
663 ; NO_NEON-LABEL: @store_factor2_wide(
666 %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
667 store <16 x i32> %interleaved.vec, ptr %ptr, align 4
671 define void @store_factor3_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
672 ; NEON-LABEL: @store_factor3_wide(
673 ; NEON: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
674 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
675 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
676 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr %ptr)
677 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
678 ; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
679 ; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
680 ; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr %ptr, i32 12
681 ; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr [[TMP9]])
682 ; NEON-NEXT: ret void
683 ; NO_NEON-LABEL: @store_factor3_wide(
686 %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
687 %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
688 %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
689 store <24 x i32> %interleaved.vec, ptr %ptr, align 4
693 define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
694 ; NEON-LABEL: @store_factor4_wide(
695 ; NEON: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
696 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
697 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
698 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
699 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr %ptr)
700 ; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
701 ; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
702 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
703 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
704 ; NEON-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr %ptr, i32 16
705 ; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], ptr [[TMP11]])
706 ; NEON-NEXT: ret void
707 ; NO_NEON-LABEL: @store_factor4_wide(
708 ; NO_NEON-NOT: @llvm.aarch64.neon
711 %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
712 %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
713 %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
714 store <32 x i32> %interleaved.vec, ptr %ptr, align 4
718 define void @load_factor2_fp128(ptr %ptr) {
719 ; NEON-LABEL: @load_factor2_fp128(
720 ; NEON-NOT: @llvm.aarch64.neon
722 ; NO_NEON-LABEL: @load_factor2_fp128(
723 ; NO_NEON-NOT: @llvm.aarch64.neon
726 %interleaved.vec = load <4 x fp128>, ptr %ptr, align 16
727 %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
728 %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
732 define <4 x i1> @load_large_vector(ptr %p) {
733 ; NEON-LABEL: @load_large_vector(
734 ; NEON: [[LDN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr
735 ; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 1
736 ; NEON-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x ptr>
737 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN]], 0
738 ; NEON-NEXT: [[TMP4:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x ptr>
739 ; NEON: [[LDN1:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr
740 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 1
741 ; NEON-NEXT: [[TMP6:%.*]] = inttoptr <2 x i64> [[TMP5]] to <2 x ptr>
742 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[LDN1]], 0
743 ; NEON-NEXT: [[TMP8:%.*]] = inttoptr <2 x i64> [[TMP7]] to <2 x ptr>
744 ; NEON-NEXT: shufflevector <2 x ptr> [[TMP2]], <2 x ptr> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
745 ; NEON-NEXT: shufflevector <2 x ptr> [[TMP4]], <2 x ptr> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
746 ; NO_NEON-LABEL: @load_large_vector(
747 ; NO_NEON-NOT: @llvm.aarch64.neon
750 %l = load <12 x ptr>, ptr %p
751 %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
752 %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
753 %ret = icmp ne <4 x ptr> %s1, %s2