1 ; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefixes=NEON,ALL
2 ; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefixes=NO_NEON,ALL
4 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
5 target triple = "arm---eabi"
7 define void @load_factor2(<16 x i8>* %ptr) {
8 ; NEON-LABEL: @load_factor2(
9 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
10 ; NEON-NEXT: [[VLDN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* [[TMP1]], i32 4)
11 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 1
12 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLDN]], 0
14 ; NO_NEON-LABEL: @load_factor2(
15 ; NO_NEON-NOT: @llvm.arm.neon
18 %interleaved.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
19 %v0 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20 %v1 = shufflevector <16 x i8> %interleaved.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
24 define void @load_factor3(<6 x i32>* %ptr) {
25 ; NEON-LABEL: @load_factor3(
26 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <6 x i32>* %ptr to i8*
27 ; NEON-NEXT: [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
28 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
29 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
30 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
32 ; NO_NEON-LABEL: @load_factor3(
33 ; NO_NEON-NOT: @llvm.arm.neon
36 %interleaved.vec = load <6 x i32>, <6 x i32>* %ptr, align 4
37 %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
38 %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
39 %v2 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
43 define void @load_factor4(<16 x i32>* %ptr) {
44 ; NEON-LABEL: @load_factor4(
45 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
46 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
47 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
48 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
49 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
50 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
52 ; NO_NEON-LABEL: @load_factor4(
53 ; NO_NEON-NOT: @llvm.arm.neon
56 %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
57 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
58 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
59 %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
60 %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
64 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
65 ; NEON-LABEL: @store_factor2(
66 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* %ptr to i8*
67 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
68 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
69 ; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 4)
71 ; NO_NEON-LABEL: @store_factor2(
72 ; NO_NEON-NOT: @llvm.arm.neon
75 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
76 store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
80 define void @store_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
81 ; NEON-LABEL: @store_factor3(
82 ; NEON: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
83 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
84 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
85 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
86 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
88 ; NO_NEON-LABEL: @store_factor3(
89 ; NO_NEON-NOT: @llvm.arm.neon
92 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
93 %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
94 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
95 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
99 define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
100 ; NEON-LABEL: @store_factor4(
101 ; NEON: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
102 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
103 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
104 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
105 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
106 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
107 ; NEON-NEXT: ret void
108 ; NO_NEON-LABEL: @store_factor4(
109 ; NO_NEON-NOT: @llvm.arm.neon
112 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
113 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
114 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
115 store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
119 define void @load_ptrvec_factor2(<4 x i32*>* %ptr) {
120 ; NEON-LABEL: @load_ptrvec_factor2(
121 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <4 x i32*>* %ptr to i8*
122 ; NEON-NEXT: [[VLDN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
123 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLDN]], 0
124 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
125 ; NEON-NEXT: ret void
126 ; NO_NEON-LABEL: @load_ptrvec_factor2(
127 ; NO_NEON-NOT: @llvm.arm.neon
130 %interleaved.vec = load <4 x i32*>, <4 x i32*>* %ptr, align 4
131 %v0 = shufflevector <4 x i32*> %interleaved.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
135 define void @load_ptrvec_factor3(<6 x i32*>* %ptr) {
136 ; NEON-LABEL: @load_ptrvec_factor3(
137 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <6 x i32*>* %ptr to i8*
138 ; NEON-NEXT: [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
139 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
140 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
141 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
142 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
143 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
144 ; NEON-NEXT: [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
145 ; NEON-NEXT: ret void
146 ; NO_NEON-LABEL: @load_ptrvec_factor3(
147 ; NO_NEON-NOT: @llvm.arm.neon
150 %interleaved.vec = load <6 x i32*>, <6 x i32*>* %ptr, align 4
151 %v0 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 0, i32 3>
152 %v1 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
153 %v2 = shufflevector <6 x i32*> %interleaved.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
157 define void @load_ptrvec_factor4(<8 x i32*>* %ptr) {
158 ; NEON-LABEL: @load_ptrvec_factor4(
159 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to i8*
160 ; NEON-NEXT: [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
161 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 3
162 ; NEON-NEXT: [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x i32*>
163 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
164 ; NEON-NEXT: [[TMP5:%.*]] = inttoptr <2 x i32> [[TMP4]] to <2 x i32*>
165 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
166 ; NEON-NEXT: [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x i32*>
167 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
168 ; NEON-NEXT: [[TMP9:%.*]] = inttoptr <2 x i32> [[TMP8]] to <2 x i32*>
169 ; NEON-NEXT: ret void
170 ; NO_NEON-LABEL: @load_ptrvec_factor4(
171 ; NO_NEON-NOT: @llvm.arm.neon
174 %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4
175 %v0 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 0, i32 4>
176 %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
177 %v2 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 2, i32 6>
178 %v3 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
182 define void @store_ptrvec_factor2(<4 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
183 ; NEON-LABEL: @store_ptrvec_factor2(
184 ; NEON-NEXT: [[TMP1:%.*]] = ptrtoint <2 x i32*> %v0 to <2 x i32>
185 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <2 x i32*> %v1 to <2 x i32>
186 ; NEON-NEXT: [[TMP3:%.*]] = bitcast <4 x i32*>* %ptr to i8*
187 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
188 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
189 ; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
190 ; NEON-NEXT: ret void
191 ; NO_NEON-LABEL: @store_ptrvec_factor2(
192 ; NO_NEON-NOT: @llvm.arm.neon
195 %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
196 store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr, align 4
200 define void @store_ptrvec_factor3(<6 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
201 ; NEON-LABEL: @store_ptrvec_factor3(
202 ; NEON: [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
203 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
204 ; NEON-NEXT: [[TMP3:%.*]] = bitcast <6 x i32*>* %ptr to i8*
205 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
206 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
207 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
208 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], i32 4)
209 ; NEON-NEXT: ret void
210 ; NO_NEON-LABEL: @store_ptrvec_factor3(
211 ; NO_NEON-NOT: @llvm.arm.neon
214 %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
215 %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
216 %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
217 store <6 x i32*> %interleaved.vec, <6 x i32*>* %ptr, align 4
221 define void @store_ptrvec_factor4(<8 x i32*>* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
222 ; NEON-LABEL: @store_ptrvec_factor4(
223 ; NEON: [[TMP1:%.*]] = ptrtoint <4 x i32*> %s0 to <4 x i32>
224 ; NEON-NEXT: [[TMP2:%.*]] = ptrtoint <4 x i32*> %s1 to <4 x i32>
225 ; NEON-NEXT: [[TMP3:%.*]] = bitcast <8 x i32*>* %ptr to i8*
226 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 0, i32 1>
227 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 2, i32 3>
228 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 4, i32 5>
229 ; NEON-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <2 x i32> <i32 6, i32 7>
230 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i32 4)
231 ; NEON-NEXT: ret void
232 ; NO_NEON-LABEL: @store_ptrvec_factor4(
233 ; NO_NEON-NOT: @llvm.arm.neon
236 %s0 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
237 %s1 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
238 %interleaved.vec = shufflevector <4 x i32*> %s0, <4 x i32*> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
239 store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4
243 define void @load_undef_mask_factor2(<8 x i32>* %ptr) {
244 ; NEON-LABEL: @load_undef_mask_factor2(
245 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
246 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
247 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
248 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
249 ; NEON-NEXT: ret void
250 ; NO_NEON-LABEL: @load_undef_mask_factor2(
251 ; NO_NEON-NOT: @llvm.arm.neon
254 %interleaved.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
255 %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
256 %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
260 define void @load_undef_mask_factor3(<12 x i32>* %ptr) {
261 ; NEON-LABEL: @load_undef_mask_factor3(
262 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
263 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
264 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
265 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
266 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
267 ; NEON-NEXT: ret void
268 ; NO_NEON-LABEL: @load_undef_mask_factor3(
269 ; NO_NEON-NOT: @llvm.arm.neon
272 %interleaved.vec = load <12 x i32>, <12 x i32>* %ptr, align 4
273 %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
274 %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
275 %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
279 define void @load_undef_mask_factor4(<16 x i32>* %ptr) {
280 ; NEON-LABEL: @load_undef_mask_factor4(
281 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
282 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
283 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
284 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
285 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
286 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
287 ; NEON-NEXT: ret void
288 ; NO_NEON-LABEL: @load_undef_mask_factor4(
289 ; NO_NEON-NOT: @llvm.arm.neon
292 %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
293 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
294 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
295 %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
296 %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
300 define void @store_undef_mask_factor2(<8 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
301 ; NEON-LABEL: @store_undef_mask_factor2(
302 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
303 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
304 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
305 ; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
306 ; NEON-NEXT: ret void
307 ; NO_NEON-LABEL: @store_undef_mask_factor2(
308 ; NO_NEON-NOT: @llvm.arm.neon
311 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
312 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
316 define void @store_undef_mask_factor3(<12 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
317 ; NEON-LABEL: @store_undef_mask_factor3(
318 ; NEON: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
319 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
320 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
321 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
322 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
323 ; NEON-NEXT: ret void
324 ; NO_NEON-LABEL: @store_undef_mask_factor3(
325 ; NO_NEON-NOT: @llvm.arm.neon
328 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
329 %s1 = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
330 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
331 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
335 define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
336 ; NEON-LABEL: @store_undef_mask_factor4(
337 ; NEON: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i8*
338 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
340 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
341 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %s0, <8 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
342 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
343 ; NEON-NEXT: ret void
344 ; NO_NEON-LABEL: @store_undef_mask_factor4(
345 ; NO_NEON-NOT: @llvm.arm.neon
348 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
349 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
350 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
351 store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
355 define void @load_address_space(<4 x i32> addrspace(1)* %ptr) {
356 ; NEON-LABEL: @load_address_space(
357 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
358 ; NEON-NEXT: [[VLDN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p1i8(i8 addrspace(1)* [[TMP1]], i32 0)
359 ; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 2
360 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 1
361 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLDN]], 0
362 ; NEON-NEXT: ret void
363 ; NO_NEON-LABEL: @load_address_space(
364 ; NO_NEON-NOT: @llvm.arm.neon
367 %interleaved.vec = load <4 x i32>, <4 x i32> addrspace(1)* %ptr
368 %v0 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
369 %v1 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <2 x i32> <i32 1, i32 4>
370 %v2 = shufflevector <4 x i32> %interleaved.vec, <4 x i32> undef, <2 x i32> <i32 2, i32 5>
374 define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2 x i32> %v1) {
375 ; NEON-LABEL: @store_address_space(
376 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> addrspace(1)* %ptr to i8 addrspace(1)*
377 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 1>
378 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 3>
379 ; NEON-NEXT: call void @llvm.arm.neon.vst2.p1i8.v2i32(i8 addrspace(1)* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 0)
380 ; NEON-NEXT: ret void
381 ; NO_NEON-LABEL: @store_address_space(
382 ; NO_NEON-NOT: @llvm.arm.neon
385 %interleaved.vec = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
386 store <4 x i32> %interleaved.vec, <4 x i32> addrspace(1)* %ptr
390 define void @load_f16_factor2(<8 x half>* %ptr) {
391 ; ALL-LABEL: @load_f16_factor2(
392 ; ALL-NOT: @llvm.arm.neon
395 %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4
396 %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
397 %v1 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
401 define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) {
402 ; ALL-LABEL: @store_f16_factor2(
403 ; ALL-NOT: @llvm.arm.neon
406 %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
407 store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4
411 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
412 ; ALL-LABEL: @load_illegal_factor2(
413 ; ALL-NOT: @llvm.arm.neon
416 %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
417 %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
421 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
422 ; ALL-LABEL: @store_illegal_factor2(
423 ; ALL-NOT: @llvm.arm.neon
426 %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
427 store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
431 define void @store_general_mask_factor4(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
432 ; NEON-LABEL: @store_general_mask_factor4(
433 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
434 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
435 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
436 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
437 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
438 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
439 ; NEON-NEXT: ret void
440 ; NO_NEON-LABEL: @store_general_mask_factor4(
441 ; NO_NEON-NOT: @llvm.arm.neon
444 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
445 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
449 define void @store_general_mask_factor4_undefbeg(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
450 ; NEON-LABEL: @store_general_mask_factor4_undefbeg(
451 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
452 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
453 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
454 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
455 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
456 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
457 ; NEON-NEXT: ret void
458 ; NO_NEON-LABEL: @store_general_mask_factor4_undefbeg(
459 ; NO_NEON-NOT: @llvm.arm.neon
462 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 undef, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 9>
463 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
467 define void @store_general_mask_factor4_undefend(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
468 ; NEON-LABEL: @store_general_mask_factor4_undefend(
469 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
470 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
471 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
472 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
473 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
474 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
475 ; NEON-NEXT: ret void
476 ; NO_NEON-LABEL: @store_general_mask_factor4_undefend(
477 ; NO_NEON-NOT: @llvm.arm.neon
480 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 16, i32 32, i32 8, i32 5, i32 17, i32 33, i32 undef>
481 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
485 define void @store_general_mask_factor4_undefmid(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
486 ; NEON-LABEL: @store_general_mask_factor4_undefmid(
487 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
488 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
489 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 16, i32 17>
490 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 32, i32 33>
491 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
492 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
493 ; NEON-NEXT: ret void
494 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmid(
495 ; NO_NEON-NOT: @llvm.arm.neon
498 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 32, i32 8, i32 5, i32 17, i32 undef, i32 9>
499 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
503 define void @store_general_mask_factor4_undefmulti(<8 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
504 ; NEON-LABEL: @store_general_mask_factor4_undefmulti(
505 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* %ptr to i8*
506 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 4, i32 5>
507 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
508 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 0, i32 1>
509 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <2 x i32> <i32 8, i32 9>
510 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
511 ; NEON-NEXT: ret void
512 ; NO_NEON-LABEL: @store_general_mask_factor4_undefmulti(
513 ; NO_NEON-NOT: @llvm.arm.neon
516 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 8, i32 undef, i32 undef, i32 undef, i32 9>
517 store <8 x i32> %interleaved.vec, <8 x i32>* %ptr, align 4
521 define void @store_general_mask_factor3(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
522 ; NEON-LABEL: @store_general_mask_factor3(
523 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
524 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
525 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
526 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
527 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
528 ; NEON-NEXT: ret void
529 ; NO_NEON-LABEL: @store_general_mask_factor3(
530 ; NO_NEON-NOT: @llvm.arm.neon
533 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 5, i32 33, i32 17, i32 6, i32 34, i32 18, i32 7, i32 35, i32 19>
534 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
538 define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
539 ; NEON-LABEL: @store_general_mask_factor3_undefmultimid(
540 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
541 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
542 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
543 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
544 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
545 ; NEON-NEXT: ret void
546 ; NO_NEON-LABEL: @store_general_mask_factor3_undefmultimid(
547 ; NO_NEON-NOT: @llvm.arm.neon
550 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
551 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
555 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
556 ; ALL-LABEL: @store_general_mask_factor3_undef_fail(
557 ; ALL-NOT: @llvm.arm.neon
560 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
561 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
565 define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
566 ; NEON-LABEL: @store_general_mask_factor3_undeflane(
567 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
568 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
569 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
570 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
571 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
572 ; NEON-NEXT: ret void
573 ; NO_NEON-LABEL: @store_general_mask_factor3_undeflane(
574 ; NO_NEON-NOT: @llvm.arm.neon
577 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
578 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
582 define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
583 ; ALL-LABEL: @store_general_mask_factor3_endstart_fail(
584 ; ALL-NOT: @llvm.arm.neon
587 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
588 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
592 define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
593 ; NEON-LABEL: @store_general_mask_factor3_endstart_pass(
594 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
595 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
596 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
597 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
598 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
599 ; NEON-NEXT: ret void
600 ; NO_NEON-LABEL: @store_general_mask_factor3_endstart_pass(
601 ; NO_NEON-NOT: @llvm.arm.neon
604 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 7, i32 35, i32 19>
605 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
609 define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
610 ; ALL-LABEL: @store_general_mask_factor3_midstart_fail(
611 ; ALL-NOT: @llvm.arm.neon
614 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
615 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
619 define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
620 ; NEON-LABEL: @store_general_mask_factor3_midstart_pass(
621 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <12 x i32>* %ptr to i8*
622 ; NEON-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
623 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 32, i32 33, i32 34, i32 35>
624 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <32 x i32> %v0, <32 x i32> %v1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
625 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
626 ; NEON-NEXT: ret void
627 ; NO_NEON-LABEL: @store_general_mask_factor3_midstart_pass(
628 ; NO_NEON-NOT: @llvm.arm.neon
631 %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 1, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
632 store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
636 @g = external global <4 x float>
638 ; The following does not give a valid interleaved store
639 ; ALL-LABEL: define void @no_interleave
640 ; ALL-NOT: call void @llvm.arm.neon.vst2
644 define void @no_interleave(<4 x float> %a0) {
645 %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
646 store <4 x float> %v0, <4 x float>* @g, align 16
650 define void @load_factor2_wide2(<16 x i32>* %ptr) {
651 ; NEON-LABEL: @load_factor2_wide2(
652 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
653 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
654 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
655 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
656 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
657 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
658 ; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
659 ; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
660 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
661 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
662 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
663 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
664 ; NEON-NEXT: ret void
665 ; NO_NEON-LABEL: @load_factor2_wide2(
666 ; NO_NEON-NOT: @llvm.arm.neon
669 %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
670 %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
671 %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
675 define void @load_factor2_wide3(<24 x i32>* %ptr) {
676 ; NEON-LABEL: @load_factor2_wide3(
677 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
678 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
679 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
680 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
681 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
682 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
683 ; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
684 ; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
685 ; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
686 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
687 ; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
688 ; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
689 ; NEON-NEXT: [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
690 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
691 ; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
692 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
693 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
694 ; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
695 ; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
696 ; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
697 ; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
698 ; NEON-NEXT: ret void
699 ; NO_NEON-LABEL: @load_factor2_wide3(
700 ; NO_NEON-NOT: @llvm.arm.neon
703 %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
704 %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
705 %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
709 define void @load_factor3_wide(<24 x i32>* %ptr) {
710 ; NEON-LABEL: @load_factor3_wide(
711 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
712 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
713 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
714 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
715 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
716 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
717 ; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
718 ; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
719 ; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
720 ; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
721 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
722 ; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
723 ; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
724 ; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
725 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
726 ; NEON-NEXT: ret void
727 ; NO_NEON-LABEL: @load_factor3_wide(
728 ; NO_NEON-NOT: @llvm.arm.neon
731 %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
732 %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
733 %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
734 %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
738 define void @load_factor4_wide(<32 x i32>* %ptr) {
739 ; NEON-LABEL: @load_factor4_wide(
740 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
741 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
742 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
743 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
744 ; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
745 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
746 ; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
747 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
748 ; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
749 ; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
750 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
751 ; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
752 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
753 ; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
754 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
755 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
756 ; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
757 ; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
758 ; NEON-NEXT: ret void
759 ; NO_NEON-LABEL: @load_factor4_wide(
760 ; NO_NEON-NOT: @llvm.arm.neon
763 %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
764 %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
765 %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
766 %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
767 %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
771 define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
772 ; NEON-LABEL: @store_factor2_wide(
773 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
774 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
775 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
776 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
777 ; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
778 ; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
779 ; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
780 ; NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
781 ; NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
782 ; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
783 ; NEON-NEXT: ret void
784 ; NO_NEON-LABEL: @store_factor2_wide(
785 ; NO_NEON-NOT: @llvm.arm.neon
788 %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
789 store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
793 define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
794 ; NEON-LABEL: @store_factor3_wide(
795 ; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
796 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
797 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
798 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
799 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
800 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
801 ; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
802 ; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
803 ; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
804 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
805 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
806 ; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
807 ; NEON-NEXT: ret void
808 ; NO_NEON-LABEL: @store_factor3_wide(
809 ; NO_NEON-NOT: @llvm.arm.neon
812 %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
813 %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
814 %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
815 store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
819 define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
820 ; NEON-LABEL: @store_factor4_wide(
821 ; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
822 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
823 ; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
824 ; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
825 ; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
826 ; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
827 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
828 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
829 ; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
830 ; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
831 ; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
832 ; NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
833 ; NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
834 ; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
835 ; NEON-NEXT: ret void
836 ; NO_NEON-LABEL: @store_factor4_wide(
837 ; NO_NEON-NOT: @llvm.arm.neon
840 %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
841 %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
842 %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
843 store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
847 define void @load_factor2_fp128(<4 x fp128>* %ptr) {
848 ; ALL-LABEL: @load_factor2_fp128(
849 ; ALL-NOT: @llvm.arm.neon
852 %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
853 %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
854 %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
858 define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
859 ; NEON-LABEL: @load_factor2_wide_pointer(
860 ; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i32*
861 ; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
862 ; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
863 ; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
864 ; NEON-NEXT: [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
865 ; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
866 ; NEON-NEXT: [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
867 ; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
868 ; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
869 ; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
870 ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
871 ; NEON-NEXT: [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
872 ; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
873 ; NEON-NEXT: [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
874 ; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
875 ; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
876 ; NEON-NEXT: ret void
877 ; NO_NEON-LABEL: @load_factor2_wide_pointer(
878 ; NO_NEON-NOT: @llvm.arm.neon
881 %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
882 %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
883 %v1 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>