; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s
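
; Check that the add of an invariant splat (<i32 6, i32 6, i32 6, i32 6>) to the
; vector induction feeding the gather in the conditional lower.block is pushed
; out into vector.ph (PushedOutAdd), and that the gather itself is lowered to
; the MVE vldr.gather.offset intrinsic.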
define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_add_sub_block(
; CHECK-NEXT:  vector.ph:
; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK:       lower.block:
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
; CHECK:       vector.body.end:
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.ph, %vector.body.end
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
  %0 = icmp eq i32 %index, 48
  br i1 %0, label %lower.block, label %end

lower.block:                                      ; preds = %vector.body
  %1 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6>
  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  br label %vector.body.end

vector.body.end:                                  ; preds = %lower.block
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %end, label %vector.body

end:
  ret void
}
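
; Same pattern, but the offset is vec.ind * 3 + 6. Check that the start value
; is computed once in vector.ph (PushedOutMul, PushedOutAdd) and that the
; induction is then stepped by the pre-scaled stride (Product) in
; vector.body.end rather than redoing the multiply every iteration.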
define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_block(
; CHECK-NEXT:  vector.ph:
; CHECK-NEXT:    [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK:       lower.block:
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
; CHECK:       vector.body.end:
; CHECK-NEXT:    [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.ph, %vector.body.end
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
  %0 = icmp eq i32 %index, 48
  br i1 %0, label %lower.block, label %end

lower.block:                                      ; preds = %vector.body
  %1 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %2 = add <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6>
  %3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %4 = getelementptr inbounds i32, i32* %dst, i32 %index
  %5 = bitcast i32* %4 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %5, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  br label %vector.body.end

vector.body.end:                                  ; preds = %lower.block
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %end, label %vector.body

end:
  ret void
}
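
; Here the offset computation lives in an inner loop (vector.2.body) nested
; inside vector.body. Check that the mul and add stay inside the inner loop
; rather than being pushed out past it, while the gather is still lowered to
; the vldr.gather.offset intrinsic.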
define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_loop(
; CHECK-NEXT:  vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT:    br label [[VECTOR_2_PH:%.*]]
; CHECK:       vector.2.ph:
; CHECK-NEXT:    br label [[VECTOR_2_BODY:%.*]]
; CHECK:       vector.2.body:
; CHECK-NEXT:    [[TMP0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[TMP0]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    br label [[VECTOR_2_BODY_END:%.*]]
; CHECK:       vector.2.body.end:
; CHECK-NEXT:    [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16
; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
; CHECK:       vector.body.end:
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT:    br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.ph, %vector.body.end
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
  br label %vector.2.ph

vector.2.ph:                                      ; preds = %vector.body
  br label %vector.2.body

vector.2.body:                                    ; preds = %vector.2.ph, %vector.2.body.end
  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
  %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %dst, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
  br label %vector.2.body.end

vector.2.body.end:                                ; preds = %vector.2.body
  %index.2.next = add i32 %index, 4
  %5 = icmp eq i32 %index.2.next, 16
  br i1 %5, label %vector.body.end, label %vector.2.body

vector.body.end:                                  ; preds = %vector.2.body.end
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %end, label %vector.body

end:
  ret void
}
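
; The offset is vec.ind * 3 + vec.ind: the add's second operand is not loop
; invariant, so nothing can be pushed out of the loop. Check that the gather is
; still lowered to the intrinsic with the offset computed in place.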
define arm_aapcs_vfpcc void @invariant_add(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @invariant_add(
; CHECK-NEXT:  vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[L0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[L1:%.*]] = add <4 x i32> [[L0]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[L1]], i32 32, i32 2, i32 1)
; CHECK-NEXT:    [[L3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[L4:%.*]] = bitcast i32* [[L3]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP0]], <4 x i32>* [[L4]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT:    [[L5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT:    br i1 [[L5]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %l0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %l1 = add <4 x i32> %l0, %vec.ind
  %l2 = getelementptr inbounds i32, i32* %data, <4 x i32> %l1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %l2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %l3 = getelementptr inbounds i32, i32* %dst, i32 %index
  %l4 = bitcast i32* %l3 to <4 x i32>*
  store <4 x i32> %wide.masked.gather, <4 x i32>* %l4, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  %l5 = icmp eq i32 %index.next, %n.vec
  br i1 %l5, label %end, label %vector.body

end:
  ret void
}

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)