1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
; Masked scatter of <4 x i32> with a uniform +4 offset increment and a
; half-active lane mask (<i1 1,0,1,0>). Expected lowering stays in MVE scatter
; form: the constant increment folds into a vector add, the lane mask becomes
; the immediate #3855 (0x0f0f, one active nibble per true lane) moved into p0
; via vmsr, and the store issues as a predicated vstrwt.32 with uxtw #2
; (offsets scaled as i32 element indices).
5 define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, ptr %dst, <4 x i32> %offs) {
6 ; CHECK-LABEL: scatter_inc_minipred_4i32:
8 ; CHECK-NEXT: movs r1, #4
9 ; CHECK-NEXT: movw r2, #3855
10 ; CHECK-NEXT: vadd.i32 q1, q1, r1
11 ; CHECK-NEXT: vmsr p0, r2
13 ; CHECK-NEXT: vstrwt.32 q0, [r0, q1, uxtw #2]
15 %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
16 %2 = getelementptr inbounds i32, ptr %dst, <4 x i32> %1
17 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
; All-true scatter of <8 x i16> through <8 x i32> offsets (+8 increment).
; This shape has no single MVE scatter instruction, so the expected lowering
; scalarizes: offsets are scaled (vshl #1), biased by base+16 (8 elements * 2
; bytes), the eight addresses are moved to GPRs, and each lane is stored with
; an individual strh.
21 define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, ptr %dst, <8 x i32> %offs) {
22 ; CHECK-LABEL: scatter_inc_mini_8i16:
24 ; CHECK-NEXT: .save {r4, r5, r6, lr}
25 ; CHECK-NEXT: push {r4, r5, r6, lr}
26 ; CHECK-NEXT: vshl.i32 q1, q1, #1
27 ; CHECK-NEXT: mov.w r12, #16
28 ; CHECK-NEXT: vadd.i32 q1, q1, r0
29 ; CHECK-NEXT: vmov.u16 r6, q0[0]
30 ; CHECK-NEXT: vadd.i32 q1, q1, r12
31 ; CHECK-NEXT: vmov r2, r3, d2
32 ; CHECK-NEXT: vmov r1, lr, d3
33 ; CHECK-NEXT: vshl.i32 q1, q2, #1
34 ; CHECK-NEXT: vadd.i32 q1, q1, r0
35 ; CHECK-NEXT: vadd.i32 q1, q1, r12
36 ; CHECK-NEXT: vmov r0, r12, d2
37 ; CHECK-NEXT: vmov r4, r5, d3
38 ; CHECK-NEXT: strh r6, [r2]
39 ; CHECK-NEXT: vmov.u16 r2, q0[1]
40 ; CHECK-NEXT: strh r2, [r3]
41 ; CHECK-NEXT: vmov.u16 r2, q0[2]
42 ; CHECK-NEXT: strh r2, [r1]
43 ; CHECK-NEXT: vmov.u16 r1, q0[3]
44 ; CHECK-NEXT: strh.w r1, [lr]
45 ; CHECK-NEXT: vmov.u16 r1, q0[4]
46 ; CHECK-NEXT: strh r1, [r0]
47 ; CHECK-NEXT: vmov.u16 r0, q0[5]
48 ; CHECK-NEXT: strh.w r0, [r12]
49 ; CHECK-NEXT: vmov.u16 r0, q0[6]
50 ; CHECK-NEXT: strh r0, [r4]
51 ; CHECK-NEXT: vmov.u16 r0, q0[7]
52 ; CHECK-NEXT: strh r0, [r5]
53 ; CHECK-NEXT: pop {r4, r5, r6, pc}
54 %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
55 %2 = getelementptr inbounds i16, ptr %dst, <8 x i32> %1
56 call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %data, <8 x ptr> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; All-true scatter of <16 x i8> through <16 x i32> offsets (+16 increment).
; As with the 8i16 case, there is no matching MVE scatter form, so the
; lowering is fully scalarized into sixteen strb stores. Note the fourth
; <4 x i32> offset quadword does not fit in q1-q3 and is received on the
; stack (vldrw.u32 from sp+32, after push/sub adjustment).
60 define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, ptr %dst, <16 x i32> %offs) {
61 ; CHECK-LABEL: scatter_inc_mini_16i8:
63 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
64 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
66 ; CHECK-NEXT: sub sp, #4
67 ; CHECK-NEXT: movs r1, #16
68 ; CHECK-NEXT: vadd.i32 q1, q1, r0
69 ; CHECK-NEXT: vadd.i32 q1, q1, r1
70 ; CHECK-NEXT: add.w r12, sp, #32
71 ; CHECK-NEXT: vmov r2, r3, d2
72 ; CHECK-NEXT: vadd.i32 q3, q3, r0
73 ; CHECK-NEXT: vmov lr, r5, d3
74 ; CHECK-NEXT: vadd.i32 q1, q2, r0
75 ; CHECK-NEXT: vadd.i32 q2, q1, r1
76 ; CHECK-NEXT: vldrw.u32 q1, [r12]
77 ; CHECK-NEXT: vmov r4, r12, d4
78 ; CHECK-NEXT: vmov.u8 r6, q0[0]
79 ; CHECK-NEXT: vadd.i32 q1, q1, r0
80 ; CHECK-NEXT: vmov r0, r8, d5
81 ; CHECK-NEXT: vadd.i32 q3, q3, r1
82 ; CHECK-NEXT: vadd.i32 q1, q1, r1
83 ; CHECK-NEXT: vmov.u8 r1, q0[4]
84 ; CHECK-NEXT: vmov.u8 r7, q0[6]
85 ; CHECK-NEXT: strb r6, [r2]
86 ; CHECK-NEXT: vmov.u8 r2, q0[1]
87 ; CHECK-NEXT: strb r2, [r3]
88 ; CHECK-NEXT: vmov.u8 r6, q0[2]
89 ; CHECK-NEXT: vmov r2, r9, d6
90 ; CHECK-NEXT: strb.w r6, [lr]
91 ; CHECK-NEXT: vmov.u8 r6, q0[3]
92 ; CHECK-NEXT: vmov.u8 r3, q0[8]
93 ; CHECK-NEXT: strb r6, [r5]
94 ; CHECK-NEXT: vmov r6, r5, d7
95 ; CHECK-NEXT: strb r1, [r4]
96 ; CHECK-NEXT: vmov.u8 r1, q0[5]
97 ; CHECK-NEXT: strb.w r1, [r12]
98 ; CHECK-NEXT: vmov r1, r4, d2
99 ; CHECK-NEXT: strb r7, [r0]
100 ; CHECK-NEXT: vmov.u8 r0, q0[7]
101 ; CHECK-NEXT: strb.w r0, [r8]
102 ; CHECK-NEXT: vmov r0, r7, d3
103 ; CHECK-NEXT: strb r3, [r2]
104 ; CHECK-NEXT: vmov.u8 r2, q0[9]
105 ; CHECK-NEXT: strb.w r2, [r9]
106 ; CHECK-NEXT: vmov.u8 r2, q0[10]
107 ; CHECK-NEXT: strb r2, [r6]
108 ; CHECK-NEXT: vmov.u8 r2, q0[11]
109 ; CHECK-NEXT: strb r2, [r5]
110 ; CHECK-NEXT: vmov.u8 r2, q0[12]
111 ; CHECK-NEXT: strb r2, [r1]
112 ; CHECK-NEXT: vmov.u8 r1, q0[13]
113 ; CHECK-NEXT: strb r1, [r4]
114 ; CHECK-NEXT: vmov.u8 r1, q0[14]
115 ; CHECK-NEXT: strb r1, [r0]
116 ; CHECK-NEXT: vmov.u8 r0, q0[15]
117 ; CHECK-NEXT: strb r0, [r7]
118 ; CHECK-NEXT: add sp, #4
119 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
120 %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
121 %2 = getelementptr inbounds i8, ptr %dst, <16 x i32> %1
122 call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %data, <16 x ptr> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; Loop with three interleaved all-true scatters to dst[3i], dst[3i+1] and
; dst[3i+2] (12-byte stride between consecutive lanes). Expected lowering
; turns each scatter into a vstrw.32 with pre-indexed writeback
; "[q, #48]!" (48 = 4 lanes advancing by 12 bytes each iteration); the
; constant pools .LCPI3_0/1/2 hold per-lane byte offsets pre-biased by -48
; (e.g. 0xffffffd0 = -48, 0xffffffdc = -36, ...) so the first writeback
; lands on the correct addresses. The inner loop is hardware-looped with
; dls/le; the outer .LBB3_2 loop re-enters vector.ph when n.vec != n.
126 define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, ptr %dst, i32 %n) {
127 ; CHECK-LABEL: scatter_inc_v4i32_complex:
128 ; CHECK: @ %bb.0: @ %entry
129 ; CHECK-NEXT: cmp r1, #1
131 ; CHECK-NEXT: bxlt lr
132 ; CHECK-NEXT: .LBB3_1: @ %vector.ph.preheader
133 ; CHECK-NEXT: .save {r4, lr}
134 ; CHECK-NEXT: push {r4, lr}
135 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
136 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
137 ; CHECK-NEXT: .pad #16
138 ; CHECK-NEXT: sub sp, #16
139 ; CHECK-NEXT: adr r4, .LCPI3_2
140 ; CHECK-NEXT: bic r2, r1, #3
141 ; CHECK-NEXT: vldrw.u32 q3, [r4]
142 ; CHECK-NEXT: sub.w r12, r2, #4
143 ; CHECK-NEXT: adr.w lr, .LCPI3_1
144 ; CHECK-NEXT: movs r3, #1
145 ; CHECK-NEXT: vadd.i32 q3, q3, r0
146 ; CHECK-NEXT: add.w r3, r3, r12, lsr #2
147 ; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
148 ; CHECK-NEXT: vldrw.u32 q3, [lr]
149 ; CHECK-NEXT: adr.w r12, .LCPI3_0
150 ; CHECK-NEXT: vadd.i32 q4, q3, r0
151 ; CHECK-NEXT: vldrw.u32 q3, [r12]
152 ; CHECK-NEXT: vadd.i32 q3, q3, r0
153 ; CHECK-NEXT: .LBB3_2: @ %vector.ph
154 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
155 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
156 ; CHECK-NEXT: dls lr, r3
157 ; CHECK-NEXT: vmov q6, q4
158 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
159 ; CHECK-NEXT: vmov q5, q3
160 ; CHECK-NEXT: .LBB3_3: @ %vector.body
161 ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
162 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
163 ; CHECK-NEXT: vstrw.32 q0, [q5, #48]!
164 ; CHECK-NEXT: vstrw.32 q1, [q6, #48]!
165 ; CHECK-NEXT: vstrw.32 q2, [q7, #48]!
166 ; CHECK-NEXT: le lr, .LBB3_3
167 ; CHECK-NEXT: @ %bb.4: @ %middle.block
168 ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
169 ; CHECK-NEXT: cmp r2, r1
170 ; CHECK-NEXT: bne .LBB3_2
171 ; CHECK-NEXT: @ %bb.5:
172 ; CHECK-NEXT: add sp, #16
173 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
174 ; CHECK-NEXT: pop.w {r4, lr}
176 ; CHECK-NEXT: .p2align 4
177 ; CHECK-NEXT: @ %bb.6:
178 ; CHECK-NEXT: .LCPI3_0:
179 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
180 ; CHECK-NEXT: .long 4294967260 @ 0xffffffdc
181 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
182 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
183 ; CHECK-NEXT: .LCPI3_1:
184 ; CHECK-NEXT: .long 4294967252 @ 0xffffffd4
185 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
186 ; CHECK-NEXT: .long 4294967276 @ 0xffffffec
187 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
188 ; CHECK-NEXT: .LCPI3_2:
189 ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
190 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
191 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
192 ; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
194 %cmp22 = icmp sgt i32 %n, 0
195 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
197 vector.ph: ; preds = %for.body.preheader
198 %n.vec = and i32 %n, -4
199 br label %vector.body
201 vector.body: ; preds = %vector.body, %vector.ph
202 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
203 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
204 %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
205 %1 = getelementptr inbounds i32, ptr %dst, <4 x i32> %0
206 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data1, <4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
207 %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
208 %3 = getelementptr inbounds i32, ptr %dst, <4 x i32> %2
209 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data2, <4 x ptr> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
210 %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
211 %5 = getelementptr inbounds i32, ptr %dst, <4 x i32> %4
212 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data3, <4 x ptr> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
213 %index.next = add i32 %index, 4
214 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
215 %6 = icmp eq i32 %index.next, %n.vec
216 br i1 %6, label %middle.block, label %vector.body
218 middle.block: ; preds = %vector.body
219 %cmp.n = icmp eq i32 %n.vec, %n
220 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
222 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Tail-predicated loop: contiguous masked load from x, scatter to y at index
; vec.ind << 2 (16-byte stride between lanes for i32). Expected lowering uses
; dlstp.32/letp hardware tail predication, a post-incremented contiguous
; vldrw for the load, and a single pre-indexed writeback scatter
; "vstrw.32 q1, [q0, #64]!" (64 = 4 indices/iter * 4 elements * 4 bytes).
; .LCPI4_0 holds per-lane byte offsets pre-biased by -64 (0xffffffc0 = -64,
; step 16) so the first writeback produces the correct addresses.
226 define void @shl(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n) {
228 ; CHECK: @ %bb.0: @ %entry
229 ; CHECK-NEXT: .save {r7, lr}
230 ; CHECK-NEXT: push {r7, lr}
231 ; CHECK-NEXT: cmp r2, #1
233 ; CHECK-NEXT: poplt {r7, pc}
234 ; CHECK-NEXT: .LBB4_1: @ %vector.ph
235 ; CHECK-NEXT: adr r3, .LCPI4_0
236 ; CHECK-NEXT: vldrw.u32 q0, [r3]
237 ; CHECK-NEXT: vadd.i32 q0, q0, r1
238 ; CHECK-NEXT: dlstp.32 lr, r2
239 ; CHECK-NEXT: .LBB4_2: @ %vector.body
240 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
241 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
242 ; CHECK-NEXT: vstrw.32 q1, [q0, #64]!
243 ; CHECK-NEXT: letp lr, .LBB4_2
244 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
245 ; CHECK-NEXT: pop {r7, pc}
246 ; CHECK-NEXT: .p2align 4
247 ; CHECK-NEXT: @ %bb.4:
248 ; CHECK-NEXT: .LCPI4_0:
249 ; CHECK-NEXT: .long 4294967232 @ 0xffffffc0
250 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
251 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
252 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
254 %cmp6 = icmp sgt i32 %n, 0
255 br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
257 vector.ph: ; preds = %entry
258 %n.rnd.up = add i32 %n, 3
259 %n.vec = and i32 %n.rnd.up, -4
260 br label %vector.body
262 vector.body: ; preds = %vector.body, %vector.ph
263 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
264 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
265 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
266 %0 = getelementptr inbounds i32, ptr %x, i32 %index
267 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
268 %1 = shl nsw <4 x i32> %vec.ind, <i32 2, i32 2, i32 2, i32 2>
269 %2 = getelementptr inbounds i32, ptr %y, <4 x i32> %1
270 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %wide.masked.load, <4 x ptr> %2, i32 4, <4 x i1> %active.lane.mask)
271 %index.next = add i32 %index, 4
272 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
273 %3 = icmp eq i32 %index.next, %n.vec
274 br i1 %3, label %for.cond.cleanup, label %vector.body
276 for.cond.cleanup: ; preds = %vector.body, %entry
; Tail-predicated loop with FOUR scatters per iteration: loaded data +1/+2/+3/+4
; is stored to y[8i], y[8i+2], y[8i+4], y[8i+6] (index vec.ind << 3, then or'd
; with 2/4/6). Expected lowering keeps all four as pre-indexed writeback
; scatters "vstrw.32 qN, [qM, #128]!" (128 = 4 indices/iter * 8 elements * 4
; bytes), each with its own base vector built from constant pools
; .LCPI5_0..3 whose per-lane byte offsets are pre-biased by -128 (and offset
; from each other by 8 bytes, matching the or 2/4/6 index adjustments).
; The loop itself uses dlstp.32/letp hardware tail predication.
280 define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n) {
281 ; CHECK-LABEL: shlor:
282 ; CHECK: @ %bb.0: @ %entry
283 ; CHECK-NEXT: .save {r4, r5, r6, lr}
284 ; CHECK-NEXT: push {r4, r5, r6, lr}
285 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
286 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
287 ; CHECK-NEXT: cmp r2, #1
288 ; CHECK-NEXT: blt .LBB5_3
289 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
290 ; CHECK-NEXT: adr.w lr, .LCPI5_0
291 ; CHECK-NEXT: adr r4, .LCPI5_1
292 ; CHECK-NEXT: adr r5, .LCPI5_2
293 ; CHECK-NEXT: adr r6, .LCPI5_3
294 ; CHECK-NEXT: vldrw.u32 q2, [r4]
295 ; CHECK-NEXT: vldrw.u32 q0, [r6]
296 ; CHECK-NEXT: vldrw.u32 q1, [r5]
297 ; CHECK-NEXT: vldrw.u32 q3, [lr]
298 ; CHECK-NEXT: vadd.i32 q0, q0, r1
299 ; CHECK-NEXT: vadd.i32 q1, q1, r1
300 ; CHECK-NEXT: vadd.i32 q2, q2, r1
301 ; CHECK-NEXT: vadd.i32 q3, q3, r1
302 ; CHECK-NEXT: mov.w r12, #1
303 ; CHECK-NEXT: movs r4, #3
304 ; CHECK-NEXT: movs r3, #2
305 ; CHECK-NEXT: movs r1, #4
306 ; CHECK-NEXT: dlstp.32 lr, r2
307 ; CHECK-NEXT: .LBB5_2: @ %vector.body
308 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
309 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16
310 ; CHECK-NEXT: vadd.i32 q6, q4, r12
311 ; CHECK-NEXT: vadd.i32 q5, q4, r1
312 ; CHECK-NEXT: vstrw.32 q6, [q3, #128]!
313 ; CHECK-NEXT: vadd.i32 q6, q4, r3
314 ; CHECK-NEXT: vadd.i32 q4, q4, r4
315 ; CHECK-NEXT: vstrw.32 q6, [q2, #128]!
316 ; CHECK-NEXT: vstrw.32 q4, [q1, #128]!
317 ; CHECK-NEXT: vstrw.32 q5, [q0, #128]!
318 ; CHECK-NEXT: letp lr, .LBB5_2
319 ; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup
320 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
321 ; CHECK-NEXT: pop {r4, r5, r6, pc}
322 ; CHECK-NEXT: .p2align 4
323 ; CHECK-NEXT: @ %bb.4:
324 ; CHECK-NEXT: .LCPI5_0:
325 ; CHECK-NEXT: .long 4294967168 @ 0xffffff80
326 ; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
327 ; CHECK-NEXT: .long 4294967232 @ 0xffffffc0
328 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
329 ; CHECK-NEXT: .LCPI5_1:
330 ; CHECK-NEXT: .long 4294967176 @ 0xffffff88
331 ; CHECK-NEXT: .long 4294967208 @ 0xffffffa8
332 ; CHECK-NEXT: .long 4294967240 @ 0xffffffc8
333 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
334 ; CHECK-NEXT: .LCPI5_2:
335 ; CHECK-NEXT: .long 4294967184 @ 0xffffff90
336 ; CHECK-NEXT: .long 4294967216 @ 0xffffffb0
337 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
338 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
339 ; CHECK-NEXT: .LCPI5_3:
340 ; CHECK-NEXT: .long 4294967192 @ 0xffffff98
341 ; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
342 ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
343 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
345 %cmp33 = icmp sgt i32 %n, 0
346 br i1 %cmp33, label %vector.ph, label %for.cond.cleanup
348 vector.ph: ; preds = %entry
349 %n.rnd.up = add i32 %n, 3
350 %n.vec = and i32 %n.rnd.up, -4
351 br label %vector.body
353 vector.body: ; preds = %vector.body, %vector.ph
354 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
355 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
356 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
357 %0 = getelementptr inbounds i32, ptr %x, i32 %index
358 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
359 %1 = add nsw <4 x i32> %wide.masked.load, <i32 1, i32 1, i32 1, i32 1>
360 %2 = shl nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
361 %3 = getelementptr inbounds i32, ptr %y, <4 x i32> %2
362 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %1, <4 x ptr> %3, i32 4, <4 x i1> %active.lane.mask)
363 %4 = add nsw <4 x i32> %wide.masked.load, <i32 2, i32 2, i32 2, i32 2>
364 %5 = or <4 x i32> %2, <i32 2, i32 2, i32 2, i32 2>
365 %6 = getelementptr inbounds i32, ptr %y, <4 x i32> %5
366 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %4, <4 x ptr> %6, i32 4, <4 x i1> %active.lane.mask)
367 %7 = add nsw <4 x i32> %wide.masked.load, <i32 3, i32 3, i32 3, i32 3>
368 %8 = or <4 x i32> %2, <i32 4, i32 4, i32 4, i32 4>
369 %9 = getelementptr inbounds i32, ptr %y, <4 x i32> %8
370 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %7, <4 x ptr> %9, i32 4, <4 x i1> %active.lane.mask)
371 %10 = add nsw <4 x i32> %wide.masked.load, <i32 4, i32 4, i32 4, i32 4>
372 %11 = or <4 x i32> %2, <i32 6, i32 6, i32 6, i32 6>
373 %12 = getelementptr inbounds i32, ptr %y, <4 x i32> %11
374 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %10, <4 x ptr> %12, i32 4, <4 x i1> %active.lane.mask)
375 %index.next = add i32 %index, 4
376 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
377 %13 = icmp eq i32 %index.next, %n.vec
378 br i1 %13, label %for.cond.cleanup, label %vector.body
380 for.cond.cleanup: ; preds = %vector.body, %entry
384 declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
385 declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
386 declare void @llvm.masked.scatter.v8f16.v8p0(<8 x half>, <8 x ptr>, i32, <8 x i1>)
387 declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
388 declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
389 declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
390 declare void @llvm.masked.scatter.v4f16.v4p0(<4 x half>, <4 x ptr>, i32, <4 x i1>)
391 declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
392 declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
393 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
394 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)