1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
4 %struct.arm_2d_size_t = type { i16, i16 }
; Test input: a two-level loop nest whose inner loop operates on <8 x i16> vectors.
;  - Outer loop (%vector.ph): runs until %inc32.us equals %conv1, the sign-extended
;    i16 loaded from field 1 of %ptCopySize; each trip advances the base pointer by
;    %conv30 (sign-extended %iTargetStride) i16 elements.
;  - Inner loop (%vector.body): steps %index by 8 until it equals %n.vec
;    ((%conv4 + 7) & -8). Its load and store are masked by
;    llvm.get.active.lane.mask(%index, %conv4), where %conv4 is the sign-extended
;    i16 loaded from offset 0 of %ptCopySize.
; This variant carries no "target-cpu" function attribute. The assertions below
; expect the inner loop to become a dls/le low-overhead loop with an explicit
; vctp.16, and expect several q-register spills/reloads around and inside it.
; NOTE(review): %3 and %5 are used below but their definitions are not visible in
; this excerpt. Judging by the expected `and.w ..., lsr #3` with 252 and
; `rsb.w ..., #256`, presumably %3 = and i16 %2, 252 and %5 = sub i16 256, %4 --
; confirm against the upstream test before relying on this.
5 define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) {
6 ; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: ldrsh.w r12, [r2, #2]
9 ; CHECK-NEXT: cmp.w r12, #1
12 ; CHECK-NEXT: .LBB0_1: @ %for.cond3.preheader.lr.ph
13 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
14 ; CHECK-NEXT: sub sp, #4
15 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
16 ; CHECK-NEXT: sub sp, #64
17 ; CHECK-NEXT: ldrsh.w r7, [r2]
18 ; CHECK-NEXT: cmp r7, #1
19 ; CHECK-NEXT: blt.w .LBB0_6
20 ; CHECK-NEXT: @ %bb.2: @ %for.cond3.preheader.us.preheader
21 ; CHECK-NEXT: movs r2, #252
22 ; CHECK-NEXT: ldr r4, [sp, #152]
23 ; CHECK-NEXT: and.w r6, r2, r3, lsr #3
24 ; CHECK-NEXT: movs r2, #120
25 ; CHECK-NEXT: and.w r5, r2, r3, lsr #9
26 ; CHECK-NEXT: lsls r3, r3, #3
27 ; CHECK-NEXT: uxtb r3, r3
28 ; CHECK-NEXT: muls r6, r4, r6
29 ; CHECK-NEXT: rsb.w r2, r4, #256
30 ; CHECK-NEXT: vmov.i16 q2, #0xfc
31 ; CHECK-NEXT: mul lr, r5, r4
32 ; CHECK-NEXT: vdup.16 q4, r6
33 ; CHECK-NEXT: mov.w r6, #2016
34 ; CHECK-NEXT: vmov.i16 q6, #0xf8
35 ; CHECK-NEXT: mul r5, r3, r4
36 ; CHECK-NEXT: adds r3, r7, #7
37 ; CHECK-NEXT: bic r3, r3, #7
38 ; CHECK-NEXT: vdup.16 q3, lr
39 ; CHECK-NEXT: subs r3, #8
40 ; CHECK-NEXT: movs r4, #1
41 ; CHECK-NEXT: vdup.16 q0, r5
42 ; CHECK-NEXT: lsls r1, r1, #1
43 ; CHECK-NEXT: add.w r3, r4, r3, lsr #3
44 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
45 ; CHECK-NEXT: vmov.i16 q0, #0xf800
46 ; CHECK-NEXT: movs r4, #0
47 ; CHECK-NEXT: vdup.16 q5, r6
48 ; CHECK-NEXT: vmov.i16 q7, #0x78
49 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
50 ; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
51 ; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
52 ; CHECK-NEXT: .LBB0_3: @ %vector.ph
53 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
54 ; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
55 ; CHECK-NEXT: mov r5, r0
56 ; CHECK-NEXT: mov r6, r7
57 ; CHECK-NEXT: dls lr, r3
58 ; CHECK-NEXT: .LBB0_4: @ %vector.body
59 ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1
60 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
61 ; CHECK-NEXT: vctp.16 r6
62 ; CHECK-NEXT: subs r6, #8
64 ; CHECK-NEXT: vldrht.u16 q0, [r5]
65 ; CHECK-NEXT: vshr.u16 q1, q0, #3
66 ; CHECK-NEXT: vand q1, q1, q2
67 ; CHECK-NEXT: vmov q2, q4
68 ; CHECK-NEXT: vmla.i16 q2, q1, r2
69 ; CHECK-NEXT: vshr.u16 q1, q2, #5
70 ; CHECK-NEXT: vshl.i16 q2, q0, #3
71 ; CHECK-NEXT: vand q3, q1, q5
72 ; CHECK-NEXT: vmov q1, q7
73 ; CHECK-NEXT: vand q2, q2, q6
74 ; CHECK-NEXT: vmov q7, q6
75 ; CHECK-NEXT: vmov q6, q5
76 ; CHECK-NEXT: vmov q5, q4
77 ; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
78 ; CHECK-NEXT: vshr.u16 q0, q0, #9
79 ; CHECK-NEXT: vmla.i16 q4, q2, r2
80 ; CHECK-NEXT: vshr.u16 q2, q4, #11
81 ; CHECK-NEXT: vmov q4, q5
82 ; CHECK-NEXT: vmov q5, q6
83 ; CHECK-NEXT: vmov q6, q7
84 ; CHECK-NEXT: vmov q7, q1
85 ; CHECK-NEXT: vorr q1, q3, q2
86 ; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
87 ; CHECK-NEXT: vand q0, q0, q7
88 ; CHECK-NEXT: vmla.i16 q2, q0, r2
89 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
90 ; CHECK-NEXT: vand q0, q2, q0
91 ; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
92 ; CHECK-NEXT: vorr q0, q1, q0
94 ; CHECK-NEXT: vstrht.16 q0, [r5], #16
95 ; CHECK-NEXT: le lr, .LBB0_4
96 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
97 ; CHECK-NEXT: @ in Loop: Header=BB0_3 Depth=1
98 ; CHECK-NEXT: adds r4, #1
99 ; CHECK-NEXT: add r0, r1
100 ; CHECK-NEXT: cmp r4, r12
101 ; CHECK-NEXT: bne .LBB0_3
102 ; CHECK-NEXT: .LBB0_6:
103 ; CHECK-NEXT: add sp, #64
104 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
105 ; CHECK-NEXT: add sp, #4
106 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr}
; Entry: load the outer trip count (field 1 of %ptCopySize) and split %hwColour
; into shifted/masked pieces: (<<3)&248, (>>9)&120 and >>3.
109 %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
110 %0 = load i16, ptr %iHeight, align 2
111 %conv1 = sext i16 %0 to i32
112 %and.i = shl i16 %hwColour, 3
113 %shl.i = and i16 %and.i, 248
114 %1 = lshr i16 %hwColour, 9
115 %shl4.i = and i16 %1, 120
116 %2 = lshr i16 %hwColour, 3
118 %4 = trunc i32 %chRatio to i16
120 %conv30 = sext i16 %iTargetStride to i32
; Skip everything unless the outer count is positive.
121 %cmp61 = icmp sgt i16 %0, 0
122 br i1 %cmp61, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
124 for.cond3.preheader.lr.ph: ; preds = %entry
; Load the inner element count (offset 0 of %ptCopySize); skip unless positive.
125 %6 = load i16, ptr %ptCopySize, align 2
126 %conv4 = sext i16 %6 to i32
127 %cmp558 = icmp sgt i16 %6, 0
128 br i1 %cmp558, label %for.cond3.preheader.us.preheader, label %for.cond.cleanup
130 for.cond3.preheader.us.preheader: ; preds = %for.cond3.preheader.lr.ph
; Loop-invariant setup: multiply each %hwColour piece by %4 (truncated %chRatio),
; round the element count up to a multiple of 8, and splat the scalars to 8 lanes.
131 %conv15.us = mul i16 %shl.i, %4
132 %conv15.us.1 = mul i16 %3, %4
133 %conv15.us.2 = mul i16 %shl4.i, %4
134 %n.rnd.up = add nsw i32 %conv4, 7
135 %n.vec = and i32 %n.rnd.up, -8
136 %broadcast.splatinsert75 = insertelement <8 x i16> poison, i16 %5, i32 0
137 %broadcast.splat76 = shufflevector <8 x i16> %broadcast.splatinsert75, <8 x i16> poison, <8 x i32> zeroinitializer
138 %broadcast.splatinsert77 = insertelement <8 x i16> poison, i16 %conv15.us, i32 0
139 %broadcast.splat78 = shufflevector <8 x i16> %broadcast.splatinsert77, <8 x i16> poison, <8 x i32> zeroinitializer
140 %broadcast.splatinsert79 = insertelement <8 x i16> poison, i16 %conv15.us.1, i32 0
141 %broadcast.splat80 = shufflevector <8 x i16> %broadcast.splatinsert79, <8 x i16> poison, <8 x i32> zeroinitializer
142 %broadcast.splatinsert81 = insertelement <8 x i16> poison, i16 %conv15.us.2, i32 0
143 %broadcast.splat82 = shufflevector <8 x i16> %broadcast.splatinsert81, <8 x i16> poison, <8 x i32> zeroinitializer
146 vector.ph: ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.us.preheader
; Outer-loop header: current base pointer and outer counter.
147 %phwTargetBase.addr.063.us = phi ptr [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
148 %y.062.us = phi i32 [ %inc32.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
149 br label %vector.body
151 vector.body: ; preds = %vector.body, %vector.ph
; Inner loop: masked load of 8 x i16, then the same three shift/mask extractions
; that were applied to %hwColour in the entry block, here applied per lane.
152 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
153 %next.gep = getelementptr i16, ptr %phwTargetBase.addr.063.us, i32 %index
154 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %conv4)
155 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
156 %7 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
157 %8 = and <8 x i16> %7, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
158 %9 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
159 %10 = and <8 x i16> %9, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
160 %11 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
161 %12 = and <8 x i16> %11, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
; Multiply-accumulate each extracted piece: piece * splat(%5) + splat(scalar product).
162 %13 = mul <8 x i16> %8, %broadcast.splat76
163 %14 = add <8 x i16> %13, %broadcast.splat78
164 %15 = lshr <8 x i16> %14, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
165 %16 = mul <8 x i16> %12, %broadcast.splat76
166 %17 = add <8 x i16> %16, %broadcast.splat80
167 %18 = mul <8 x i16> %10, %broadcast.splat76
168 %19 = add <8 x i16> %18, %broadcast.splat82
; Recombine: ((%17 >> 5) & 2016) | (%14 >> 11) | (%19 & -2048), then store the
; result back to the address it was loaded from under the same lane mask.
169 %20 = lshr <8 x i16> %17, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
170 %21 = and <8 x i16> %20, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
171 %22 = or <8 x i16> %21, %15
172 %23 = and <8 x i16> %19, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
173 %24 = or <8 x i16> %22, %23
174 call void @llvm.masked.store.v8i16.p0(<8 x i16> %24, ptr %next.gep, i32 2, <8 x i1> %active.lane.mask)
175 %index.next = add i32 %index, 8
176 %25 = icmp eq i32 %index.next, %n.vec
177 br i1 %25, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
179 for.cond3.for.cond.cleanup7_crit_edge.us: ; preds = %vector.body
; Outer-loop latch: advance the base pointer by %conv30 elements and bump the counter.
180 %add.ptr.us = getelementptr inbounds i16, ptr %phwTargetBase.addr.063.us, i32 %conv30
181 %inc32.us = add nuw nsw i32 %y.062.us, 1
182 %exitcond66.not = icmp eq i32 %inc32.us, %conv1
183 br i1 %exitcond66.not, label %for.cond.cleanup, label %vector.ph
185 for.cond.cleanup: ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.lr.ph, %entry
; Scheduling variant of the test: the same loop nest (an outer loop bounded by the
; sign-extended i16 at field 1 of %ptCopySize, and an inner <8 x i16> loop masked
; by llvm.get.active.lane.mask(%index, %conv4)), but the function carries the
; "target-cpu"="cortex-m55" attribute.
; The assertions below expect the inner loop to become a dlstp.16/letp loop with
; unpredicated vldrh/vstrh and no vctp, plus .p2align 2 before both loop headers
; and in-loop reloads of the spilled splat constants.
; NOTE(review): %3 and %5 are used below but their definitions are not visible in
; this excerpt. Judging by the expected `and.w ..., lsr #3` with 252 and
; `rsb.w ..., #256`, presumably %3 = and i16 %2, 252 and %5 = sub i16 256, %4 --
; confirm against the upstream test before relying on this.
188 define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) "target-cpu"="cortex-m55" {
189 ; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha_sched:
190 ; CHECK: @ %bb.0: @ %entry
191 ; CHECK-NEXT: ldrsh.w r12, [r2, #2]
192 ; CHECK-NEXT: cmp.w r12, #1
193 ; CHECK-NEXT: blt.w .LBB1_7
194 ; CHECK-NEXT: @ %bb.1: @ %for.cond3.preheader.lr.ph
195 ; CHECK-NEXT: ldrsh.w r2, [r2]
196 ; CHECK-NEXT: cmp r2, #1
198 ; CHECK-NEXT: bxlt lr
199 ; CHECK-NEXT: .LBB1_2: @ %for.cond3.preheader.us.preheader
200 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
201 ; CHECK-NEXT: sub sp, #4
202 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
203 ; CHECK-NEXT: sub sp, #80
204 ; CHECK-NEXT: ldr r7, [sp, #168]
205 ; CHECK-NEXT: movs r5, #120
206 ; CHECK-NEXT: lsls r6, r3, #3
207 ; CHECK-NEXT: movs r4, #252
208 ; CHECK-NEXT: and.w r5, r5, r3, lsr #9
209 ; CHECK-NEXT: uxtb r6, r6
210 ; CHECK-NEXT: and.w r3, r4, r3, lsr #3
211 ; CHECK-NEXT: muls r6, r7, r6
212 ; CHECK-NEXT: mul lr, r3, r7
213 ; CHECK-NEXT: vdup.16 q0, r6
214 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
215 ; CHECK-NEXT: vdup.16 q0, lr
216 ; CHECK-NEXT: muls r5, r7, r5
217 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
218 ; CHECK-NEXT: vmov.i16 q0, #0xfc
219 ; CHECK-NEXT: mov.w r6, #2016
220 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
221 ; CHECK-NEXT: vdup.16 q0, r5
222 ; CHECK-NEXT: rsb.w r3, r7, #256
223 ; CHECK-NEXT: lsls r7, r1, #1
224 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
225 ; CHECK-NEXT: vdup.16 q0, r6
226 ; CHECK-NEXT: vmov.i16 q2, #0xf8
227 ; CHECK-NEXT: vmov.i16 q5, #0x78
228 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
229 ; CHECK-NEXT: vmov.i16 q6, #0xf800
230 ; CHECK-NEXT: movs r4, #0
231 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
232 ; CHECK-NEXT: .p2align 2
233 ; CHECK-NEXT: .LBB1_3: @ %vector.ph
234 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
235 ; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
236 ; CHECK-NEXT: mov r5, r0
237 ; CHECK-NEXT: dlstp.16 lr, r2
238 ; CHECK-NEXT: .p2align 2
239 ; CHECK-NEXT: .LBB1_4: @ %vector.body
240 ; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1
241 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
242 ; CHECK-NEXT: vldrh.u16 q0, [r5]
243 ; CHECK-NEXT: vshl.i16 q1, q0, #3
244 ; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
245 ; CHECK-NEXT: vand q1, q1, q2
246 ; CHECK-NEXT: vmla.i16 q3, q1, r3
247 ; CHECK-NEXT: vmov.f64 d8, d4
248 ; CHECK-NEXT: vmov.f64 d9, d5
249 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
250 ; CHECK-NEXT: vshr.u16 q2, q0, #9
251 ; CHECK-NEXT: vshr.u16 q0, q0, #3
252 ; CHECK-NEXT: vand q0, q0, q1
253 ; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
254 ; CHECK-NEXT: vmla.i16 q1, q0, r3
255 ; CHECK-NEXT: vand q2, q2, q5
256 ; CHECK-NEXT: vshr.u16 q0, q3, #11
257 ; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload
258 ; CHECK-NEXT: vshr.u16 q1, q1, #5
259 ; CHECK-NEXT: vmla.i16 q3, q2, r3
260 ; CHECK-NEXT: vand q1, q1, q7
261 ; CHECK-NEXT: vorr q0, q1, q0
262 ; CHECK-NEXT: vand q1, q3, q6
263 ; CHECK-NEXT: vorr q0, q0, q1
264 ; CHECK-NEXT: vstrh.16 q0, [r5], #16
265 ; CHECK-NEXT: vmov.f64 d4, d8
266 ; CHECK-NEXT: vmov.f64 d5, d9
267 ; CHECK-NEXT: letp lr, .LBB1_4
268 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
269 ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1
270 ; CHECK-NEXT: adds r4, #1
271 ; CHECK-NEXT: add r0, r7
272 ; CHECK-NEXT: cmp r4, r12
273 ; CHECK-NEXT: bne .LBB1_3
274 ; CHECK-NEXT: @ %bb.6:
275 ; CHECK-NEXT: add sp, #80
276 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
277 ; CHECK-NEXT: add sp, #4
278 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr}
279 ; CHECK-NEXT: .LBB1_7: @ %for.cond.cleanup
; Entry: load the outer trip count (field 1 of %ptCopySize) and split %hwColour
; into shifted/masked pieces: (<<3)&248, (>>9)&120 and >>3.
282 %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
283 %0 = load i16, ptr %iHeight, align 2
284 %conv1 = sext i16 %0 to i32
285 %and.i = shl i16 %hwColour, 3
286 %shl.i = and i16 %and.i, 248
287 %1 = lshr i16 %hwColour, 9
288 %shl4.i = and i16 %1, 120
289 %2 = lshr i16 %hwColour, 3
291 %4 = trunc i32 %chRatio to i16
293 %conv30 = sext i16 %iTargetStride to i32
; Skip everything unless the outer count is positive.
294 %cmp61 = icmp sgt i16 %0, 0
295 br i1 %cmp61, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
297 for.cond3.preheader.lr.ph: ; preds = %entry
; Load the inner element count (offset 0 of %ptCopySize); skip unless positive.
298 %6 = load i16, ptr %ptCopySize, align 2
299 %conv4 = sext i16 %6 to i32
300 %cmp558 = icmp sgt i16 %6, 0
301 br i1 %cmp558, label %for.cond3.preheader.us.preheader, label %for.cond.cleanup
303 for.cond3.preheader.us.preheader: ; preds = %for.cond3.preheader.lr.ph
; Loop-invariant setup: multiply each %hwColour piece by %4 (truncated %chRatio),
; round the element count up to a multiple of 8, and splat the scalars to 8 lanes.
304 %conv15.us = mul i16 %shl.i, %4
305 %conv15.us.1 = mul i16 %3, %4
306 %conv15.us.2 = mul i16 %shl4.i, %4
307 %n.rnd.up = add nsw i32 %conv4, 7
308 %n.vec = and i32 %n.rnd.up, -8
309 %broadcast.splatinsert75 = insertelement <8 x i16> poison, i16 %5, i32 0
310 %broadcast.splat76 = shufflevector <8 x i16> %broadcast.splatinsert75, <8 x i16> poison, <8 x i32> zeroinitializer
311 %broadcast.splatinsert77 = insertelement <8 x i16> poison, i16 %conv15.us, i32 0
312 %broadcast.splat78 = shufflevector <8 x i16> %broadcast.splatinsert77, <8 x i16> poison, <8 x i32> zeroinitializer
313 %broadcast.splatinsert79 = insertelement <8 x i16> poison, i16 %conv15.us.1, i32 0
314 %broadcast.splat80 = shufflevector <8 x i16> %broadcast.splatinsert79, <8 x i16> poison, <8 x i32> zeroinitializer
315 %broadcast.splatinsert81 = insertelement <8 x i16> poison, i16 %conv15.us.2, i32 0
316 %broadcast.splat82 = shufflevector <8 x i16> %broadcast.splatinsert81, <8 x i16> poison, <8 x i32> zeroinitializer
319 vector.ph: ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.us.preheader
; Outer-loop header: current base pointer and outer counter.
320 %phwTargetBase.addr.063.us = phi ptr [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
321 %y.062.us = phi i32 [ %inc32.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
322 br label %vector.body
324 vector.body: ; preds = %vector.body, %vector.ph
; Inner loop: masked load of 8 x i16, then the same three shift/mask extractions
; that were applied to %hwColour in the entry block, here applied per lane.
325 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
326 %next.gep = getelementptr i16, ptr %phwTargetBase.addr.063.us, i32 %index
327 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %conv4)
328 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
329 %7 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
330 %8 = and <8 x i16> %7, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
331 %9 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
332 %10 = and <8 x i16> %9, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
333 %11 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
334 %12 = and <8 x i16> %11, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
; Multiply-accumulate each extracted piece: piece * splat(%5) + splat(scalar product).
335 %13 = mul <8 x i16> %8, %broadcast.splat76
336 %14 = add <8 x i16> %13, %broadcast.splat78
337 %15 = lshr <8 x i16> %14, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
338 %16 = mul <8 x i16> %12, %broadcast.splat76
339 %17 = add <8 x i16> %16, %broadcast.splat80
340 %18 = mul <8 x i16> %10, %broadcast.splat76
341 %19 = add <8 x i16> %18, %broadcast.splat82
; Recombine: ((%17 >> 5) & 2016) | (%14 >> 11) | (%19 & -2048), then store the
; result back to the address it was loaded from under the same lane mask.
342 %20 = lshr <8 x i16> %17, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
343 %21 = and <8 x i16> %20, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
344 %22 = or <8 x i16> %21, %15
345 %23 = and <8 x i16> %19, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
346 %24 = or <8 x i16> %22, %23
347 call void @llvm.masked.store.v8i16.p0(<8 x i16> %24, ptr %next.gep, i32 2, <8 x i1> %active.lane.mask)
348 %index.next = add i32 %index, 8
349 %25 = icmp eq i32 %index.next, %n.vec
350 br i1 %25, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
352 for.cond3.for.cond.cleanup7_crit_edge.us: ; preds = %vector.body
; Outer-loop latch: advance the base pointer by %conv30 elements and bump the counter.
353 %add.ptr.us = getelementptr inbounds i16, ptr %phwTargetBase.addr.063.us, i32 %conv30
354 %inc32.us = add nuw nsw i32 %y.062.us, 1
355 %exitcond66.not = icmp eq i32 %inc32.us, %conv1
356 br i1 %exitcond66.not, label %for.cond.cleanup, label %vector.ph
358 for.cond.cleanup: ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.lr.ph, %entry
; Intrinsics called from the vector.body block of both test functions: the lane-mask
; generator and the masked <8 x i16> load/store that consume its result.
; NOTE(review): attribute groups #1-#3 are referenced here but their definitions
; are not visible in this excerpt.
362 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
363 declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) #2
364 declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>) #3