1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | \
4 ; RUN: FileCheck %s --check-prefix=ENABLED
6 ; Forcing tail-predication should not be necessary here, so we check the same
7 ; ENABLED label as the run above:
8 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=force-enabled %s -o - | \
9 ; RUN: FileCheck %s --check-prefix=ENABLED
11 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled-no-reductions %s -o - | \
12 ; RUN: FileCheck %s --check-prefix=NOREDUCTIONS
14 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=force-enabled-no-reductions %s -o - | \
15 ; RUN: FileCheck %s --check-prefix=NOREDUCTIONS
16 ; For each i in [0, N): Output[i] = (sum over j in [0, Size - i) of (Input[j] * Input[i + j]) >> Scale) >> 16, or 0 when i >= Size; the predicated inner vector loop's trip count depends on the outer induction variable, and both check prefixes expect the same code.
17 define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr {
18 ; ENABLED-LABEL: varying_outer_2d_reduction:
19 ; ENABLED: @ %bb.0: @ %entry
20 ; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
21 ; ENABLED-NEXT: sub sp, #4
22 ; ENABLED-NEXT: cmp r3, #1
23 ; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill
24 ; ENABLED-NEXT: blt .LBB0_8
25 ; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph
26 ; ENABLED-NEXT: ldr r0, [sp, #36]
27 ; ENABLED-NEXT: add.w r12, r2, #3
28 ; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload
29 ; ENABLED-NEXT: mov.w r8, #0
30 ; ENABLED-NEXT: mov r9, r12
31 ; ENABLED-NEXT: uxth r0, r0
32 ; ENABLED-NEXT: rsbs r5, r0, #0
33 ; ENABLED-NEXT: b .LBB0_4
34 ; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
35 ; ENABLED-NEXT: movs r0, #0
36 ; ENABLED-NEXT: .LBB0_3: @ %for.end
37 ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
38 ; ENABLED-NEXT: lsrs r0, r0, #16
39 ; ENABLED-NEXT: sub.w r9, r9, #1
40 ; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1]
41 ; ENABLED-NEXT: add.w r8, r8, #1
42 ; ENABLED-NEXT: add.w r10, r10, #2
43 ; ENABLED-NEXT: cmp r8, r3
44 ; ENABLED-NEXT: beq .LBB0_8
45 ; ENABLED-NEXT: .LBB0_4: @ %for.body
46 ; ENABLED-NEXT: @ =>This Loop Header: Depth=1
47 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2
48 ; ENABLED-NEXT: cmp r2, r8
49 ; ENABLED-NEXT: ble .LBB0_2
50 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph
51 ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
52 ; ENABLED-NEXT: bic r0, r9, #3
53 ; ENABLED-NEXT: movs r7, #1
54 ; ENABLED-NEXT: subs r0, #4
55 ; ENABLED-NEXT: sub.w r4, r2, r8
56 ; ENABLED-NEXT: vmov.i32 q1, #0x0
57 ; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
58 ; ENABLED-NEXT: sub.w r0, r12, r8
59 ; ENABLED-NEXT: bic r0, r0, #3
60 ; ENABLED-NEXT: subs r0, #4
61 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
62 ; ENABLED-NEXT: mov r7, r10
63 ; ENABLED-NEXT: dls lr, r0
64 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload
65 ; ENABLED-NEXT: .LBB0_6: @ %vector.body
66 ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
67 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
68 ; ENABLED-NEXT: vctp.32 r4
69 ; ENABLED-NEXT: vmov q0, q1
70 ; ENABLED-NEXT: vpstt
71 ; ENABLED-NEXT: vldrht.s32 q1, [r0], #8
72 ; ENABLED-NEXT: vldrht.s32 q2, [r7], #8
73 ; ENABLED-NEXT: mov lr, r6
74 ; ENABLED-NEXT: vmul.i32 q1, q2, q1
75 ; ENABLED-NEXT: subs r6, #1
76 ; ENABLED-NEXT: vshl.s32 q1, r5
77 ; ENABLED-NEXT: subs r4, #4
78 ; ENABLED-NEXT: vadd.i32 q1, q1, q0
79 ; ENABLED-NEXT: le lr, .LBB0_6
80 ; ENABLED-NEXT: @ %bb.7: @ %middle.block
81 ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
82 ; ENABLED-NEXT: vpsel q0, q1, q0
83 ; ENABLED-NEXT: vaddv.u32 r0, q0
84 ; ENABLED-NEXT: b .LBB0_3
85 ; ENABLED-NEXT: .LBB0_8: @ %for.end17
86 ; ENABLED-NEXT: add sp, #4
87 ; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
89 ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
90 ; NOREDUCTIONS: @ %bb.0: @ %entry
91 ; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
92 ; NOREDUCTIONS-NEXT: sub sp, #4
93 ; NOREDUCTIONS-NEXT: cmp r3, #1
94 ; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill
95 ; NOREDUCTIONS-NEXT: blt .LBB0_8
96 ; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph
97 ; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
98 ; NOREDUCTIONS-NEXT: add.w r12, r2, #3
99 ; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload
100 ; NOREDUCTIONS-NEXT: mov.w r8, #0
101 ; NOREDUCTIONS-NEXT: mov r9, r12
102 ; NOREDUCTIONS-NEXT: uxth r0, r0
103 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
104 ; NOREDUCTIONS-NEXT: b .LBB0_4
105 ; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
106 ; NOREDUCTIONS-NEXT: movs r0, #0
107 ; NOREDUCTIONS-NEXT: .LBB0_3: @ %for.end
108 ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
109 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16
110 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1
111 ; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1]
112 ; NOREDUCTIONS-NEXT: add.w r8, r8, #1
113 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2
114 ; NOREDUCTIONS-NEXT: cmp r8, r3
115 ; NOREDUCTIONS-NEXT: beq .LBB0_8
116 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body
117 ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1
118 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2
119 ; NOREDUCTIONS-NEXT: cmp r2, r8
120 ; NOREDUCTIONS-NEXT: ble .LBB0_2
121 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
122 ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
123 ; NOREDUCTIONS-NEXT: bic r0, r9, #3
124 ; NOREDUCTIONS-NEXT: movs r7, #1
125 ; NOREDUCTIONS-NEXT: subs r0, #4
126 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r8
127 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
128 ; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
129 ; NOREDUCTIONS-NEXT: sub.w r0, r12, r8
130 ; NOREDUCTIONS-NEXT: bic r0, r0, #3
131 ; NOREDUCTIONS-NEXT: subs r0, #4
132 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
133 ; NOREDUCTIONS-NEXT: mov r7, r10
134 ; NOREDUCTIONS-NEXT: dls lr, r0
135 ; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload
136 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
137 ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
138 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
139 ; NOREDUCTIONS-NEXT: vctp.32 r4
140 ; NOREDUCTIONS-NEXT: vmov q0, q1
141 ; NOREDUCTIONS-NEXT: vpstt
142 ; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8
143 ; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8
144 ; NOREDUCTIONS-NEXT: mov lr, r6
145 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
146 ; NOREDUCTIONS-NEXT: subs r6, #1
147 ; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
148 ; NOREDUCTIONS-NEXT: subs r4, #4
149 ; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0
150 ; NOREDUCTIONS-NEXT: le lr, .LBB0_6
151 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block
152 ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
153 ; NOREDUCTIONS-NEXT: vpsel q0, q1, q0
154 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
155 ; NOREDUCTIONS-NEXT: b .LBB0_3
156 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17
157 ; NOREDUCTIONS-NEXT: add sp, #4
158 ; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
159 entry:
160 %conv = sext i16 %N to i32
161 %cmp36 = icmp sgt i16 %N, 0
162 br i1 %cmp36, label %for.body.lr.ph, label %for.end17
164 for.body.lr.ph: ; preds = %entry
165 %conv2 = sext i16 %Size to i32
166 %conv1032 = zext i16 %Scale to i32
167 %i = add i32 %conv2, 3
168 br label %for.body
170 for.body: ; preds = %for.end, %for.body.lr.ph
171 %lsr.iv51 = phi i32 [ %lsr.iv.next, %for.end ], [ %i, %for.body.lr.ph ]
172 %lsr.iv46 = phi i16* [ %scevgep47, %for.end ], [ %Input, %for.body.lr.ph ]
173 %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
174 %i1 = mul nsw i32 %i.037, -1
175 %i2 = add i32 %i, %i1
176 %i3 = lshr i32 %i2, 2
177 %i4 = shl nuw i32 %i3, 2
178 %i5 = add i32 %i4, -4
179 %i6 = lshr i32 %i5, 2
180 %i7 = add nuw nsw i32 %i6, 1
181 %i8 = sub i32 %conv2, %i.037
182 %cmp433 = icmp slt i32 %i.037, %conv2
183 br i1 %cmp433, label %vector.ph, label %for.end
185 vector.ph: ; preds = %for.body
186 %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
187 br label %vector.body
189 vector.body: ; preds = %vector.body, %vector.ph
190 %lsr.iv48 = phi i16* [ %scevgep49, %vector.body ], [ %lsr.iv46, %vector.ph ]
191 %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ]
192 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
193 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i16, %vector.body ]
194 %i9 = phi i32 [ %start, %vector.ph ], [ %i17, %vector.body ]
195 %lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
196 %lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
197 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i8)
198 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
199 %i10 = sext <4 x i16> %wide.masked.load to <4 x i32>
200 %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
201 %i11 = sext <4 x i16> %wide.masked.load42 to <4 x i32>
202 %i12 = mul nsw <4 x i32> %i11, %i10
203 %i13 = insertelement <4 x i32> undef, i32 %conv1032, i32 0
204 %i14 = shufflevector <4 x i32> %i13, <4 x i32> undef, <4 x i32> zeroinitializer
205 %i15 = ashr <4 x i32> %i12, %i14
206 %i16 = add <4 x i32> %i15, %vec.phi
207 %index.next = add i32 %index, 4
208 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4
209 %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 4
210 %i17 = call i32 @llvm.loop.decrement.reg.i32(i32 %i9, i32 1)
211 %i18 = icmp ne i32 %i17, 0
212 br i1 %i18, label %vector.body, label %middle.block
214 middle.block: ; preds = %vector.body
215 %i19 = select <4 x i1> %active.lane.mask, <4 x i32> %i16, <4 x i32> %vec.phi
216 %i20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i19)
217 br label %for.end
219 for.end: ; preds = %middle.block, %for.body
220 %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %i20, %middle.block ]
221 %i21 = lshr i32 %Sum.0.lcssa, 16
222 %conv13 = trunc i32 %i21 to i16
223 %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037
224 store i16 %conv13, i16* %arrayidx14, align 2
225 %inc16 = add nuw nsw i32 %i.037, 1
226 %scevgep47 = getelementptr i16, i16* %lsr.iv46, i32 1
227 %lsr.iv.next = add i32 %lsr.iv51, -1
228 %exitcond39 = icmp eq i32 %inc16, %conv
229 br i1 %exitcond39, label %for.end17, label %for.body
231 for.end17: ; preds = %for.end, %entry
232 ret void
233 }
235 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
236 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
237 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
238 declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
239 declare i32 @llvm.start.loop.iterations.i32(i32)