; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -o - %s | FileCheck --check-prefix=CHECK %s

; @tailpred: pDst[i] = pSrcA[i] + pSrcB[i] over blockSize half-precision
; elements.
;
; The IR is in already-vectorized form:
;   * vector.memcheck does a runtime overlap test of the pDst range against
;     the pSrcA and pSrcB ranges; on a conflict the scalar while.body loop runs.
;   * vector.body is predicated with llvm.get.active.lane.mask and uses masked
;     loads/stores, so it covers the whole trip count with no scalar remainder.
;
; The assertions below expect the scalar loop to become a dls/le low-overhead
; loop and the predicated vector loop to become a dlstp.16/letp loop.
define void @tailpred(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: tailpred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.memcheck
; CHECK-NEXT:    add.w r12, r1, r3, lsl #1
; CHECK-NEXT:    add.w lr, r2, r3, lsl #1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    add.w r4, r0, r3, lsl #1
; CHECK-NEXT:    cset r12, hi
; CHECK-NEXT:    cmp lr, r1
; CHECK-NEXT:    csel r12, zr, r12, ls
; CHECK-NEXT:    cmp lr, r0
; CHECK-NEXT:    cset lr, hi
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    cset r4, hi
; CHECK-NEXT:    tst.w r4, lr
; CHECK-NEXT:    it eq
; CHECK-NEXT:    cmpeq.w r12, #0
; CHECK-NEXT:    beq .LBB0_4
; CHECK-NEXT:  @ %bb.2: @ %while.body.preheader
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:  .LBB0_3: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r0]
; CHECK-NEXT:    vldr.16 s2, [r1]
; CHECK-NEXT:    adds r1, #2
; CHECK-NEXT:    adds r0, #2
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    adds r2, #2
; CHECK-NEXT:    le lr, .LBB0_3
; CHECK-NEXT:    b .LBB0_6
; CHECK-NEXT:  .LBB0_4: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB0_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vadd.f16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB0_5
; CHECK-NEXT:  .LBB0_6: @ %while.end
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; Nothing to do for an empty input.
  %cmp.not6 = icmp eq i32 %blockSize, 0
  br i1 %cmp.not6, label %while.end, label %vector.memcheck

vector.memcheck:                                  ; preds = %entry
  ; One-past-the-end pointers of the three arrays.
  %scevgep = getelementptr half, ptr %pDst, i32 %blockSize
  %scevgep14 = getelementptr half, ptr %pSrcA, i32 %blockSize
  %scevgep17 = getelementptr half, ptr %pSrcB, i32 %blockSize
  ; pDst overlaps pSrcA?
  %bound0 = icmp ugt ptr %scevgep14, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrcA
  %found.conflict = and i1 %bound0, %bound1
  ; pDst overlaps pSrcB?
  %bound019 = icmp ugt ptr %scevgep17, %pDst
  %bound120 = icmp ugt ptr %scevgep, %pSrcB
  %found.conflict21 = and i1 %bound019, %bound120
  ; Any overlap forces the scalar loop.
  %conflict.rdx = or i1 %found.conflict, %found.conflict21
  br i1 %conflict.rdx, label %while.body, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; Round the trip count up to a multiple of 8 lanes; the lane mask disables
  ; the excess lanes in the final iteration.
  %n.rnd.up = add i32 %blockSize, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr half, ptr %pSrcA, i32 %index
  %next.gep28 = getelementptr half, ptr %pDst, i32 %index
  %next.gep29 = getelementptr half, ptr %pSrcB, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize)
  %wide.masked.load = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
  %wide.masked.load32 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %next.gep29, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
  %0 = fadd fast <8 x half> %wide.masked.load32, %wide.masked.load
  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %next.gep28, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %while.end, label %vector.body

while.body:                                       ; preds = %vector.memcheck, %while.body
  ; Scalar fallback: one half element per iteration, counting blockSize down to 0.
  %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blockSize, %vector.memcheck ]
  %pSrcA.addr.09 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %vector.memcheck ]
  %pDst.addr.08 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst, %vector.memcheck ]
  %pSrcB.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrcB, %vector.memcheck ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrcA.addr.09, i32 1
  %2 = load half, ptr %pSrcA.addr.09, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrcB.addr.07, i32 1
  %3 = load half, ptr %pSrcB.addr.07, align 2
  %4 = fadd fast half %3, %2
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.08, i32 1
  store half %4, ptr %pDst.addr.08, align 2
  %dec = add i32 %blkCnt.010, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %vector.body, %while.body, %entry
  ret void
}

; @notailpred: pDst[i] = pSrcA[i] + pSrcB[i] over blockSize half-precision
; elements, vectorized WITHOUT predication.
;
; Structure of the IR:
;   * while.body.preheader sends inputs shorter than 8 elements straight to
;     the scalar loop.
;   * vector.memcheck does a runtime overlap test of pDst against pSrcA and
;     pSrcB; on a conflict the scalar loop handles everything.
;   * vector.body processes blockSize & -8 elements with plain (unmasked)
;     <8 x half> loads/stores.
;   * middle.block hands the remaining blockSize & 7 elements to the scalar
;     while.body loop.
;
; The assertions below expect both loops to use dls/le style low-overhead
; loops, with no dlstp/letp tail predication.
define void @notailpred(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: notailpred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    cbz r3, .LBB1_6
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r3, #8
; CHECK-NEXT:    blo .LBB1_3
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r7, r1, r3, lsl #1
; CHECK-NEXT:    add.w r6, r2, r3, lsl #1
; CHECK-NEXT:    cmp r7, r2
; CHECK-NEXT:    add.w r5, r0, r3, lsl #1
; CHECK-NEXT:    cset r7, hi
; CHECK-NEXT:    cmp r6, r1
; CHECK-NEXT:    csel r7, zr, r7, ls
; CHECK-NEXT:    cmp r6, r0
; CHECK-NEXT:    cset r6, hi
; CHECK-NEXT:    cmp r5, r2
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    tst r5, r6
; CHECK-NEXT:    it eq
; CHECK-NEXT:    cmpeq r7, #0
; CHECK-NEXT:    beq .LBB1_7
; CHECK-NEXT:  .LBB1_3:
; CHECK-NEXT:    mov r5, r3
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r7, r2
; CHECK-NEXT:    mov r4, r1
; CHECK-NEXT:  .LBB1_4: @ %while.body.preheader31
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB1_5: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r12]
; CHECK-NEXT:    vldr.16 s2, [r4]
; CHECK-NEXT:    adds r4, #2
; CHECK-NEXT:    add.w r12, r12, #2
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    vstr.16 s0, [r7]
; CHECK-NEXT:    adds r7, #2
; CHECK-NEXT:    le lr, .LBB1_5
; CHECK-NEXT:  .LBB1_6: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
; CHECK-NEXT:  .LBB1_7: @ %vector.ph
; CHECK-NEXT:    bic r6, r3, #7
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    sub.w r7, r6, #8
; CHECK-NEXT:    add.w r4, r1, r6, lsl #1
; CHECK-NEXT:    add.w r12, r0, r6, lsl #1
; CHECK-NEXT:    add.w lr, r5, r7, lsr #3
; CHECK-NEXT:    add.w r7, r2, r6, lsl #1
; CHECK-NEXT:    and r5, r3, #7
; CHECK-NEXT:  .LBB1_8: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vadd.f16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB1_8
; CHECK-NEXT:  @ %bb.9: @ %middle.block
; CHECK-NEXT:    cmp r6, r3
; CHECK-NEXT:    bne .LBB1_4
; CHECK-NEXT:    b .LBB1_6
entry:
  ; Nothing to do for an empty input.
  %cmp.not6 = icmp eq i32 %blockSize, 0
  br i1 %cmp.not6, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  ; Fewer than one full vector of elements: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %blockSize, 8
  br i1 %min.iters.check, label %while.body.preheader31, label %vector.memcheck

vector.memcheck:                                  ; preds = %while.body.preheader
  ; One-past-the-end pointers of the three arrays.
  %scevgep = getelementptr half, ptr %pDst, i32 %blockSize
  %scevgep14 = getelementptr half, ptr %pSrcA, i32 %blockSize
  %scevgep17 = getelementptr half, ptr %pSrcB, i32 %blockSize
  ; pDst overlaps pSrcA?
  %bound0 = icmp ugt ptr %scevgep14, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrcA
  %found.conflict = and i1 %bound0, %bound1
  ; pDst overlaps pSrcB?
  %bound019 = icmp ugt ptr %scevgep17, %pDst
  %bound120 = icmp ugt ptr %scevgep, %pSrcB
  %found.conflict21 = and i1 %bound019, %bound120
  ; Any overlap forces the scalar loop.
  %conflict.rdx = or i1 %found.conflict, %found.conflict21
  br i1 %conflict.rdx, label %while.body.preheader31, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; n.vec = elements handled by the vector loop; ind.end = scalar remainder.
  %n.vec = and i32 %blockSize, -8
  %ind.end = and i32 %blockSize, 7
  ; Pointers at which the scalar remainder loop resumes.
  %ind.end23 = getelementptr half, ptr %pSrcA, i32 %n.vec
  %ind.end25 = getelementptr half, ptr %pDst, i32 %n.vec
  %ind.end27 = getelementptr half, ptr %pSrcB, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr half, ptr %pSrcA, i32 %index
  %next.gep28 = getelementptr half, ptr %pDst, i32 %index
  %next.gep29 = getelementptr half, ptr %pSrcB, i32 %index
  %wide.load = load <8 x half>, ptr %next.gep, align 2
  %wide.load30 = load <8 x half>, ptr %next.gep29, align 2
  %0 = fadd fast <8 x half> %wide.load30, %wide.load
  store <8 x half> %0, ptr %next.gep28, align 2
  %index.next = add i32 %index, 8
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if blockSize was a multiple of 8; otherwise run the scalar remainder.
  %cmp.n = icmp eq i32 %n.vec, %blockSize
  br i1 %cmp.n, label %while.end, label %while.body.preheader31

while.body.preheader31:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  ; Start state of the scalar loop: the full input when the vector loop was
  ; skipped, or the remainder count/pointers when arriving from middle.block.
  %blkCnt.010.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcA.addr.09.ph = phi ptr [ %pSrcA, %vector.memcheck ], [ %pSrcA, %while.body.preheader ], [ %ind.end23, %middle.block ]
  %pDst.addr.08.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end25, %middle.block ]
  %pSrcB.addr.07.ph = phi ptr [ %pSrcB, %vector.memcheck ], [ %pSrcB, %while.body.preheader ], [ %ind.end27, %middle.block ]
  br label %while.body

while.body:                                       ; preds = %while.body.preheader31, %while.body
  ; Scalar loop: one half element per iteration, counting down to 0.
  %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blkCnt.010.ph, %while.body.preheader31 ]
  %pSrcA.addr.09 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA.addr.09.ph, %while.body.preheader31 ]
  %pDst.addr.08 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.08.ph, %while.body.preheader31 ]
  %pSrcB.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrcB.addr.07.ph, %while.body.preheader31 ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrcA.addr.09, i32 1
  %2 = load half, ptr %pSrcA.addr.09, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrcB.addr.07, i32 1
  %3 = load half, ptr %pSrcB.addr.07, align 2
  %4 = fadd fast half %3, %2
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.08, i32 1
  store half %4, ptr %pDst.addr.08, align 2
  %dec = add i32 %blkCnt.010, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; Intrinsics called from the predicated vector.body of @tailpred.
; NOTE(review): attribute groups #1-#3 are referenced below but their
; definitions are not visible in this part of the file - confirm they exist
; or drop the references.

; Lane predicate for the vector loop, called with (%index, %blockSize).
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
; Masked <8 x half> load: (pointer, alignment immediate, lane mask, passthru).
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>) #2
; Masked <8 x half> store: (value, pointer, alignment immediate, lane mask).
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>) #3