; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s
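
; remat_vctp: the vctp32 intrinsic takes the loop counter %i8 directly and
; matches the loop decrement, so the ARMLowOverheadLoops pass is expected to
; rematerialise the VCTP and emit a fully tail-predicated dlstp/letp loop
; with no explicit vctp in the body.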
define void @remat_vctp(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i16 zeroext %arg5) {
; CHECK-LABEL: remat_vctp:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    ldrd r5, r12, [sp, #64]
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    vmov.i32 q1, #0x3f
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB0_1: @ %bb6
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
; CHECK-NEXT:    vabs.s32 q4, q3
; CHECK-NEXT:    vcls.s32 q2, q4
; CHECK-NEXT:    vshl.u32 q4, q4, q2
; CHECK-NEXT:    vadd.i32 q2, q2, r4
; CHECK-NEXT:    vshr.u32 q5, q4, #24
; CHECK-NEXT:    vand q5, q5, q1
; CHECK-NEXT:    vldrw.u32 q6, [r5, q5, uxtw #2]
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q4
; CHECK-NEXT:    vqsub.s32 q5, q0, q5
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqshl.s32 q5, q5, #1
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqsub.s32 q4, q0, q4
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqshl.s32 q4, q4, #1
; CHECK-NEXT:    vpt.s32 lt, q3, zr
; CHECK-NEXT:    vnegt.s32 q4, q4
; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
; CHECK-NEXT:    vqrdmulh.s32 q3, q3, q4
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    vstrw.32 q2, [r3], #16
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %bb44
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
bb:
  %i = zext i16 %arg5 to i32
  br label %bb6

bb6:                                              ; preds = %bb6, %bb
  %i7 = phi ptr [ %arg3, %bb ], [ %i38, %bb6 ]
  %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ]
  %i9 = phi ptr [ %arg2, %bb ], [ %i41, %bb6 ]
  %i10 = phi ptr [ %arg1, %bb ], [ %i40, %bb6 ]
  %i11 = phi ptr [ %arg, %bb ], [ %i39, %bb6 ]
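  ; %i12 is a plain vctp32 of the loop counter %i8, matching the -4 decrement
  ; below; that is what should make the VCTP rematerialisable. (The body itself
  ; appears to compute a Q31 fixed-point reciprocal: normalise with vcls, table
  ; lookup, then Newton-Raphson style vqrdmulh refinement.)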
  %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8)
  %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i11, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer)
  %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i10, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer)
  %i17 = icmp slt <4 x i32> %i16, zeroinitializer
  %i18 = sub <4 x i32> zeroinitializer, %i16
  %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16
  %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19)
  %i21 = shl <4 x i32> %i19, %i20
  %i22 = add <4 x i32> %i20, <i32 1, i32 1, i32 1, i32 1>
  %i23 = lshr <4 x i32> %i21, <i32 24, i32 24, i32 24, i32 24>
  %i24 = and <4 x i32> %i23, <i32 63, i32 63, i32 63, i32 63>
  %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0)
  %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21)
  %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i26)
  %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27)
  %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0)
  %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21)
  %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i30)
  %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31)
  %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0)
  %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33)
  %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i35, ptr %i9, i32 4, <4 x i1> %i12)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i22, ptr %i7, i32 4, <4 x i1> %i12)
  %i38 = getelementptr inbounds i32, ptr %i7, i32 4
  %i39 = getelementptr inbounds i32, ptr %i11, i32 4
  %i40 = getelementptr inbounds i32, ptr %i10, i32 4
  %i41 = getelementptr inbounds i32, ptr %i9, i32 4
  %i42 = add nsw i32 %i8, -4
  %i43 = icmp sgt i32 %i8, 4
  br i1 %i43, label %bb6, label %bb44

bb44:                                             ; preds = %bb6
  ret void
}
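
; dont_remat_predicated_vctp: here the loop predicate is the AND of two vctp32
; results rather than a plain vctp32 of the counter, so the VCTP should not be
; rematerialised; the loop stays a plain le loop with an explicit vctp/vpst
; sequence and a spilled p0 predicate in the body.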
define void @dont_remat_predicated_vctp(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i16 zeroext %arg5, i32 %conv.mask) {
; CHECK-LABEL: dont_remat_predicated_vctp:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldrd r6, r12, [sp, #72]
; CHECK-NEXT:    movs r4, #4
; CHECK-NEXT:    cmp.w r12, #4
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    csel r5, r12, r4, lt
; CHECK-NEXT:    vmov.i32 q1, #0x3f
; CHECK-NEXT:    sub.w r5, r12, r5
; CHECK-NEXT:    add.w lr, r5, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    add.w lr, r5, lr, lsr #2
; CHECK-NEXT:  .LBB1_1: @ %bb6
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vctpt.32 r4
; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q3, [r1], #16
; CHECK-NEXT:    vabs.s32 q4, q3
; CHECK-NEXT:    vcls.s32 q2, q4
; CHECK-NEXT:    vshl.u32 q4, q4, q2
; CHECK-NEXT:    vadd.i32 q2, q2, r5
; CHECK-NEXT:    vshr.u32 q5, q4, #24
; CHECK-NEXT:    vand q5, q5, q1
; CHECK-NEXT:    vldrw.u32 q6, [r6, q5, uxtw #2]
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q4
; CHECK-NEXT:    vqsub.s32 q5, q0, q5
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqshl.s32 q5, q5, #1
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqsub.s32 q4, q0, q4
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqshl.s32 q4, q4, #1
; CHECK-NEXT:    vpt.s32 lt, q3, zr
; CHECK-NEXT:    vnegt.s32 q4, q4
; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q3, [r0], #16
; CHECK-NEXT:    vqrdmulh.s32 q3, q3, q4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vstrwt.32 q3, [r2], #16
; CHECK-NEXT:    vstrwt.32 q2, [r3], #16
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %bb44
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
bb:
  %i = zext i16 %arg5 to i32
  br label %bb6

bb6:                                              ; preds = %bb6, %bb
  %i7 = phi ptr [ %arg3, %bb ], [ %i38, %bb6 ]
  %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ]
  %i9 = phi ptr [ %arg2, %bb ], [ %i41, %bb6 ]
  %i10 = phi ptr [ %arg1, %bb ], [ %i40, %bb6 ]
  %i11 = phi ptr [ %arg, %bb ], [ %i39, %bb6 ]
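  ; The final predicate %pred is the AND of two vctp32 results (a predicated
  ; VCTP), not a plain vctp32 of %i8, so it must not be rematerialised.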
  %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 4)
  %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8)
  %pred = and <4 x i1> %i12, %mask
  %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i11, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer)
  %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i10, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer)
  %i17 = icmp slt <4 x i32> %i16, zeroinitializer
  %i18 = sub <4 x i32> zeroinitializer, %i16
  %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16
  %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19)
  %i21 = shl <4 x i32> %i19, %i20
  %i22 = add <4 x i32> %i20, <i32 1, i32 1, i32 1, i32 1>
  %i23 = lshr <4 x i32> %i21, <i32 24, i32 24, i32 24, i32 24>
  %i24 = and <4 x i32> %i23, <i32 63, i32 63, i32 63, i32 63>
  %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0)
  %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21)
  %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i26)
  %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27)
  %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0)
  %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21)
  %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i30)
  %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31)
  %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0)
  %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33)
  %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i35, ptr %i9, i32 4, <4 x i1> %pred)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i22, ptr %i7, i32 4, <4 x i1> %pred)
  %i38 = getelementptr inbounds i32, ptr %i7, i32 4
  %i39 = getelementptr inbounds i32, ptr %i11, i32 4
  %i40 = getelementptr inbounds i32, ptr %i10, i32 4
  %i41 = getelementptr inbounds i32, ptr %i9, i32 4
  %i42 = add nsw i32 %i8, -4
  %i43 = icmp sgt i32 %i8, 4
  br i1 %i43, label %bb6, label %bb44

bb44:                                             ; preds = %bb6
  ret void
}

declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>)
declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr, <4 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32>, <4 x i1>, <4 x i32>)