; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
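
; The three RUN lines probe when the loop vectorizer may form partial
; reductions: fixed-width NEON+dotprod (VF 16), plain SVE (VF vscale x 4),
; and SVE with -vectorizer-maximize-bandwidth (wider scalable VFs).

; Reduction chain of the form res += a[i]*b[i] - a[i]*c[i]. Because one link
; of the chain is a subtract, the checks below expect ordinary wide add/sub
; reductions in every configuration, with no partial-reduce intrinsics.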
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]]
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub <vscale x 8 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
entry:
  %cmp28.not = icmp ult i32 %N, 2
  %div27 = lshr i32 %N, 1
  %wide.trip.count = zext nneg i32 %div27 to i64
  br label %for.body

for.cond.cleanup: ; preds = %for.body
  %res.0.lcssa = phi i32 [ %sub, %for.body ]
  ret i32 %res.0.lcssa

for.body: ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
  %a.val = load i8, ptr %a.ptr, align 1
  %b.val = load i8, ptr %b.ptr, align 1
  %c.val = load i8, ptr %c.ptr, align 1
  %a.ext = sext i8 %a.val to i32
  %b.ext = sext i8 %b.val to i32
  %c.ext = sext i8 %c.val to i32
  %mul.ab = mul nsw i32 %a.ext, %b.ext
  %add = add nsw i32 %res, %mul.ab
  %mul.ac = mul nsw i32 %a.ext, %c.ext
  %sub = sub i32 %add, %mul.ac
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
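
; Same chain but with both links adding: res += a[i]*b[i] + a[i]*c[i]. NEON
; and SVE-MAXBW chain two llvm.experimental.vector.partial.reduce.add calls
; through the accumulator; plain SVE keeps wide adds at VF vscale x 4.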
define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
entry:
  %cmp28.not = icmp ult i32 %N, 2
  %div27 = lshr i32 %N, 1
  %wide.trip.count = zext nneg i32 %div27 to i64
  br label %for.body

for.cond.cleanup: ; preds = %for.body
  %res.0.lcssa = phi i32 [ %add.2, %for.body ]
  ret i32 %res.0.lcssa

for.body: ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]
  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
  %a.val = load i8, ptr %a.ptr, align 1
  %b.val = load i8, ptr %b.ptr, align 1
  %c.val = load i8, ptr %c.ptr, align 1
  %a.ext = sext i8 %a.val to i32
  %b.ext = sext i8 %b.val to i32
  %c.ext = sext i8 %c.val to i32
  %mul.ab = mul nsw i32 %a.ext, %b.ext
  %add = add nsw i32 %res, %mul.ab
  %mul.ac = mul nsw i32 %a.ext, %c.ext
  %add.2 = add i32 %add, %mul.ac
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
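
; res = res - a[i]*b[i] + a[i]*c[i]: the chain starts with a subtract, so all
; three configurations fall back to wide sub/add reductions.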
define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]]
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[TMP12]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
entry:
  %cmp28.not = icmp ult i32 %N, 2
  %div27 = lshr i32 %N, 1
  %wide.trip.count = zext nneg i32 %div27 to i64
  br label %for.body

for.cond.cleanup: ; preds = %for.body
  %res.0.lcssa = phi i32 [ %add, %for.body ]
  ret i32 %res.0.lcssa

for.body: ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %res = phi i32 [ 0, %entry ], [ %add, %for.body ]

  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
  %a.val = load i8, ptr %a.ptr, align 1
  %b.val = load i8, ptr %b.ptr, align 1
  %c.val = load i8, ptr %c.ptr, align 1

  %a.ext = sext i8 %a.val to i32
  %b.ext = sext i8 %b.val to i32
  %c.ext = sext i8 %c.val to i32
  %mul.ab = mul nsw i32 %a.ext, %b.ext
  %sub = sub nsw i32 %res, %mul.ab
  %mul.ac = mul nsw i32 %a.ext, %c.ext
  %add = add i32 %sub, %mul.ac
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
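
; res = res - a[i]*b[i] - a[i]*c[i]: both links subtract, so again only wide
; sub reductions are expected, with no partial-reduce intrinsics.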
define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]]
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub <vscale x 8 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
entry:
  %cmp28.not = icmp ult i32 %N, 2
  %div27 = lshr i32 %N, 1
  %wide.trip.count = zext nneg i32 %div27 to i64
  br label %for.body

for.cond.cleanup: ; preds = %for.body
  %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
  ret i32 %res.0.lcssa

for.body: ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]

  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
  %a.val = load i8, ptr %a.ptr, align 1
  %b.val = load i8, ptr %b.ptr, align 1
  %c.val = load i8, ptr %c.ptr, align 1

  %a.ext = sext i8 %a.val to i32
  %b.ext = sext i8 %b.val to i32
  %c.ext = sext i8 %c.val to i32

  %mul.ab = mul nsw i32 %a.ext, %b.ext
  %sub = sub nsw i32 %res, %mul.ab
  %mul.ac = mul nsw i32 %a.ext, %c.ext
  %sub.2 = sub i32 %sub, %mul.ac
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
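
; Three-link chain res += a[i]*b[i] + a[i]*c[i] + b[i]*c[i]: NEON and
; SVE-MAXBW form three chained partial reductions (MAXBW at VF vscale x 16);
; plain SVE keeps wide adds.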
define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP21]] = add <vscale x 4 x i32> [[TMP19]], [[TMP20]]
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 16 x i32> [[TMP14]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]], <vscale x 16 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
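;
; The scalar loop below accumulates three chained products
; (res += a*b; res += a*c; res += b*c). Because every link in the chain
; is an add, the MAXBW lowering above folds each product into an
; @llvm.experimental.vector.partial.reduce.add call, accumulating the
; <vscale x 16 x i32> products into a <vscale x 4 x i32> accumulator.
;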
entry:
  %cmp28.not = icmp ult i32 %N, 2
  %div27 = lshr i32 %N, 1
  %wide.trip.count = zext nneg i32 %div27 to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  %res.0.lcssa = phi i32 [ %add.2, %for.body ]
  ret i32 %res.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]

  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
  %a.val = load i8, ptr %a.ptr, align 1
  %b.val = load i8, ptr %b.ptr, align 1
  %c.val = load i8, ptr %c.ptr, align 1

  %a.ext = sext i8 %a.val to i32
  %b.ext = sext i8 %b.val to i32
  %c.ext = sext i8 %c.val to i32

  %mul.ab = mul nsw i32 %a.ext, %b.ext
  %add = add nsw i32 %res, %mul.ab
  %mul.ac = mul nsw i32 %a.ext, %c.ext
  %add.1 = add nsw i32 %add, %mul.ac
  %mul.bc = mul nsw i32 %b.ext, %c.ext
  %add.2 = add i32 %add.1, %mul.bc
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
}
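
; Same three products as above, but the first and third are subtracted
; (res -= a*b; res += a*c; res -= b*c). The checks below show that once
; the chain contains a subtraction, none of the three configurations
; forms a partial-reduction intrinsic.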
define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]]
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP11]], [[TMP12]]
; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP15]] = sub <16 x i32> [[TMP13]], [[TMP14]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
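;
; For NEON the sub/add chain stays element-wise on a full <16 x i32>
; accumulator; the reduction to scalar happens once, via
; @llvm.vector.reduce.add in the middle block.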
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 4 x i32> [[TMP14]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP21]] = sub <vscale x 4 x i32> [[TMP19]], [[TMP20]]
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = add <vscale x 8 x i32> [[TMP17]], [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP15]]
; CHECK-SVE-MAXBW-NEXT: [[TMP21]] = sub <vscale x 8 x i32> [[TMP19]], [[TMP20]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP21]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
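;
; Even with -vectorizer-maximize-bandwidth the subtracts prevent partial
; reductions here: the accumulator only widens to <vscale x 8 x i32>,
; whereas the all-add chain above folded <vscale x 16 x i32> products
; into a <vscale x 4 x i32> accumulator.
;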
entry:
  %cmp28.not = icmp ult i32 %N, 2
  %div27 = lshr i32 %N, 1
  %wide.trip.count = zext nneg i32 %div27 to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  %res.0.lcssa = phi i32 [ %sub.2, %for.body ]
  ret i32 %res.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]

  %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
  %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
  %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
  %a.val = load i8, ptr %a.ptr, align 1
  %b.val = load i8, ptr %b.ptr, align 1
  %c.val = load i8, ptr %c.ptr, align 1

  %a.ext = sext i8 %a.val to i32
  %b.ext = sext i8 %b.val to i32
  %c.ext = sext i8 %c.val to i32

  %mul.ab = mul nsw i32 %a.ext, %b.ext
  %sub = sub nsw i32 %res, %mul.ab
  %mul.ac = mul nsw i32 %a.ext, %c.ext
  %add = add nsw i32 %sub, %mul.ac
  %mul.bc = mul nsw i32 %b.ext, %c.ext
  %sub.2 = sub i32 %add, %mul.bc
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16) }

!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!1 = distinct !{!1, !0}
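
; !1 is the self-referential loop ID attached to each scalar loop latch;
; llvm.loop.vectorize.scalable.enable asks the vectorizer to use a
; scalable VF for these loops on targets that support it.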