; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE41
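
; Stride-2 interleaved loads feeding a multiply-add over sign-extended i16
; elements, i.e. roughly: d1[i] = s1[2*i] * s2[2*i] + s1[2*i+1] * s2[2*i+1].
; The assertions below record the per-target widening choices: SSE2 stores one
; <4 x i32> per vector iteration, SSE4.1 (also exercised via -mcpu=slm) two,
; AVX1 four plus a <4 x i32> vector epilogue loop, and AVX2 one <8 x i32>.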
define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
; SSE2-LABEL: @test_muladd(
; SSE2-NEXT: entry:
; SSE2-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
; SSE2-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; SSE2: for.body.preheader:
; SSE2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; SSE2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
; SSE2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SSE2: vector.ph:
; SSE2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; SSE2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; SSE2-NEXT: br label [[VECTOR_BODY:%.*]]
; SSE2: vector.body:
; SSE2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; SSE2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
; SSE2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP1]]
; SSE2-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
; SSE2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE2-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; SSE2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]]
; SSE2-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
; SSE2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE2-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
; SSE2-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP4]]
; SSE2-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
; SSE2-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
; SSE2-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]]
; SSE2-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP8]]
; SSE2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
; SSE2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; SSE2-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
; SSE2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SSE2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SSE2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE2: middle.block:
; SSE2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; SSE2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; SSE2: scalar.ph:
; SSE2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; SSE2-NEXT: br label [[FOR_BODY:%.*]]
; SSE2: for.body:
; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; SSE2-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]]
; SSE2-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP17]] to i32
; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]]
; SSE2-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP18]] to i32
; SSE2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
; SSE2-NEXT: [[TMP19:%.*]] = or disjoint i64 [[TMP16]], 1
; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]]
; SSE2-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP20]] to i32
; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]]
; SSE2-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP21]] to i32
; SSE2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; SSE2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; SSE2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
; SSE2-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; SSE2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; SSE2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; SSE2: for.end.loopexit:
; SSE2-NEXT: br label [[FOR_END]]
; SSE2: for.end:
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @test_muladd(
; SSE41-NEXT: entry:
; SSE41-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
; SSE41-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; SSE41: for.body.preheader:
; SSE41-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; SSE41-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
; SSE41-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SSE41: vector.ph:
; SSE41-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
; SSE41-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; SSE41-NEXT: br label [[VECTOR_BODY:%.*]]
; SSE41: vector.body:
; SSE41-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE41-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; SSE41-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; SSE41-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP0]], 1
; SSE41-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; SSE41-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP2]]
; SSE41-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP3]]
; SSE41-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
; SSE41-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE41-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE41-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
; SSE41-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE41-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE41-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; SSE41-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC2]] to <4 x i32>
; SSE41-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP2]]
; SSE41-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP3]]
; SSE41-NEXT: [[WIDE_VEC5:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
; SSE41-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE41-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC5]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE41-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
; SSE41-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE41-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE41-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
; SSE41-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; SSE41-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP8]]
; SSE41-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP9]]
; SSE41-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
; SSE41-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
; SSE41-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
; SSE41-NEXT: [[TMP21:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
; SSE41-NEXT: [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP18]]
; SSE41-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP19]]
; SSE41-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP16]]
; SSE41-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP17]]
; SSE41-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
; SSE41-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
; SSE41-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4
; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4
; SSE41-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP29]], align 4
; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; SSE41-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SSE41-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE41: middle.block:
; SSE41-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; SSE41-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; SSE41: scalar.ph:
; SSE41-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; SSE41-NEXT: br label [[FOR_BODY:%.*]]
; SSE41: for.body:
; SSE41-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; SSE41-NEXT: [[TMP31:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP31]]
; SSE41-NEXT: [[TMP32:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP32]] to i32
; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP31]]
; SSE41-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP33]] to i32
; SSE41-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
; SSE41-NEXT: [[TMP34:%.*]] = or disjoint i64 [[TMP31]], 1
; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP34]]
; SSE41-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP35]] to i32
; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP34]]
; SSE41-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP36]] to i32
; SSE41-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; SSE41-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; SSE41-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
; SSE41-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
; SSE41-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; SSE41-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; SSE41-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; SSE41: for.end.loopexit:
; SSE41-NEXT: br label [[FOR_END]]
; SSE41: for.end:
; SSE41-NEXT: ret void
;
; AVX1-LABEL: @test_muladd(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; AVX1: iter.check:
; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX1: vector.main.loop.iter.check:
; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
; AVX1-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP0]], 1
; AVX1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; AVX1-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP2]], 1
; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP3]], 1
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP4]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]]
; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2
; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
; AVX1-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
; AVX1-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]]
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]]
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]]
; AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP20]], align 2
; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2
; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2
; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32>
; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32>
; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
; AVX1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]]
; AVX1-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]]
; AVX1-NEXT: [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]]
; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP19]]
; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]]
; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]]
; AVX1-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP44]], [[TMP32]]
; AVX1-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[TMP45]], [[TMP33]]
; AVX1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP34]]
; AVX1-NEXT: [[TMP51:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP35]]
; AVX1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4
; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8
; AVX1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12
; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4
; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4
; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4
; AVX1-NEXT: store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX1: vec.epilog.iter.check:
; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
; AVX1: vec.epilog.ph:
; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: vec.epilog.vector.body:
; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY]] ]
; AVX1-NEXT: [[TMP67:%.*]] = add i64 [[INDEX26]], 0
; AVX1-NEXT: [[TMP68:%.*]] = shl nuw nsw i64 [[TMP67]], 1
; AVX1-NEXT: [[TMP69:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP68]]
; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP69]], align 2
; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32>
; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP68]]
; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP54]], align 2
; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32>
; AVX1-NEXT: [[TMP70:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP53]]
; AVX1-NEXT: [[TMP71:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32>
; AVX1-NEXT: [[TMP72:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32>
; AVX1-NEXT: [[TMP73:%.*]] = mul nsw <4 x i32> [[TMP72]], [[TMP71]]
; AVX1-NEXT: [[TMP74:%.*]] = add nsw <4 x i32> [[TMP73]], [[TMP70]]
; AVX1-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP67]]
; AVX1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP75]], i32 0
; AVX1-NEXT: store <4 x i32> [[TMP74]], ptr [[TMP76]], align 4
; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4
; AVX1-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]]
; AVX1-NEXT: br i1 [[TMP77]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX1: vec.epilog.middle.block:
; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]]
; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX1: vec.epilog.scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX1-NEXT: br label [[FOR_BODY1:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]]
; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP62]] to i32
; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP61]]
; AVX1-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP63]] to i32
; AVX1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
; AVX1-NEXT: [[TMP64:%.*]] = or disjoint i64 [[TMP61]], 1
; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP64]]
; AVX1-NEXT: [[TMP65:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP65]] to i32
; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP64]]
; AVX1-NEXT: [[TMP66:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP66]] to i32
; AVX1-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; AVX1-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; AVX1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX1: for.end.loopexit:
; AVX1-NEXT: br label [[FOR_END]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @test_muladd(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
; AVX2-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; AVX2: for.body.preheader:
; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP1]]
; AVX2-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP2]], align 2
; AVX2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; AVX2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i32>
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP1]]
; AVX2-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
; AVX2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32>
; AVX2-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP4]]
; AVX2-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
; AVX2-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
; AVX2-NEXT: [[TMP11:%.*]] = mul nsw <8 x i32> [[TMP10]], [[TMP9]]
; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[TMP11]], [[TMP8]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP14]], align 4
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP16]]
; AVX2-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP17]] to i32
; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP16]]
; AVX2-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP18]] to i32
; AVX2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
; AVX2-NEXT: [[TMP19:%.*]] = or disjoint i64 [[TMP16]], 1
; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP19]]
; AVX2-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP20]] to i32
; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP19]]
; AVX2-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP21]] to i32
; AVX2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; AVX2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; AVX2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX2: for.end.loopexit:
; AVX2-NEXT: br label [[FOR_END]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
entry:
  %cmp30 = icmp sgt i32 %n, 0
  br i1 %cmp30, label %for.body.preheader, label %for.end

for.body.preheader:
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %0 = shl nuw nsw i64 %indvars.iv, 1
  %arrayidx = getelementptr inbounds i16, ptr %s1, i64 %0
  %1 = load i16, ptr %arrayidx, align 2
  %conv = sext i16 %1 to i32
  %arrayidx4 = getelementptr inbounds i16, ptr %s2, i64 %0
  %2 = load i16, ptr %arrayidx4, align 2
  %conv5 = sext i16 %2 to i32
  %mul6 = mul nsw i32 %conv5, %conv
  %3 = or disjoint i64 %0, 1
  %arrayidx10 = getelementptr inbounds i16, ptr %s1, i64 %3
  %4 = load i16, ptr %arrayidx10, align 2
  %conv11 = sext i16 %4 to i32
  %arrayidx15 = getelementptr inbounds i16, ptr %s2, i64 %3
  %5 = load i16, ptr %arrayidx15, align 2
  %conv16 = sext i16 %5 to i32
  %mul17 = mul nsw i32 %conv16, %conv11
  %add18 = add nsw i32 %mul17, %mul6
  %arrayidx20 = getelementptr inbounds i32, ptr %d1, i64 %indvars.iv
  store i32 %add18, ptr %arrayidx20, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.end.loopexit, label %for.body

for.end.loopexit:
  br label %for.end

for.end:
  ret void
}