1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -passes=loop-vectorize -S -o - | FileCheck %s
3 ; RUN: opt < %s -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -o - | FileCheck --check-prefix=MAX-BW %s
; Module target description. The datalayout string decodes as: little-endian
; (e), ELF name mangling (m:e), i64 aligned to 64 bits, x86_fp80 aligned to
; 128 bits, native integer widths of 8/16/32/64 bits, and a 128-bit natural
; stack alignment (S128). The triple selects x86-64 Linux. Neither RUN line
; passes -mattr/-mcpu, so the vectorizer sees the triple's default features.
5 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
6 target triple = "x86_64-unknown-linux-gnu"
8 ; This test checks that the given loop is still beneficial for vectorization
9 ; even if it contains a scalarized load (gather on AVX2).
11 ; Function Attrs: norecurse nounwind readonly uwtable
12 define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
13 ; CHECK-LABEL: @matrix_row_col(
14 ; CHECK-NEXT: iter.check:
15 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
16 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64
17 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
18 ; CHECK: vector.main.loop.iter.check:
19 ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
21 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
23 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
24 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ]
25 ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ]
26 ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ]
27 ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ]
28 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
29 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
30 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
31 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
32 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
33 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
34 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
35 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
36 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
37 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
38 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
39 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
40 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
41 ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
42 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
43 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
44 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
45 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
46 ; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
47 ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
48 ; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
49 ; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
50 ; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
51 ; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
52 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
53 ; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
54 ; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
55 ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
56 ; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
57 ; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
58 ; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
59 ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
60 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
61 ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
62 ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
63 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
64 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
65 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
66 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
67 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
68 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
69 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
70 ; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
71 ; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
72 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
73 ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
74 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
75 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
76 ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
77 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP8]], i64 [[IDXPROM5]]
78 ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP9]], i64 [[IDXPROM5]]
79 ; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP10]], i64 [[IDXPROM5]]
80 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP11]], i64 [[IDXPROM5]]
81 ; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP12]], i64 [[IDXPROM5]]
82 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP13]], i64 [[IDXPROM5]]
83 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP14]], i64 [[IDXPROM5]]
84 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP15]], i64 [[IDXPROM5]]
85 ; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP16]], i64 [[IDXPROM5]]
86 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP17]], i64 [[IDXPROM5]]
87 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP18]], i64 [[IDXPROM5]]
88 ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP19]], i64 [[IDXPROM5]]
89 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP20]], i64 [[IDXPROM5]]
90 ; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP21]], i64 [[IDXPROM5]]
91 ; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP22]], i64 [[IDXPROM5]]
92 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP23]], i64 [[IDXPROM5]]
93 ; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP24]], i64 [[IDXPROM5]]
94 ; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP25]], i64 [[IDXPROM5]]
95 ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP26]], i64 [[IDXPROM5]]
96 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP27]], i64 [[IDXPROM5]]
97 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP28]], i64 [[IDXPROM5]]
98 ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]]
99 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]]
100 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]]
101 ; CHECK-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]]
102 ; CHECK-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]]
103 ; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]]
104 ; CHECK-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]]
105 ; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]]
106 ; CHECK-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
107 ; CHECK-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
108 ; CHECK-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
109 ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
110 ; CHECK-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
111 ; CHECK-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
112 ; CHECK-NEXT: [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
113 ; CHECK-NEXT: [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
114 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
115 ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
116 ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
117 ; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
118 ; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
119 ; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
120 ; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]]
121 ; CHECK-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]]
122 ; CHECK-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
123 ; CHECK-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
124 ; CHECK-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
125 ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
126 ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
127 ; CHECK-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
128 ; CHECK-NEXT: [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
129 ; CHECK-NEXT: [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
130 ; CHECK-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
131 ; CHECK-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
132 ; CHECK-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
133 ; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
134 ; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
135 ; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
136 ; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]]
137 ; CHECK-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]]
138 ; CHECK-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
139 ; CHECK-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
140 ; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
141 ; CHECK-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
142 ; CHECK-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
143 ; CHECK-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
144 ; CHECK-NEXT: [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
145 ; CHECK-NEXT: [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
146 ; CHECK-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
147 ; CHECK-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
148 ; CHECK-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
149 ; CHECK-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
150 ; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
151 ; CHECK-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
152 ; CHECK-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]]
153 ; CHECK-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]]
154 ; CHECK-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
155 ; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
156 ; CHECK-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
157 ; CHECK-NEXT: [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
158 ; CHECK-NEXT: [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
159 ; CHECK-NEXT: [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
160 ; CHECK-NEXT: [[TMP131:%.*]] = insertelement <8 x i32> [[TMP130]], i32 [[TMP123]], i32 3
161 ; CHECK-NEXT: [[TMP132:%.*]] = insertelement <8 x i32> [[TMP131]], i32 [[TMP124]], i32 4
162 ; CHECK-NEXT: [[TMP133:%.*]] = insertelement <8 x i32> [[TMP132]], i32 [[TMP125]], i32 5
163 ; CHECK-NEXT: [[TMP134:%.*]] = insertelement <8 x i32> [[TMP133]], i32 [[TMP126]], i32 6
164 ; CHECK-NEXT: [[TMP135:%.*]] = insertelement <8 x i32> [[TMP134]], i32 [[TMP127]], i32 7
165 ; CHECK-NEXT: [[TMP136:%.*]] = mul nsw <8 x i32> [[TMP87]], [[WIDE_LOAD]]
166 ; CHECK-NEXT: [[TMP137:%.*]] = mul nsw <8 x i32> [[TMP103]], [[WIDE_LOAD4]]
167 ; CHECK-NEXT: [[TMP138:%.*]] = mul nsw <8 x i32> [[TMP119]], [[WIDE_LOAD5]]
168 ; CHECK-NEXT: [[TMP139:%.*]] = mul nsw <8 x i32> [[TMP135]], [[WIDE_LOAD6]]
169 ; CHECK-NEXT: [[TMP140:%.*]] = add <8 x i32> [[VEC_PHI]], splat (i32 4)
170 ; CHECK-NEXT: [[TMP141:%.*]] = add <8 x i32> [[VEC_PHI1]], splat (i32 4)
171 ; CHECK-NEXT: [[TMP142:%.*]] = add <8 x i32> [[VEC_PHI2]], splat (i32 4)
172 ; CHECK-NEXT: [[TMP143:%.*]] = add <8 x i32> [[VEC_PHI3]], splat (i32 4)
173 ; CHECK-NEXT: [[TMP144]] = add <8 x i32> [[TMP140]], [[TMP136]]
174 ; CHECK-NEXT: [[TMP145]] = add <8 x i32> [[TMP141]], [[TMP137]]
175 ; CHECK-NEXT: [[TMP146]] = add <8 x i32> [[TMP142]], [[TMP138]]
176 ; CHECK-NEXT: [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]]
177 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
178 ; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
179 ; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
180 ; CHECK: middle.block:
181 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]]
182 ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]]
183 ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]]
184 ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]])
185 ; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
186 ; CHECK: vec.epilog.iter.check:
187 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
188 ; CHECK: vec.epilog.ph:
189 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
190 ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
191 ; CHECK-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
192 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
193 ; CHECK: vec.epilog.vector.body:
194 ; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[FOR_BODY]] ]
195 ; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], [[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], [[FOR_BODY]] ]
196 ; CHECK-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0
197 ; CHECK-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1
198 ; CHECK-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2
199 ; CHECK-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3
200 ; CHECK-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]]
201 ; CHECK-NEXT: [[TMP153:%.*]] = getelementptr inbounds i32, ptr [[TMP152]], i32 0
202 ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP153]], align 4, !tbaa [[TBAA1]]
203 ; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]]
204 ; CHECK-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]]
205 ; CHECK-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]]
206 ; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP175]], i64 [[IDXPROM5]]
207 ; CHECK-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[TBAA1]]
208 ; CHECK-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[TBAA1]]
209 ; CHECK-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[TBAA1]]
210 ; CHECK-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]]
211 ; CHECK-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> poison, i32 [[TMP158]], i32 0
212 ; CHECK-NEXT: [[TMP163:%.*]] = insertelement <4 x i32> [[TMP162]], i32 [[TMP159]], i32 1
213 ; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP163]], i32 [[TMP160]], i32 2
214 ; CHECK-NEXT: [[TMP165:%.*]] = insertelement <4 x i32> [[TMP164]], i32 [[TMP161]], i32 3
215 ; CHECK-NEXT: [[TMP166:%.*]] = mul nsw <4 x i32> [[TMP165]], [[WIDE_LOAD11]]
216 ; CHECK-NEXT: [[TMP167:%.*]] = add <4 x i32> [[VEC_PHI10]], splat (i32 4)
217 ; CHECK-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]]
218 ; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4
219 ; CHECK-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100
220 ; CHECK-NEXT: br i1 [[TMP169]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
221 ; CHECK: vec.epilog.middle.block:
222 ; CHECK-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]])
223 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
224 ; CHECK: vec.epilog.scalar.ph:
225 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
226 ; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ]
227 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
228 ; CHECK: for.cond.cleanup:
229 ; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
230 ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]]
232 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
233 ; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY1]] ]
234 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
235 ; CHECK-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
236 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
237 ; CHECK-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
238 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]]
239 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4
240 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]]
241 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
242 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
243 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
245 ; MAX-BW-LABEL: @matrix_row_col(
246 ; MAX-BW-NEXT: iter.check:
247 ; MAX-BW-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
248 ; MAX-BW-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64
249 ; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
250 ; MAX-BW: vector.main.loop.iter.check:
251 ; MAX-BW-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
253 ; MAX-BW-NEXT: br label [[VECTOR_BODY:%.*]]
254 ; MAX-BW: vector.body:
255 ; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
256 ; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ]
257 ; MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ]
258 ; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ]
259 ; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ]
260 ; MAX-BW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
261 ; MAX-BW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
262 ; MAX-BW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
263 ; MAX-BW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
264 ; MAX-BW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
265 ; MAX-BW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
266 ; MAX-BW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
267 ; MAX-BW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
268 ; MAX-BW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
269 ; MAX-BW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
270 ; MAX-BW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
271 ; MAX-BW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
272 ; MAX-BW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
273 ; MAX-BW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
274 ; MAX-BW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
275 ; MAX-BW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
276 ; MAX-BW-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
277 ; MAX-BW-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17
278 ; MAX-BW-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18
279 ; MAX-BW-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19
280 ; MAX-BW-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20
281 ; MAX-BW-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21
282 ; MAX-BW-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22
283 ; MAX-BW-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23
284 ; MAX-BW-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24
285 ; MAX-BW-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25
286 ; MAX-BW-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26
287 ; MAX-BW-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27
288 ; MAX-BW-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28
289 ; MAX-BW-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29
290 ; MAX-BW-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
291 ; MAX-BW-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
292 ; MAX-BW-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
293 ; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
294 ; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
295 ; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
296 ; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
297 ; MAX-BW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
298 ; MAX-BW-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
299 ; MAX-BW-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
300 ; MAX-BW-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
301 ; MAX-BW-NEXT: [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
302 ; MAX-BW-NEXT: [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
303 ; MAX-BW-NEXT: [[TMP42:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
304 ; MAX-BW-NEXT: [[TMP43:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
305 ; MAX-BW-NEXT: [[TMP44:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
306 ; MAX-BW-NEXT: [[TMP45:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
307 ; MAX-BW-NEXT: [[TMP46:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
308 ; MAX-BW-NEXT: [[TMP47:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
309 ; MAX-BW-NEXT: [[TMP48:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP8]], i64 [[IDXPROM5]]
310 ; MAX-BW-NEXT: [[TMP49:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP9]], i64 [[IDXPROM5]]
311 ; MAX-BW-NEXT: [[TMP50:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP10]], i64 [[IDXPROM5]]
312 ; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP11]], i64 [[IDXPROM5]]
313 ; MAX-BW-NEXT: [[TMP52:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP12]], i64 [[IDXPROM5]]
314 ; MAX-BW-NEXT: [[TMP53:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP13]], i64 [[IDXPROM5]]
315 ; MAX-BW-NEXT: [[TMP54:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP14]], i64 [[IDXPROM5]]
316 ; MAX-BW-NEXT: [[TMP55:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP15]], i64 [[IDXPROM5]]
317 ; MAX-BW-NEXT: [[TMP56:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP16]], i64 [[IDXPROM5]]
318 ; MAX-BW-NEXT: [[TMP57:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP17]], i64 [[IDXPROM5]]
319 ; MAX-BW-NEXT: [[TMP58:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP18]], i64 [[IDXPROM5]]
320 ; MAX-BW-NEXT: [[TMP59:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP19]], i64 [[IDXPROM5]]
321 ; MAX-BW-NEXT: [[TMP60:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP20]], i64 [[IDXPROM5]]
322 ; MAX-BW-NEXT: [[TMP61:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP21]], i64 [[IDXPROM5]]
323 ; MAX-BW-NEXT: [[TMP62:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP22]], i64 [[IDXPROM5]]
324 ; MAX-BW-NEXT: [[TMP63:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP23]], i64 [[IDXPROM5]]
325 ; MAX-BW-NEXT: [[TMP64:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP24]], i64 [[IDXPROM5]]
326 ; MAX-BW-NEXT: [[TMP65:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP25]], i64 [[IDXPROM5]]
327 ; MAX-BW-NEXT: [[TMP66:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP26]], i64 [[IDXPROM5]]
328 ; MAX-BW-NEXT: [[TMP67:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP27]], i64 [[IDXPROM5]]
329 ; MAX-BW-NEXT: [[TMP68:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP28]], i64 [[IDXPROM5]]
330 ; MAX-BW-NEXT: [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]]
331 ; MAX-BW-NEXT: [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]]
332 ; MAX-BW-NEXT: [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]]
333 ; MAX-BW-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]]
334 ; MAX-BW-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]]
335 ; MAX-BW-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]]
336 ; MAX-BW-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]]
337 ; MAX-BW-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]]
338 ; MAX-BW-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
339 ; MAX-BW-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
340 ; MAX-BW-NEXT: [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
341 ; MAX-BW-NEXT: [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
342 ; MAX-BW-NEXT: [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
343 ; MAX-BW-NEXT: [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
344 ; MAX-BW-NEXT: [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
345 ; MAX-BW-NEXT: [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
346 ; MAX-BW-NEXT: [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
347 ; MAX-BW-NEXT: [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
348 ; MAX-BW-NEXT: [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
349 ; MAX-BW-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
350 ; MAX-BW-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
351 ; MAX-BW-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
352 ; MAX-BW-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]]
353 ; MAX-BW-NEXT: [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]]
354 ; MAX-BW-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
355 ; MAX-BW-NEXT: [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
356 ; MAX-BW-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
357 ; MAX-BW-NEXT: [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
358 ; MAX-BW-NEXT: [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
359 ; MAX-BW-NEXT: [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
360 ; MAX-BW-NEXT: [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
361 ; MAX-BW-NEXT: [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
362 ; MAX-BW-NEXT: [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
363 ; MAX-BW-NEXT: [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
364 ; MAX-BW-NEXT: [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
365 ; MAX-BW-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
366 ; MAX-BW-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
367 ; MAX-BW-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
368 ; MAX-BW-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]]
369 ; MAX-BW-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]]
370 ; MAX-BW-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
371 ; MAX-BW-NEXT: [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
372 ; MAX-BW-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
373 ; MAX-BW-NEXT: [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
374 ; MAX-BW-NEXT: [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
375 ; MAX-BW-NEXT: [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
376 ; MAX-BW-NEXT: [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
377 ; MAX-BW-NEXT: [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
378 ; MAX-BW-NEXT: [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
379 ; MAX-BW-NEXT: [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
380 ; MAX-BW-NEXT: [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
381 ; MAX-BW-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
382 ; MAX-BW-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
383 ; MAX-BW-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
384 ; MAX-BW-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]]
385 ; MAX-BW-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]]
386 ; MAX-BW-NEXT: [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
387 ; MAX-BW-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
388 ; MAX-BW-NEXT: [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
389 ; MAX-BW-NEXT: [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
390 ; MAX-BW-NEXT: [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
391 ; MAX-BW-NEXT: [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
392 ; MAX-BW-NEXT: [[TMP131:%.*]] = insertelement <8 x i32> [[TMP130]], i32 [[TMP123]], i32 3
393 ; MAX-BW-NEXT: [[TMP132:%.*]] = insertelement <8 x i32> [[TMP131]], i32 [[TMP124]], i32 4
394 ; MAX-BW-NEXT: [[TMP133:%.*]] = insertelement <8 x i32> [[TMP132]], i32 [[TMP125]], i32 5
395 ; MAX-BW-NEXT: [[TMP134:%.*]] = insertelement <8 x i32> [[TMP133]], i32 [[TMP126]], i32 6
396 ; MAX-BW-NEXT: [[TMP135:%.*]] = insertelement <8 x i32> [[TMP134]], i32 [[TMP127]], i32 7
397 ; MAX-BW-NEXT: [[TMP136:%.*]] = mul nsw <8 x i32> [[TMP87]], [[WIDE_LOAD]]
398 ; MAX-BW-NEXT: [[TMP137:%.*]] = mul nsw <8 x i32> [[TMP103]], [[WIDE_LOAD4]]
399 ; MAX-BW-NEXT: [[TMP138:%.*]] = mul nsw <8 x i32> [[TMP119]], [[WIDE_LOAD5]]
400 ; MAX-BW-NEXT: [[TMP139:%.*]] = mul nsw <8 x i32> [[TMP135]], [[WIDE_LOAD6]]
401 ; MAX-BW-NEXT: [[TMP140:%.*]] = add <8 x i32> [[VEC_PHI]], splat (i32 4)
402 ; MAX-BW-NEXT: [[TMP141:%.*]] = add <8 x i32> [[VEC_PHI1]], splat (i32 4)
403 ; MAX-BW-NEXT: [[TMP142:%.*]] = add <8 x i32> [[VEC_PHI2]], splat (i32 4)
404 ; MAX-BW-NEXT: [[TMP143:%.*]] = add <8 x i32> [[VEC_PHI3]], splat (i32 4)
405 ; MAX-BW-NEXT: [[TMP144]] = add <8 x i32> [[TMP140]], [[TMP136]]
406 ; MAX-BW-NEXT: [[TMP145]] = add <8 x i32> [[TMP141]], [[TMP137]]
407 ; MAX-BW-NEXT: [[TMP146]] = add <8 x i32> [[TMP142]], [[TMP138]]
408 ; MAX-BW-NEXT: [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]]
409 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
410 ; MAX-BW-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
411 ; MAX-BW-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
412 ; MAX-BW: middle.block:
413 ; MAX-BW-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]]
414 ; MAX-BW-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]]
415 ; MAX-BW-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]]
416 ; MAX-BW-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]])
417 ; MAX-BW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
418 ; MAX-BW: vec.epilog.iter.check:
419 ; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
420 ; MAX-BW: vec.epilog.ph:
421 ; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
422 ; MAX-BW-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
423 ; MAX-BW-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
424 ; MAX-BW-NEXT: br label [[FOR_BODY:%.*]]
425 ; MAX-BW: vec.epilog.vector.body:
426 ; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[FOR_BODY]] ]
427 ; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], [[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], [[FOR_BODY]] ]
428 ; MAX-BW-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0
429 ; MAX-BW-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1
430 ; MAX-BW-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2
431 ; MAX-BW-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3
432 ; MAX-BW-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]]
433 ; MAX-BW-NEXT: [[TMP153:%.*]] = getelementptr inbounds i32, ptr [[TMP152]], i32 0
434 ; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP153]], align 4, !tbaa [[TBAA1]]
435 ; MAX-BW-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]]
436 ; MAX-BW-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]]
437 ; MAX-BW-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]]
438 ; MAX-BW-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP175]], i64 [[IDXPROM5]]
439 ; MAX-BW-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[TBAA1]]
440 ; MAX-BW-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[TBAA1]]
441 ; MAX-BW-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[TBAA1]]
442 ; MAX-BW-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]]
443 ; MAX-BW-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> poison, i32 [[TMP158]], i32 0
444 ; MAX-BW-NEXT: [[TMP163:%.*]] = insertelement <4 x i32> [[TMP162]], i32 [[TMP159]], i32 1
445 ; MAX-BW-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP163]], i32 [[TMP160]], i32 2
446 ; MAX-BW-NEXT: [[TMP165:%.*]] = insertelement <4 x i32> [[TMP164]], i32 [[TMP161]], i32 3
447 ; MAX-BW-NEXT: [[TMP166:%.*]] = mul nsw <4 x i32> [[TMP165]], [[WIDE_LOAD11]]
448 ; MAX-BW-NEXT: [[TMP167:%.*]] = add <4 x i32> [[VEC_PHI10]], splat (i32 4)
449 ; MAX-BW-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]]
450 ; MAX-BW-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4
451 ; MAX-BW-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100
452 ; MAX-BW-NEXT: br i1 [[TMP169]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
453 ; MAX-BW: vec.epilog.middle.block:
454 ; MAX-BW-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]])
455 ; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
456 ; MAX-BW: vec.epilog.scalar.ph:
457 ; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
458 ; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ]
459 ; MAX-BW-NEXT: br label [[FOR_BODY1:%.*]]
460 ; MAX-BW: for.cond.cleanup:
461 ; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
462 ; MAX-BW-NEXT: ret i32 [[ADD7_LCSSA]]
464 ; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
465 ; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY1]] ]
466 ; MAX-BW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
467 ; MAX-BW-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
468 ; MAX-BW-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
469 ; MAX-BW-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
470 ; MAX-BW-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]]
471 ; MAX-BW-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4
472 ; MAX-BW-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]]
473 ; MAX-BW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
474 ; MAX-BW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
475 ; MAX-BW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
478 %idxprom = sext i32 %i to i64
479 %idxprom5 = sext i32 %j to i64
482 for.cond.cleanup: ; preds = %for.body
485 for.body: ; preds = %for.body, %entry
486 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
487 %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
488 ; The first load is consecutive in %indvars.iv, so it is expected to be widened into a single vector load.
489 %arrayidx2 = getelementptr inbounds [100 x i32], ptr %data, i64 %idxprom, i64 %indvars.iv
490 %0 = load i32, ptr %arrayidx2, align 4, !tbaa !1
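491 ; The second load steps a whole [100 x i32] row per iteration (strided), so it is expected to be scalarized: one scalar load per lane.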
492 %arrayidx6 = getelementptr inbounds [100 x i32], ptr %data, i64 %indvars.iv, i64 %idxprom5
493 %1 = load i32, ptr %arrayidx6, align 4, !tbaa !1
494 %mul = mul nsw i32 %1, %0
495 %add = add i32 %sum.015, 4
496 %add7 = add i32 %add, %mul
497 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
498 %exitcond = icmp eq i64 %indvars.iv.next, 100
499 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; @test: the loop walks %iv over 0, 2, 4, ... and continues while %iv.next is
; below 1024. Each iteration loads the adjacent pair A[%iv] and A[%iv + 1],
; adds the two i32 values, truncates the sum to i8 and stores it to B[%iv].
; The CHECK lines expect a single wide load of A that two shufflevectors split
; into even and odd lanes, with the i8 results stored one lane at a time via
; extractelement. The MAX-BW run (-vectorizer-maximize-bandwidth) expects the
; same shape at twice the width: a <32 x i32> wide load and 16 stores per
; vector iteration instead of <16 x i32> and 8.
502 define void @test(ptr %A, ptr noalias %B) #0 {
503 ; CHECK-LABEL: @test(
505 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
507 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
508 ; CHECK: vector.body:
509 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
510 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
511 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
512 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
513 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
514 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
515 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
516 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
517 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
518 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
519 ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP0]], 0
520 ; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP1]], 0
521 ; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP2]], 0
522 ; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP3]], 0
523 ; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP4]], 0
524 ; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP5]], 0
525 ; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 0
526 ; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP7]], 0
527 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP8]]
528 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4
529 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
530 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
531 ; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
532 ; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8>
533 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP8]]
534 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]]
535 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]]
536 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP11]]
537 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP12]]
538 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]]
539 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]]
540 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]]
541 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0
542 ; CHECK-NEXT: store i8 [[TMP28]], ptr [[TMP20]], align 1
543 ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1
544 ; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP21]], align 1
545 ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2
546 ; CHECK-NEXT: store i8 [[TMP30]], ptr [[TMP22]], align 1
547 ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3
548 ; CHECK-NEXT: store i8 [[TMP31]], ptr [[TMP23]], align 1
549 ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4
550 ; CHECK-NEXT: store i8 [[TMP32]], ptr [[TMP24]], align 1
551 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5
552 ; CHECK-NEXT: store i8 [[TMP33]], ptr [[TMP25]], align 1
553 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6
554 ; CHECK-NEXT: store i8 [[TMP34]], ptr [[TMP26]], align 1
555 ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7
556 ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1
557 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
558 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
559 ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
560 ; CHECK: middle.block:
561 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
563 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
564 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
566 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
567 ; CHECK-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0
568 ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1
569 ; CHECK-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]]
570 ; CHECK-NEXT: [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]]
571 ; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[IN0]], align 4
572 ; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[IN1]], align 4
573 ; CHECK-NEXT: [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]]
574 ; CHECK-NEXT: [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8
575 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]]
576 ; CHECK-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1
577 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2
578 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024
579 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
580 ; CHECK: for.cond.cleanup:
581 ; CHECK-NEXT: ret void
583 ; MAX-BW-LABEL: @test(
584 ; MAX-BW-NEXT: entry:
585 ; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
587 ; MAX-BW-NEXT: br label [[VECTOR_BODY:%.*]]
588 ; MAX-BW: vector.body:
589 ; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
590 ; MAX-BW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
591 ; MAX-BW-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
592 ; MAX-BW-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
593 ; MAX-BW-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
594 ; MAX-BW-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
595 ; MAX-BW-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
596 ; MAX-BW-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
597 ; MAX-BW-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
598 ; MAX-BW-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
599 ; MAX-BW-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16
600 ; MAX-BW-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18
601 ; MAX-BW-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20
602 ; MAX-BW-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22
603 ; MAX-BW-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24
604 ; MAX-BW-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26
605 ; MAX-BW-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28
606 ; MAX-BW-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30
607 ; MAX-BW-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 0
608 ; MAX-BW-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 0
609 ; MAX-BW-NEXT: [[TMP18:%.*]] = add nuw nsw i64 [[TMP2]], 0
610 ; MAX-BW-NEXT: [[TMP19:%.*]] = add nuw nsw i64 [[TMP3]], 0
611 ; MAX-BW-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[TMP4]], 0
612 ; MAX-BW-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP5]], 0
613 ; MAX-BW-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP6]], 0
614 ; MAX-BW-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[TMP7]], 0
615 ; MAX-BW-NEXT: [[TMP24:%.*]] = add nuw nsw i64 [[TMP8]], 0
616 ; MAX-BW-NEXT: [[TMP25:%.*]] = add nuw nsw i64 [[TMP9]], 0
617 ; MAX-BW-NEXT: [[TMP26:%.*]] = add nuw nsw i64 [[TMP10]], 0
618 ; MAX-BW-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[TMP11]], 0
619 ; MAX-BW-NEXT: [[TMP28:%.*]] = add nuw nsw i64 [[TMP12]], 0
620 ; MAX-BW-NEXT: [[TMP29:%.*]] = add nuw nsw i64 [[TMP13]], 0
621 ; MAX-BW-NEXT: [[TMP30:%.*]] = add nuw nsw i64 [[TMP14]], 0
622 ; MAX-BW-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[TMP15]], 0
623 ; MAX-BW-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP16]]
624 ; MAX-BW-NEXT: [[WIDE_VEC:%.*]] = load <32 x i32>, ptr [[TMP32]], align 4
625 ; MAX-BW-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
626 ; MAX-BW-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
627 ; MAX-BW-NEXT: [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
628 ; MAX-BW-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
629 ; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]]
630 ; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
631 ; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
632 ; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]]
633 ; MAX-BW-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]]
634 ; MAX-BW-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]]
635 ; MAX-BW-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]]
636 ; MAX-BW-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]]
637 ; MAX-BW-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]]
638 ; MAX-BW-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]]
639 ; MAX-BW-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]]
640 ; MAX-BW-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]]
641 ; MAX-BW-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]]
642 ; MAX-BW-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]]
643 ; MAX-BW-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]]
644 ; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]]
645 ; MAX-BW-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0
646 ; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP36]], align 1
647 ; MAX-BW-NEXT: [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1
648 ; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP37]], align 1
649 ; MAX-BW-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2
650 ; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP38]], align 1
651 ; MAX-BW-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3
652 ; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP39]], align 1
653 ; MAX-BW-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4
654 ; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP40]], align 1
655 ; MAX-BW-NEXT: [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5
656 ; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP41]], align 1
657 ; MAX-BW-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6
658 ; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP42]], align 1
659 ; MAX-BW-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7
660 ; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP43]], align 1
661 ; MAX-BW-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8
662 ; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP44]], align 1
663 ; MAX-BW-NEXT: [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9
664 ; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP45]], align 1
665 ; MAX-BW-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10
666 ; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP46]], align 1
667 ; MAX-BW-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11
668 ; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP47]], align 1
669 ; MAX-BW-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12
670 ; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP48]], align 1
671 ; MAX-BW-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13
672 ; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP49]], align 1
673 ; MAX-BW-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14
674 ; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP50]], align 1
675 ; MAX-BW-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15
676 ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1
677 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
678 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
679 ; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
680 ; MAX-BW: middle.block:
681 ; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
683 ; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
684 ; MAX-BW-NEXT: br label [[FOR_BODY:%.*]]
686 ; MAX-BW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
687 ; MAX-BW-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0
688 ; MAX-BW-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1
689 ; MAX-BW-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]]
690 ; MAX-BW-NEXT: [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]]
691 ; MAX-BW-NEXT: [[V0:%.*]] = load i32, ptr [[IN0]], align 4
692 ; MAX-BW-NEXT: [[V1:%.*]] = load i32, ptr [[IN1]], align 4
693 ; MAX-BW-NEXT: [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]]
694 ; MAX-BW-NEXT: [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8
695 ; MAX-BW-NEXT: [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]]
696 ; MAX-BW-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1
697 ; MAX-BW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2
698 ; MAX-BW-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024
699 ; MAX-BW-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
700 ; MAX-BW: for.cond.cleanup:
701 ; MAX-BW-NEXT: ret void
; Scalar input loop. %iv starts at 0 on entry and takes %iv.next on the back edge.
707 %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
; %iv.0 is %iv plus zero and %iv.1 is the following index, so %in0 and %in1
; below address neighbouring i32 elements of %A.
709 %iv.0 = add nuw nsw i64 %iv, 0
710 %iv.1 = add nuw nsw i64 %iv, 1
712 %in0 = getelementptr inbounds [1024 x i32], ptr %A, i64 0, i64 %iv.0
713 %in1 = getelementptr inbounds [1024 x i32], ptr %A, i64 0, i64 %iv.1
715 %v0 = load i32, ptr %in0
716 %v1 = load i32, ptr %in1
718 %reduce.add.0 = add i32 %v0, %v1
; Only the low 8 bits of the sum are kept for the store into %B.
720 %reduce.add.0.narrow = trunc i32 %reduce.add.0 to i8
; The result is written at the same index as the first load (%iv.0).
722 %out = getelementptr inbounds [1024 x i8], ptr %B, i64 0, i64 %iv.0
723 store i8 %reduce.add.0.narrow, ptr %out
; Advance by 2 (one pair of %A elements per iteration); loop again while the
; new index is still below 1024, otherwise leave through for.cond.cleanup.
725 %iv.next = add nuw nsw i64 %iv.0, 2
726 %cmp = icmp ult i64 %iv.next, 1024
727 br i1 %cmp, label %for.body, label %for.cond.cleanup
; Attribute group #0, referenced by both @matrix_row_col and @test: selects the
; core-avx2 target CPU and spells out the SSE/AVX/AVX2 feature set explicitly.
733 attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
; !0 holds a clang version string.
737 !0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
; !1 is the node attached as `!tbaa !1` to the loads in @matrix_row_col; it
; names !2 ("int") twice with offset 0.
738 !1 = !{!2, !2, i64 0}
; Type chain behind !1: "int" -> "omnipotent char" -> "Simple C/C++ TBAA".
; NOTE(review): read per the LLVM LangRef TBAA scheme as scalar type
; descriptors ending at the root node - confirm against the LangRef.
739 !2 = !{!"int", !3, i64 0}
740 !3 = !{!"omnipotent char", !4, i64 0}
741 !4 = !{!"Simple C/C++ TBAA"}