1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
4 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
5 target triple = "aarch64"
7 %struct.weight_t = type { i32, i32 }
; @f_noalias: for each of the four bytes src[0..3], zero-extend the byte to i32,
; multiply it by the first i32 field of %w, add the second i32 field, and store
; the result narrowed to i8 into dst[0..3]. A sum that fails the unsigned
; "< 256" test is replaced before truncation: by -1 (0xFF after trunc) when it
; is signed-positive, by 0 otherwise.
; All three pointer arguments are 'noalias'. The assertions below expect the
; four scalar chains to be merged into one <4 x i8> load, <4 x i32> arithmetic
; with the two %w fields splatted, and one <4 x i8> store.
9 define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonly %src, %struct.weight_t* noalias nocapture readonly %w) {
10 ; CHECK-LABEL: @f_noalias(
12 ; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0
13 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16
14 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1
15 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4
16 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 1
17 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 1
18 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2
19 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2
20 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3
21 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>*
22 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
23 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
24 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
25 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
26 ; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP4]]
27 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
28 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer
29 ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP6]], [[SHUFFLE1]]
30 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], <i32 256, i32 256, i32 256, i32 256>
31 ; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer
32 ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
33 ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> [[TMP11]]
34 ; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
35 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
36 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[DST]] to <4 x i8>*
37 ; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1
38 ; CHECK-NEXT: ret void
; Load both i32 fields of %w once; %0 (field 0) and %1 (field 1) are reused by
; all four lanes below.
41 %scale = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 0
42 %0 = load i32, i32* %scale, align 16
43 %offset = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 1
44 %1 = load i32, i32* %offset, align 4
; Lane 0: src[0] -> dst[0].
45 %2 = load i8, i8* %src, align 1
46 %conv = zext i8 %2 to i32
47 %mul = mul nsw i32 %0, %conv
48 %add = add nsw i32 %mul, %1
; Keep %add when it is (unsigned) below 256; otherwise pick sext(%add > 0),
; i.e. -1 for a positive value and 0 for a non-positive one.
49 %tobool.not.i = icmp ult i32 %add, 256
50 %3 = icmp sgt i32 %add, 0
51 %shr.i = sext i1 %3 to i32
52 %cond.i = select i1 %tobool.not.i, i32 %add, i32 %shr.i
53 %conv.i = trunc i32 %cond.i to i8
54 store i8 %conv.i, i8* %dst, align 1
; Lane 1: src[1] -> dst[1], same computation.
55 %arrayidx.1 = getelementptr inbounds i8, i8* %src, i64 1
56 %4 = load i8, i8* %arrayidx.1, align 1
57 %conv.1 = zext i8 %4 to i32
58 %mul.1 = mul nsw i32 %0, %conv.1
59 %add.1 = add nsw i32 %mul.1, %1
60 %tobool.not.i.1 = icmp ult i32 %add.1, 256
61 %5 = icmp sgt i32 %add.1, 0
62 %shr.i.1 = sext i1 %5 to i32
63 %cond.i.1 = select i1 %tobool.not.i.1, i32 %add.1, i32 %shr.i.1
64 %conv.i.1 = trunc i32 %cond.i.1 to i8
65 %arrayidx2.1 = getelementptr inbounds i8, i8* %dst, i64 1
66 store i8 %conv.i.1, i8* %arrayidx2.1, align 1
; Lane 2: src[2] -> dst[2], same computation.
67 %arrayidx.2 = getelementptr inbounds i8, i8* %src, i64 2
68 %6 = load i8, i8* %arrayidx.2, align 1
69 %conv.2 = zext i8 %6 to i32
70 %mul.2 = mul nsw i32 %0, %conv.2
71 %add.2 = add nsw i32 %mul.2, %1
72 %tobool.not.i.2 = icmp ult i32 %add.2, 256
73 %7 = icmp sgt i32 %add.2, 0
74 %shr.i.2 = sext i1 %7 to i32
75 %cond.i.2 = select i1 %tobool.not.i.2, i32 %add.2, i32 %shr.i.2
76 %conv.i.2 = trunc i32 %cond.i.2 to i8
77 %arrayidx2.2 = getelementptr inbounds i8, i8* %dst, i64 2
78 store i8 %conv.i.2, i8* %arrayidx2.2, align 1
; Lane 3: src[3] -> dst[3], same computation.
79 %arrayidx.3 = getelementptr inbounds i8, i8* %src, i64 3
80 %8 = load i8, i8* %arrayidx.3, align 1
81 %conv.3 = zext i8 %8 to i32
82 %mul.3 = mul nsw i32 %0, %conv.3
83 %add.3 = add nsw i32 %mul.3, %1
84 %tobool.not.i.3 = icmp ult i32 %add.3, 256
85 %9 = icmp sgt i32 %add.3, 0
86 %shr.i.3 = sext i1 %9 to i32
87 %cond.i.3 = select i1 %tobool.not.i.3, i32 %add.3, i32 %shr.i.3
88 %conv.i.3 = trunc i32 %cond.i.3 to i8
89 %arrayidx2.3 = getelementptr inbounds i8, i8* %dst, i64 3
90 store i8 %conv.i.3, i8* %arrayidx2.3, align 1
94 ; This is the same test as above, except that the pointers don't have 'noalias'.
95 ; This currently prevents SLP vectorization, but the SLP vectorizer should
96 ; be taught to emit runtime checks enabling vectorization.
; @f_alias: four scalar lanes, each computing
;   dst[i] = trunc(select(sum <u 256, sum, sext(sum >s 0)))
; with sum = field0(%w) * zext(src[i]) + field1(%w), for i = 0..3.
; None of the pointer arguments is 'noalias', and each store to %dst is
; followed by a later load from %src. The assertions below expect the scalar
; instruction sequence to come out of the pass unchanged (no vector ops).
98 define void @f_alias(i8* nocapture %dst, i8* nocapture readonly %src, %struct.weight_t* nocapture readonly %w) {
99 ; CHECK-LABEL: @f_alias(
101 ; CHECK-NEXT: [[SCALE:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T:%.*]], %struct.weight_t* [[W:%.*]], i64 0, i32 0
102 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16
103 ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1
104 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4
105 ; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1
106 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP2]] to i32
107 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]]
108 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]]
109 ; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ult i32 [[ADD]], 256
110 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD]], 0
111 ; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32
112 ; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]]
113 ; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8
114 ; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1
115 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1
116 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
117 ; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32
118 ; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP0]], [[CONV_1]]
119 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[TMP1]]
120 ; CHECK-NEXT: [[TOBOOL_NOT_I_1:%.*]] = icmp ult i32 [[ADD_1]], 256
121 ; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_1]], 0
122 ; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP5]] to i32
123 ; CHECK-NEXT: [[COND_I_1:%.*]] = select i1 [[TOBOOL_NOT_I_1]], i32 [[ADD_1]], i32 [[SHR_I_1]]
124 ; CHECK-NEXT: [[CONV_I_1:%.*]] = trunc i32 [[COND_I_1]] to i8
125 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1
126 ; CHECK-NEXT: store i8 [[CONV_I_1]], i8* [[ARRAYIDX2_1]], align 1
127 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2
128 ; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1
129 ; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
130 ; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP0]], [[CONV_2]]
131 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[TMP1]]
132 ; CHECK-NEXT: [[TOBOOL_NOT_I_2:%.*]] = icmp ult i32 [[ADD_2]], 256
133 ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_2]], 0
134 ; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP7]] to i32
135 ; CHECK-NEXT: [[COND_I_2:%.*]] = select i1 [[TOBOOL_NOT_I_2]], i32 [[ADD_2]], i32 [[SHR_I_2]]
136 ; CHECK-NEXT: [[CONV_I_2:%.*]] = trunc i32 [[COND_I_2]] to i8
137 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2
138 ; CHECK-NEXT: store i8 [[CONV_I_2]], i8* [[ARRAYIDX2_2]], align 1
139 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3
140 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1
141 ; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP8]] to i32
142 ; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP0]], [[CONV_3]]
143 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[MUL_3]], [[TMP1]]
144 ; CHECK-NEXT: [[TOBOOL_NOT_I_3:%.*]] = icmp ult i32 [[ADD_3]], 256
145 ; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_3]], 0
146 ; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP9]] to i32
147 ; CHECK-NEXT: [[COND_I_3:%.*]] = select i1 [[TOBOOL_NOT_I_3]], i32 [[ADD_3]], i32 [[SHR_I_3]]
148 ; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8
149 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
150 ; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], align 1
151 ; CHECK-NEXT: ret void
; Load both i32 fields of %w once; %0 (field 0) and %1 (field 1) are reused by
; all four lanes below.
154 %scale = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 0
155 %0 = load i32, i32* %scale, align 16
156 %offset = getelementptr inbounds %struct.weight_t, %struct.weight_t* %w, i64 0, i32 1
157 %1 = load i32, i32* %offset, align 4
; Lane 0: src[0] -> dst[0].
158 %2 = load i8, i8* %src, align 1
159 %conv = zext i8 %2 to i32
160 %mul = mul nsw i32 %0, %conv
161 %add = add nsw i32 %mul, %1
; Keep %add when it is (unsigned) below 256; otherwise pick sext(%add > 0),
; i.e. -1 for a positive value and 0 for a non-positive one.
162 %tobool.not.i = icmp ult i32 %add, 256
163 %3 = icmp sgt i32 %add, 0
164 %shr.i = sext i1 %3 to i32
165 %cond.i = select i1 %tobool.not.i, i32 %add, i32 %shr.i
166 %conv.i = trunc i32 %cond.i to i8
; This store to %dst comes before the remaining loads from %src; nothing in the
; signature rules out the two buffers overlapping.
167 store i8 %conv.i, i8* %dst, align 1
; Lane 1: src[1] -> dst[1], same computation.
168 %arrayidx.1 = getelementptr inbounds i8, i8* %src, i64 1
169 %4 = load i8, i8* %arrayidx.1, align 1
170 %conv.1 = zext i8 %4 to i32
171 %mul.1 = mul nsw i32 %0, %conv.1
172 %add.1 = add nsw i32 %mul.1, %1
173 %tobool.not.i.1 = icmp ult i32 %add.1, 256
174 %5 = icmp sgt i32 %add.1, 0
175 %shr.i.1 = sext i1 %5 to i32
176 %cond.i.1 = select i1 %tobool.not.i.1, i32 %add.1, i32 %shr.i.1
177 %conv.i.1 = trunc i32 %cond.i.1 to i8
178 %arrayidx2.1 = getelementptr inbounds i8, i8* %dst, i64 1
179 store i8 %conv.i.1, i8* %arrayidx2.1, align 1
; Lane 2: src[2] -> dst[2], same computation.
180 %arrayidx.2 = getelementptr inbounds i8, i8* %src, i64 2
181 %6 = load i8, i8* %arrayidx.2, align 1
182 %conv.2 = zext i8 %6 to i32
183 %mul.2 = mul nsw i32 %0, %conv.2
184 %add.2 = add nsw i32 %mul.2, %1
185 %tobool.not.i.2 = icmp ult i32 %add.2, 256
186 %7 = icmp sgt i32 %add.2, 0
187 %shr.i.2 = sext i1 %7 to i32
188 %cond.i.2 = select i1 %tobool.not.i.2, i32 %add.2, i32 %shr.i.2
189 %conv.i.2 = trunc i32 %cond.i.2 to i8
190 %arrayidx2.2 = getelementptr inbounds i8, i8* %dst, i64 2
191 store i8 %conv.i.2, i8* %arrayidx2.2, align 1
; Lane 3: src[3] -> dst[3], same computation.
192 %arrayidx.3 = getelementptr inbounds i8, i8* %src, i64 3
193 %8 = load i8, i8* %arrayidx.3, align 1
194 %conv.3 = zext i8 %8 to i32
195 %mul.3 = mul nsw i32 %0, %conv.3
196 %add.3 = add nsw i32 %mul.3, %1
197 %tobool.not.i.3 = icmp ult i32 %add.3, 256
198 %9 = icmp sgt i32 %add.3, 0
199 %shr.i.3 = sext i1 %9 to i32
200 %cond.i.3 = select i1 %tobool.not.i.3, i32 %add.3, i32 %shr.i.3
201 %conv.i.3 = trunc i32 %cond.i.3 to i8
202 %arrayidx2.3 = getelementptr inbounds i8, i8* %dst, i64 3
203 store i8 %conv.i.3, i8* %arrayidx2.3, align 1