1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
7 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
; Test: two adjacent i8 loads (offsets 0 and 1 from %p0), each zero-extended to
; i64 and inserted into a <2 x i64> build vector.
; Expected results per prefix: the SSE2 run keeps the scalar loads and zexts
; unchanged; the SLM and AVX runs replace them with one <2 x i8> load plus a
; vector zext, with extractelement feeding the original insertelement chain.
13 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
14 ; SSE2-LABEL: @loadext_2i8_to_2i64(
15 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
16 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
17 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
18 ; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
19 ; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
20 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
21 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
22 ; SSE2-NEXT: ret <2 x i64> [[V1]]
24 ; SLM-LABEL: @loadext_2i8_to_2i64(
25 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
26 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
27 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
28 ; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
29 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
30 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
31 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
32 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
33 ; SLM-NEXT: ret <2 x i64> [[V1]]
35 ; AVX-LABEL: @loadext_2i8_to_2i64(
36 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
37 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
38 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
39 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
40 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
41 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
42 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
43 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
44 ; AVX-NEXT: ret <2 x i64> [[V1]]
46 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
47 %i0 = load i8, i8* %p0, align 1
48 %i1 = load i8, i8* %p1, align 1
49 %x0 = zext i8 %i0 to i64
50 %x1 = zext i8 %i1 to i64
51 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
52 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Test: four adjacent i8 loads (offsets 0..3 from %p0), each zero-extended to
; i32 and inserted into a <4 x i32> build vector.
; The assertions use the shared CHECK prefix, so every RUN configuration is
; expected to produce one <4 x i8> load followed by a vector zext to <4 x i32>;
; the now-unused getelementptrs remain in the output.
56 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
57 ; CHECK-LABEL: @loadext_4i8_to_4i32(
58 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
59 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
60 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
61 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
62 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
63 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
64 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
65 ; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
66 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
67 ; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
68 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
69 ; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
70 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
71 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
72 ; CHECK-NEXT: ret <4 x i32> [[V3]]
74 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
75 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
76 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
77 %i0 = load i8, i8* %p0, align 1
78 %i1 = load i8, i8* %p1, align 1
79 %i2 = load i8, i8* %p2, align 1
80 %i3 = load i8, i8* %p3, align 1
81 %x0 = zext i8 %i0 to i32
82 %x1 = zext i8 %i1 to i32
83 %x2 = zext i8 %i2 to i32
84 %x3 = zext i8 %i3 to i32
85 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
86 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
87 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
88 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
; Test: four adjacent i8 loads (offsets 0..3 from %p0), each zero-extended to
; i64 and inserted into a <4 x i64> build vector.
; Expected results per prefix:
;  - SSE2 run: the scalar loads and zexts are left unchanged.
;  - SLM run: one <4 x i8> load and a vector zext to <4 x i64>.
;  - AVX runs: only lanes 0 and 1 are vectorized (<2 x i8> load, zext to
;    <2 x i64>); lanes 2 and 3 keep their scalar load and zext.
92 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
93 ; SSE2-LABEL: @loadext_4i8_to_4i64(
94 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
95 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
96 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
97 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
98 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
99 ; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
100 ; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
101 ; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
102 ; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
103 ; SSE2-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
104 ; SSE2-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
105 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
106 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
107 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
108 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
109 ; SSE2-NEXT: ret <4 x i64> [[V3]]
111 ; SLM-LABEL: @loadext_4i8_to_4i64(
112 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
113 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
114 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
115 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
116 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
117 ; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
118 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
119 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
120 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
121 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
122 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
123 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
124 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
125 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
126 ; SLM-NEXT: ret <4 x i64> [[V3]]
128 ; AVX-LABEL: @loadext_4i8_to_4i64(
129 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
130 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
131 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
132 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
133 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
134 ; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
135 ; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
136 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
137 ; AVX-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
138 ; AVX-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
139 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
140 ; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
141 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
142 ; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
143 ; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
144 ; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
145 ; AVX-NEXT: ret <4 x i64> [[V3]]
147 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
148 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
149 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
150 %i0 = load i8, i8* %p0, align 1
151 %i1 = load i8, i8* %p1, align 1
152 %i2 = load i8, i8* %p2, align 1
153 %i3 = load i8, i8* %p3, align 1
154 %x0 = zext i8 %i0 to i64
155 %x1 = zext i8 %i1 to i64
156 %x2 = zext i8 %i2 to i64
157 %x3 = zext i8 %i3 to i64
158 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
159 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
160 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
161 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
; Test: eight adjacent i8 loads (offsets 0..7 from %p0), each zero-extended to
; i16 and inserted into an <8 x i16> build vector.
; The assertions use the shared CHECK prefix, so every RUN configuration is
; expected to produce one <8 x i8> load followed by a vector zext to <8 x i16>,
; with extractelement feeding the original insertelement chain.
165 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
166 ; CHECK-LABEL: @loadext_8i8_to_8i16(
167 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
168 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
169 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
170 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
171 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
172 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
173 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
174 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
175 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
176 ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
177 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
178 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
179 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
180 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
181 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
182 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
183 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
184 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
185 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
186 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
187 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
188 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
189 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
190 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
191 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
192 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
193 ; CHECK-NEXT: ret <8 x i16> [[V7]]
195 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
196 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
197 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
198 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
199 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
200 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
201 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
202 %i0 = load i8, i8* %p0, align 1
203 %i1 = load i8, i8* %p1, align 1
204 %i2 = load i8, i8* %p2, align 1
205 %i3 = load i8, i8* %p3, align 1
206 %i4 = load i8, i8* %p4, align 1
207 %i5 = load i8, i8* %p5, align 1
208 %i6 = load i8, i8* %p6, align 1
209 %i7 = load i8, i8* %p7, align 1
210 %x0 = zext i8 %i0 to i16
211 %x1 = zext i8 %i1 to i16
212 %x2 = zext i8 %i2 to i16
213 %x3 = zext i8 %i3 to i16
214 %x4 = zext i8 %i4 to i16
215 %x5 = zext i8 %i5 to i16
216 %x6 = zext i8 %i6 to i16
217 %x7 = zext i8 %i7 to i16
218 %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
219 %v1 = insertelement <8 x i16> %v0, i16 %x1, i32 1
220 %v2 = insertelement <8 x i16> %v1, i16 %x2, i32 2
221 %v3 = insertelement <8 x i16> %v2, i16 %x3, i32 3
222 %v4 = insertelement <8 x i16> %v3, i16 %x4, i32 4
223 %v5 = insertelement <8 x i16> %v4, i16 %x5, i32 5
224 %v6 = insertelement <8 x i16> %v5, i16 %x6, i32 6
225 %v7 = insertelement <8 x i16> %v6, i16 %x7, i32 7
; Test: eight adjacent i8 loads (offsets 0..7 from %p0), each zero-extended to
; i32 and inserted into an <8 x i32> build vector.
; The assertions use the shared CHECK prefix, so every RUN configuration is
; expected to produce one <8 x i8> load followed by a vector zext to <8 x i32>,
; with extractelement feeding the original insertelement chain.
229 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
230 ; CHECK-LABEL: @loadext_8i8_to_8i32(
231 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
232 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
233 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
234 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
235 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
236 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
237 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
238 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
239 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
240 ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
241 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
242 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
243 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
244 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
245 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
246 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
247 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
248 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
249 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
250 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
251 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
252 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
253 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
254 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
255 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
256 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
257 ; CHECK-NEXT: ret <8 x i32> [[V7]]
259 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
260 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
261 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
262 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
263 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
264 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
265 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
266 %i0 = load i8, i8* %p0, align 1
267 %i1 = load i8, i8* %p1, align 1
268 %i2 = load i8, i8* %p2, align 1
269 %i3 = load i8, i8* %p3, align 1
270 %i4 = load i8, i8* %p4, align 1
271 %i5 = load i8, i8* %p5, align 1
272 %i6 = load i8, i8* %p6, align 1
273 %i7 = load i8, i8* %p7, align 1
274 %x0 = zext i8 %i0 to i32
275 %x1 = zext i8 %i1 to i32
276 %x2 = zext i8 %i2 to i32
277 %x3 = zext i8 %i3 to i32
278 %x4 = zext i8 %i4 to i32
279 %x5 = zext i8 %i5 to i32
280 %x6 = zext i8 %i6 to i32
281 %x7 = zext i8 %i7 to i32
282 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
283 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
284 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
285 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
286 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
287 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
288 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
289 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
; Test: sixteen adjacent i8 loads (offsets 0..15 from %p0), each zero-extended
; to i16 and inserted into a <16 x i16> build vector.
; The assertions use the shared CHECK prefix, so every RUN configuration is
; expected to produce one <16 x i8> load followed by a vector zext to
; <16 x i16>, with extractelement feeding the original insertelement chain.
293 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
294 ; CHECK-LABEL: @loadext_16i8_to_16i16(
295 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
296 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
297 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
298 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
299 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
300 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
301 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
302 ; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
303 ; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
304 ; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
305 ; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
306 ; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
307 ; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
308 ; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
309 ; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
310 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
311 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
312 ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
313 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
314 ; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
315 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
316 ; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
317 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
318 ; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
319 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
320 ; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
321 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
322 ; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
323 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
324 ; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
325 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
326 ; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
327 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
328 ; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
329 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
330 ; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
331 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
332 ; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
333 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
334 ; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
335 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
336 ; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
337 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
338 ; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
339 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
340 ; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
341 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
342 ; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
343 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
344 ; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
345 ; CHECK-NEXT: ret <16 x i16> [[V15]]
347 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
348 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
349 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
350 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
351 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
352 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
353 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
354 %p8 = getelementptr inbounds i8, i8* %p0, i64 8
355 %p9 = getelementptr inbounds i8, i8* %p0, i64 9
356 %p10 = getelementptr inbounds i8, i8* %p0, i64 10
357 %p11 = getelementptr inbounds i8, i8* %p0, i64 11
358 %p12 = getelementptr inbounds i8, i8* %p0, i64 12
359 %p13 = getelementptr inbounds i8, i8* %p0, i64 13
360 %p14 = getelementptr inbounds i8, i8* %p0, i64 14
361 %p15 = getelementptr inbounds i8, i8* %p0, i64 15
362 %i0 = load i8, i8* %p0, align 1
363 %i1 = load i8, i8* %p1, align 1
364 %i2 = load i8, i8* %p2, align 1
365 %i3 = load i8, i8* %p3, align 1
366 %i4 = load i8, i8* %p4, align 1
367 %i5 = load i8, i8* %p5, align 1
368 %i6 = load i8, i8* %p6, align 1
369 %i7 = load i8, i8* %p7, align 1
370 %i8 = load i8, i8* %p8, align 1
371 %i9 = load i8, i8* %p9, align 1
372 %i10 = load i8, i8* %p10, align 1
373 %i11 = load i8, i8* %p11, align 1
374 %i12 = load i8, i8* %p12, align 1
375 %i13 = load i8, i8* %p13, align 1
376 %i14 = load i8, i8* %p14, align 1
377 %i15 = load i8, i8* %p15, align 1
378 %x0 = zext i8 %i0 to i16
379 %x1 = zext i8 %i1 to i16
380 %x2 = zext i8 %i2 to i16
381 %x3 = zext i8 %i3 to i16
382 %x4 = zext i8 %i4 to i16
383 %x5 = zext i8 %i5 to i16
384 %x6 = zext i8 %i6 to i16
385 %x7 = zext i8 %i7 to i16
386 %x8 = zext i8 %i8 to i16
387 %x9 = zext i8 %i9 to i16
388 %x10 = zext i8 %i10 to i16
389 %x11 = zext i8 %i11 to i16
390 %x12 = zext i8 %i12 to i16
391 %x13 = zext i8 %i13 to i16
392 %x14 = zext i8 %i14 to i16
393 %x15 = zext i8 %i15 to i16
394 %v0 = insertelement <16 x i16> undef, i16 %x0, i32 0
395 %v1 = insertelement <16 x i16> %v0, i16 %x1, i32 1
396 %v2 = insertelement <16 x i16> %v1, i16 %x2, i32 2
397 %v3 = insertelement <16 x i16> %v2, i16 %x3, i32 3
398 %v4 = insertelement <16 x i16> %v3, i16 %x4, i32 4
399 %v5 = insertelement <16 x i16> %v4, i16 %x5, i32 5
400 %v6 = insertelement <16 x i16> %v5, i16 %x6, i32 6
401 %v7 = insertelement <16 x i16> %v6, i16 %x7, i32 7
402 %v8 = insertelement <16 x i16> %v7, i16 %x8, i32 8
403 %v9 = insertelement <16 x i16> %v8, i16 %x9, i32 9
404 %v10 = insertelement <16 x i16> %v9, i16 %x10, i32 10
405 %v11 = insertelement <16 x i16> %v10, i16 %x11, i32 11
406 %v12 = insertelement <16 x i16> %v11, i16 %x12, i32 12
407 %v13 = insertelement <16 x i16> %v12, i16 %x13, i32 13
408 %v14 = insertelement <16 x i16> %v13, i16 %x14, i32 14
409 %v15 = insertelement <16 x i16> %v14, i16 %x15, i32 15
; Test: two adjacent i16 loads (elements 0 and 1 from %p0), each zero-extended
; to i64 and inserted into a <2 x i64> build vector. The scalar loads are
; declared with align 1, and the expected vector load keeps align 1.
; Expected results per prefix: the SSE2 run keeps the scalar loads and zexts
; unchanged; the SLM and AVX runs replace them with one <2 x i16> load plus a
; vector zext to <2 x i64>.
417 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
418 ; SSE2-LABEL: @loadext_2i16_to_2i64(
419 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
420 ; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
421 ; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
422 ; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
423 ; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
424 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
425 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
426 ; SSE2-NEXT: ret <2 x i64> [[V1]]
428 ; SLM-LABEL: @loadext_2i16_to_2i64(
429 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
430 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
431 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
432 ; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
433 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
434 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
435 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
436 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
437 ; SLM-NEXT: ret <2 x i64> [[V1]]
439 ; AVX-LABEL: @loadext_2i16_to_2i64(
440 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
441 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
442 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
443 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
444 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
445 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
446 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
447 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
448 ; AVX-NEXT: ret <2 x i64> [[V1]]
450 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
451 %i0 = load i16, i16* %p0, align 1
452 %i1 = load i16, i16* %p1, align 1
453 %x0 = zext i16 %i0 to i64
454 %x1 = zext i16 %i1 to i64
455 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
456 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Test: four adjacent i16 loads (elements 0..3 from %p0, each align 1), each
; zero-extended to i32 and inserted into a <4 x i32> build vector.
; The assertions use the shared CHECK prefix, so every RUN configuration is
; expected to produce one <4 x i16> load followed by a vector zext to
; <4 x i32>, with extractelement feeding the original insertelement chain.
460 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
461 ; CHECK-LABEL: @loadext_4i16_to_4i32(
462 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
463 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
464 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
465 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
466 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
467 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
468 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
469 ; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
470 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
471 ; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
472 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
473 ; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
474 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
475 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
476 ; CHECK-NEXT: ret <4 x i32> [[V3]]
478 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
479 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
480 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
481 %i0 = load i16, i16* %p0, align 1
482 %i1 = load i16, i16* %p1, align 1
483 %i2 = load i16, i16* %p2, align 1
484 %i3 = load i16, i16* %p3, align 1
485 %x0 = zext i16 %i0 to i32
486 %x1 = zext i16 %i1 to i32
487 %x2 = zext i16 %i2 to i32
488 %x3 = zext i16 %i3 to i32
489 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
490 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
491 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
492 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
; Test: four adjacent i16 loads (elements 0..3 from %p0, each align 1), each
; zero-extended to i64 and inserted into a <4 x i64> build vector.
; Expected results per prefix:
;  - SSE2 run: the scalar loads and zexts are left unchanged.
;  - SLM run: one <4 x i16> load and a vector zext to <4 x i64>.
;  - AVX runs: only lanes 0 and 1 are vectorized (<2 x i16> load, zext to
;    <2 x i64>); lanes 2 and 3 keep their scalar load and zext.
496 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
497 ; SSE2-LABEL: @loadext_4i16_to_4i64(
498 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
499 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
500 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
501 ; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
502 ; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
503 ; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
504 ; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
505 ; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
506 ; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
507 ; SSE2-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
508 ; SSE2-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
509 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
510 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
511 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
512 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
513 ; SSE2-NEXT: ret <4 x i64> [[V3]]
515 ; SLM-LABEL: @loadext_4i16_to_4i64(
516 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
517 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
518 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
519 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
520 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
521 ; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
522 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
523 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
524 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
525 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
526 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
527 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
528 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
529 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
530 ; SLM-NEXT: ret <4 x i64> [[V3]]
532 ; AVX-LABEL: @loadext_4i16_to_4i64(
533 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
534 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
535 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
536 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
537 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
538 ; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
539 ; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
540 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
541 ; AVX-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
542 ; AVX-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
543 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
544 ; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
545 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
546 ; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
547 ; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
548 ; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
549 ; AVX-NEXT: ret <4 x i64> [[V3]]
551 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
552 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
553 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
554 %i0 = load i16, i16* %p0, align 1
555 %i1 = load i16, i16* %p1, align 1
556 %i2 = load i16, i16* %p2, align 1
557 %i3 = load i16, i16* %p3, align 1
558 %x0 = zext i16 %i0 to i64
559 %x1 = zext i16 %i1 to i64
560 %x2 = zext i16 %i2 to i64
561 %x3 = zext i16 %i3 to i64
562 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
563 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
564 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
565 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
; Test: eight i16 values are loaded from consecutive elements starting at %p0,
; each is zero-extended to i32, and the results are inserted lane by lane into
; an <8 x i32>.
; The expectations below use the prefix shared by every RUN line, so all tested
; targets must produce the same output: one <8 x i16> load (align 1), one
; vector zext to <8 x i32>, and an extractelement/insertelement pair per lane
; to rebuild the returned vector. The scalar getelementptrs remain in place.
569 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
570 ; CHECK-LABEL: @loadext_8i16_to_8i32(
571 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
572 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
573 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
574 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
575 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
576 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
577 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
578 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
579 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
580 ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
581 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
582 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
583 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
584 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
585 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
586 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
587 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
588 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
589 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
590 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
591 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
592 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
593 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
594 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
595 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
596 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
597 ; CHECK-NEXT: ret <8 x i32> [[V7]]
; Input IR: addresses of elements 1..7 relative to %p0.
599 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
600 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
601 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
602 %p4 = getelementptr inbounds i16, i16* %p0, i64 4
603 %p5 = getelementptr inbounds i16, i16* %p0, i64 5
604 %p6 = getelementptr inbounds i16, i16* %p0, i64 6
605 %p7 = getelementptr inbounds i16, i16* %p0, i64 7
; Eight scalar loads, each declared with align 1.
606 %i0 = load i16, i16* %p0, align 1
607 %i1 = load i16, i16* %p1, align 1
608 %i2 = load i16, i16* %p2, align 1
609 %i3 = load i16, i16* %p3, align 1
610 %i4 = load i16, i16* %p4, align 1
611 %i5 = load i16, i16* %p5, align 1
612 %i6 = load i16, i16* %p6, align 1
613 %i7 = load i16, i16* %p7, align 1
; Zero-extend every loaded element from i16 to i32.
614 %x0 = zext i16 %i0 to i32
615 %x1 = zext i16 %i1 to i32
616 %x2 = zext i16 %i2 to i32
617 %x3 = zext i16 %i3 to i32
618 %x4 = zext i16 %i4 to i32
619 %x5 = zext i16 %i5 to i32
620 %x6 = zext i16 %i6 to i32
621 %x7 = zext i16 %i7 to i32
; Build the <8 x i32> result one lane at a time, in lane order 0..7.
622 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
623 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
624 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
625 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
626 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
627 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
628 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
629 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
; Test: two i32 values are loaded from %p0 and the following element, each is
; zero-extended to i64, and the results are inserted into a <2 x i64>.
; Expected output depends on the target:
;  * the SSE2 run (no -mcpu given) keeps the two scalar loads and scalar zexts;
;  * the SLM run and all AVX runs expect a single <2 x i32> load (align 1)
;    followed by a vector zext to <2 x i64>, with extractelement and
;    insertelement pairs rebuilding the returned vector.
637 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
638 ; SSE2-LABEL: @loadext_2i32_to_2i64(
639 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
640 ; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
641 ; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
642 ; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
643 ; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
644 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
645 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
646 ; SSE2-NEXT: ret <2 x i64> [[V1]]
648 ; SLM-LABEL: @loadext_2i32_to_2i64(
649 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
650 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
651 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
652 ; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
653 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
654 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
655 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
656 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
657 ; SLM-NEXT: ret <2 x i64> [[V1]]
659 ; AVX-LABEL: @loadext_2i32_to_2i64(
660 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
661 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
662 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
663 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
664 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
665 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
666 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
667 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
668 ; AVX-NEXT: ret <2 x i64> [[V1]]
; Input IR: two adjacent i32 loads (align 1), scalar zext to i64, then a
; two-lane insertelement chain starting from undef.
670 %p1 = getelementptr inbounds i32, i32* %p0, i64 1
671 %i0 = load i32, i32* %p0, align 1
672 %i1 = load i32, i32* %p1, align 1
673 %x0 = zext i32 %i0 to i64
674 %x1 = zext i32 %i1 to i64
675 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
676 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Test: four i32 values are loaded from consecutive elements starting at %p0,
; each is zero-extended to i64, and the results are inserted lane by lane into
; a <4 x i64>.
; Expected output depends on the target:
;  * the SSE2 run (no -mcpu given) keeps all four scalar loads and zexts;
;  * the SLM, AVX2 and AVX512 runs expect one <4 x i32> load (align 1) and one
;    vector zext to <4 x i64>;
;  * the AVX1 run expects only lanes 0 and 1 to be combined (a <2 x i32> load
;    and a <2 x i64> zext), while lanes 2 and 3 keep their scalar load and zext.
; In every vectorized form the result is rebuilt with extractelement and
; insertelement pairs, and the scalar getelementptrs remain in place.
680 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
681 ; SSE2-LABEL: @loadext_4i32_to_4i64(
682 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
683 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
684 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
685 ; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
686 ; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
687 ; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
688 ; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
689 ; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
690 ; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
691 ; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
692 ; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
693 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
694 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
695 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
696 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
697 ; SSE2-NEXT: ret <4 x i64> [[V3]]
699 ; SLM-LABEL: @loadext_4i32_to_4i64(
700 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
701 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
702 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
703 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
704 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
705 ; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
706 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
707 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
708 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
709 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
710 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
711 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
712 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
713 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
714 ; SLM-NEXT: ret <4 x i64> [[V3]]
716 ; AVX1-LABEL: @loadext_4i32_to_4i64(
717 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
718 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
719 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
720 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
721 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
722 ; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
723 ; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
724 ; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
725 ; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
726 ; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
727 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
728 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
729 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
730 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
731 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
732 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
733 ; AVX1-NEXT: ret <4 x i64> [[V3]]
735 ; AVX2-LABEL: @loadext_4i32_to_4i64(
736 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
737 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
738 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
739 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
740 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
741 ; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
742 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
743 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
744 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
745 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
746 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
747 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
748 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
749 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
750 ; AVX2-NEXT: ret <4 x i64> [[V3]]
752 ; AVX512-LABEL: @loadext_4i32_to_4i64(
753 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
754 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
755 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
756 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
757 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
758 ; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
759 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
760 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
761 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
762 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
763 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
764 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
765 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
766 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
767 ; AVX512-NEXT: ret <4 x i64> [[V3]]
; Input IR: addresses of elements 1..3 relative to %p0.
769 %p1 = getelementptr inbounds i32, i32* %p0, i64 1
770 %p2 = getelementptr inbounds i32, i32* %p0, i64 2
771 %p3 = getelementptr inbounds i32, i32* %p0, i64 3
; Four scalar loads, each declared with align 1.
772 %i0 = load i32, i32* %p0, align 1
773 %i1 = load i32, i32* %p1, align 1
774 %i2 = load i32, i32* %p2, align 1
775 %i3 = load i32, i32* %p3, align 1
; Zero-extend every loaded element from i32 to i64.
776 %x0 = zext i32 %i0 to i64
777 %x1 = zext i32 %i1 to i64
778 %x2 = zext i32 %i2 to i64
779 %x3 = zext i32 %i3 to i64
; Build the <4 x i64> result one lane at a time, in lane order 0..3.
780 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
781 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
782 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
783 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3