1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
7 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
; Input: two consecutive i8 loads starting at %p0, each zero-extended to i64
; and inserted into lanes 0 and 1 of a <2 x i64>.
; Expected output: under the SSE2 prefix the scalar loads and zexts are left
; as they are; under the SLM and AVX prefixes the loads are merged into one
; <2 x i8> load followed by a single vector zext to <2 x i64>.
13 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
14 ; SSE2-LABEL: @loadext_2i8_to_2i64(
15 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
16 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
17 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
18 ; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
19 ; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
20 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
21 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
22 ; SSE2-NEXT: ret <2 x i64> [[V1]]
24 ; SLM-LABEL: @loadext_2i8_to_2i64(
25 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
26 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
27 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
28 ; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
29 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
30 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
31 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
32 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
33 ; SLM-NEXT: ret <2 x i64> [[V1]]
35 ; AVX-LABEL: @loadext_2i8_to_2i64(
36 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
37 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
38 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
39 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
40 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
41 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
42 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
43 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
44 ; AVX-NEXT: ret <2 x i64> [[V1]]
46 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
47 %i0 = load i8, i8* %p0, align 1
48 %i1 = load i8, i8* %p1, align 1
49 %x0 = zext i8 %i0 to i64
50 %x1 = zext i8 %i1 to i64
51 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
52 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Input: four consecutive i8 loads starting at %p0, each zero-extended to i32
; and inserted into lanes 0..3 of a <4 x i32>.
; Expected output (shared CHECK prefix, i.e. every RUN configuration): one
; <4 x i8> load and one vector zext to <4 x i32>, with the lanes extracted
; again to feed the original insertelement chain.
56 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
57 ; CHECK-LABEL: @loadext_4i8_to_4i32(
58 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
59 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
60 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
61 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
62 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
63 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
64 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
65 ; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
66 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
67 ; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
68 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
69 ; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
70 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
71 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
72 ; CHECK-NEXT: ret <4 x i32> [[V3]]
74 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
75 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
76 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
77 %i0 = load i8, i8* %p0, align 1
78 %i1 = load i8, i8* %p1, align 1
79 %i2 = load i8, i8* %p2, align 1
80 %i3 = load i8, i8* %p3, align 1
81 %x0 = zext i8 %i0 to i32
82 %x1 = zext i8 %i1 to i32
83 %x2 = zext i8 %i2 to i32
84 %x3 = zext i8 %i3 to i32
85 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
86 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
87 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
88 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
; Input: four consecutive i8 loads starting at %p0, each zero-extended to i64
; and inserted into lanes 0..3 of a <4 x i64>.
; Expected output differs per prefix:
;  - SSE2 prefix, scalar loads and zexts are left untouched;
;  - SLM, AVX2 and AVX512 prefixes, one <4 x i8> load plus one vector zext to
;    <4 x i64>;
;  - AVX1 prefix, only elements 0 and 1 are combined (<2 x i8> load and
;    <2 x i64> zext) while elements 2 and 3 stay scalar.
92 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
93 ; SSE2-LABEL: @loadext_4i8_to_4i64(
94 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
95 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
96 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
97 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
98 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
99 ; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
100 ; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
101 ; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
102 ; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
103 ; SSE2-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
104 ; SSE2-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
105 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
106 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
107 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
108 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
109 ; SSE2-NEXT: ret <4 x i64> [[V3]]
111 ; SLM-LABEL: @loadext_4i8_to_4i64(
112 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
113 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
114 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
115 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
116 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
117 ; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
118 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
119 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
120 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
121 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
122 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
123 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
124 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
125 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
126 ; SLM-NEXT: ret <4 x i64> [[V3]]
128 ; AVX1-LABEL: @loadext_4i8_to_4i64(
129 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
130 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
131 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
132 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
133 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
134 ; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
135 ; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
136 ; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
137 ; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
138 ; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
139 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
140 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
141 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
142 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
143 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
144 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
145 ; AVX1-NEXT: ret <4 x i64> [[V3]]
147 ; AVX2-LABEL: @loadext_4i8_to_4i64(
148 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
149 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
150 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
151 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
152 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
153 ; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
154 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
155 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
156 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
157 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
158 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
159 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
160 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
161 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
162 ; AVX2-NEXT: ret <4 x i64> [[V3]]
164 ; AVX512-LABEL: @loadext_4i8_to_4i64(
165 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
166 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
167 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
168 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
169 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
170 ; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
171 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
172 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
173 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
174 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
175 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
176 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
177 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
178 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
179 ; AVX512-NEXT: ret <4 x i64> [[V3]]
181 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
182 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
183 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
184 %i0 = load i8, i8* %p0, align 1
185 %i1 = load i8, i8* %p1, align 1
186 %i2 = load i8, i8* %p2, align 1
187 %i3 = load i8, i8* %p3, align 1
188 %x0 = zext i8 %i0 to i64
189 %x1 = zext i8 %i1 to i64
190 %x2 = zext i8 %i2 to i64
191 %x3 = zext i8 %i3 to i64
192 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
193 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
194 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
195 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
; Input: eight consecutive i8 loads starting at %p0, each zero-extended to i16
; and inserted into lanes 0..7 of an <8 x i16>.
; Expected output (shared CHECK prefix, i.e. every RUN configuration): one
; <8 x i8> load and one vector zext to <8 x i16>, with the lanes extracted
; again to feed the original insertelement chain.
199 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
200 ; CHECK-LABEL: @loadext_8i8_to_8i16(
201 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
202 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
203 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
204 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
205 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
206 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
207 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
208 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
209 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
210 ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
211 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
212 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
213 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
214 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
215 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
216 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
217 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
218 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
219 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
220 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
221 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
222 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
223 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
224 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
225 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
226 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
227 ; CHECK-NEXT: ret <8 x i16> [[V7]]
229 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
230 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
231 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
232 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
233 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
234 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
235 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
236 %i0 = load i8, i8* %p0, align 1
237 %i1 = load i8, i8* %p1, align 1
238 %i2 = load i8, i8* %p2, align 1
239 %i3 = load i8, i8* %p3, align 1
240 %i4 = load i8, i8* %p4, align 1
241 %i5 = load i8, i8* %p5, align 1
242 %i6 = load i8, i8* %p6, align 1
243 %i7 = load i8, i8* %p7, align 1
244 %x0 = zext i8 %i0 to i16
245 %x1 = zext i8 %i1 to i16
246 %x2 = zext i8 %i2 to i16
247 %x3 = zext i8 %i3 to i16
248 %x4 = zext i8 %i4 to i16
249 %x5 = zext i8 %i5 to i16
250 %x6 = zext i8 %i6 to i16
251 %x7 = zext i8 %i7 to i16
252 %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
253 %v1 = insertelement <8 x i16> %v0, i16 %x1, i32 1
254 %v2 = insertelement <8 x i16> %v1, i16 %x2, i32 2
255 %v3 = insertelement <8 x i16> %v2, i16 %x3, i32 3
256 %v4 = insertelement <8 x i16> %v3, i16 %x4, i32 4
257 %v5 = insertelement <8 x i16> %v4, i16 %x5, i32 5
258 %v6 = insertelement <8 x i16> %v5, i16 %x6, i32 6
259 %v7 = insertelement <8 x i16> %v6, i16 %x7, i32 7
; Input: eight consecutive i8 loads starting at %p0, each zero-extended to i32
; and inserted into lanes 0..7 of an <8 x i32>.
; Expected output (shared CHECK prefix, i.e. every RUN configuration): one
; <8 x i8> load and one vector zext to <8 x i32>, with the lanes extracted
; again to feed the original insertelement chain.
263 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
264 ; CHECK-LABEL: @loadext_8i8_to_8i32(
265 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
266 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
267 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
268 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
269 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
270 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
271 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
272 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
273 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
274 ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
275 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
276 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
277 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
278 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
279 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
280 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
281 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
282 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
283 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
284 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
285 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
286 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
287 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
288 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
289 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
290 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
291 ; CHECK-NEXT: ret <8 x i32> [[V7]]
293 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
294 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
295 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
296 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
297 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
298 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
299 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
300 %i0 = load i8, i8* %p0, align 1
301 %i1 = load i8, i8* %p1, align 1
302 %i2 = load i8, i8* %p2, align 1
303 %i3 = load i8, i8* %p3, align 1
304 %i4 = load i8, i8* %p4, align 1
305 %i5 = load i8, i8* %p5, align 1
306 %i6 = load i8, i8* %p6, align 1
307 %i7 = load i8, i8* %p7, align 1
308 %x0 = zext i8 %i0 to i32
309 %x1 = zext i8 %i1 to i32
310 %x2 = zext i8 %i2 to i32
311 %x3 = zext i8 %i3 to i32
312 %x4 = zext i8 %i4 to i32
313 %x5 = zext i8 %i5 to i32
314 %x6 = zext i8 %i6 to i32
315 %x7 = zext i8 %i7 to i32
316 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
317 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
318 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
319 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
320 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
321 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
322 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
323 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
; Input: sixteen consecutive i8 loads starting at %p0, each zero-extended to
; i16 and inserted into lanes 0..15 of a <16 x i16>.
; Expected output (shared CHECK prefix, i.e. every RUN configuration): one
; <16 x i8> load and one vector zext to <16 x i16>, with the lanes extracted
; again to feed the original insertelement chain.
327 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
328 ; CHECK-LABEL: @loadext_16i8_to_16i16(
329 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
330 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
331 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
332 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
333 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
334 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
335 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
336 ; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
337 ; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
338 ; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
339 ; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
340 ; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
341 ; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
342 ; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
343 ; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
344 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
345 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
346 ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
347 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
348 ; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
349 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
350 ; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
351 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
352 ; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
353 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
354 ; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
355 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
356 ; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
357 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
358 ; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
359 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
360 ; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
361 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
362 ; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
363 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
364 ; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
365 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
366 ; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
367 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
368 ; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
369 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
370 ; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
371 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
372 ; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
373 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
374 ; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
375 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
376 ; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
377 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
378 ; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
379 ; CHECK-NEXT: ret <16 x i16> [[V15]]
381 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
382 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
383 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
384 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
385 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
386 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
387 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
388 %p8 = getelementptr inbounds i8, i8* %p0, i64 8
389 %p9 = getelementptr inbounds i8, i8* %p0, i64 9
390 %p10 = getelementptr inbounds i8, i8* %p0, i64 10
391 %p11 = getelementptr inbounds i8, i8* %p0, i64 11
392 %p12 = getelementptr inbounds i8, i8* %p0, i64 12
393 %p13 = getelementptr inbounds i8, i8* %p0, i64 13
394 %p14 = getelementptr inbounds i8, i8* %p0, i64 14
395 %p15 = getelementptr inbounds i8, i8* %p0, i64 15
396 %i0 = load i8, i8* %p0, align 1
397 %i1 = load i8, i8* %p1, align 1
398 %i2 = load i8, i8* %p2, align 1
399 %i3 = load i8, i8* %p3, align 1
400 %i4 = load i8, i8* %p4, align 1
401 %i5 = load i8, i8* %p5, align 1
402 %i6 = load i8, i8* %p6, align 1
403 %i7 = load i8, i8* %p7, align 1
404 %i8 = load i8, i8* %p8, align 1
405 %i9 = load i8, i8* %p9, align 1
406 %i10 = load i8, i8* %p10, align 1
407 %i11 = load i8, i8* %p11, align 1
408 %i12 = load i8, i8* %p12, align 1
409 %i13 = load i8, i8* %p13, align 1
410 %i14 = load i8, i8* %p14, align 1
411 %i15 = load i8, i8* %p15, align 1
412 %x0 = zext i8 %i0 to i16
413 %x1 = zext i8 %i1 to i16
414 %x2 = zext i8 %i2 to i16
415 %x3 = zext i8 %i3 to i16
416 %x4 = zext i8 %i4 to i16
417 %x5 = zext i8 %i5 to i16
418 %x6 = zext i8 %i6 to i16
419 %x7 = zext i8 %i7 to i16
420 %x8 = zext i8 %i8 to i16
421 %x9 = zext i8 %i9 to i16
422 %x10 = zext i8 %i10 to i16
423 %x11 = zext i8 %i11 to i16
424 %x12 = zext i8 %i12 to i16
425 %x13 = zext i8 %i13 to i16
426 %x14 = zext i8 %i14 to i16
427 %x15 = zext i8 %i15 to i16
428 %v0 = insertelement <16 x i16> undef, i16 %x0, i32 0
429 %v1 = insertelement <16 x i16> %v0, i16 %x1, i32 1
430 %v2 = insertelement <16 x i16> %v1, i16 %x2, i32 2
431 %v3 = insertelement <16 x i16> %v2, i16 %x3, i32 3
432 %v4 = insertelement <16 x i16> %v3, i16 %x4, i32 4
433 %v5 = insertelement <16 x i16> %v4, i16 %x5, i32 5
434 %v6 = insertelement <16 x i16> %v5, i16 %x6, i32 6
435 %v7 = insertelement <16 x i16> %v6, i16 %x7, i32 7
436 %v8 = insertelement <16 x i16> %v7, i16 %x8, i32 8
437 %v9 = insertelement <16 x i16> %v8, i16 %x9, i32 9
438 %v10 = insertelement <16 x i16> %v9, i16 %x10, i32 10
439 %v11 = insertelement <16 x i16> %v10, i16 %x11, i32 11
440 %v12 = insertelement <16 x i16> %v11, i16 %x12, i32 12
441 %v13 = insertelement <16 x i16> %v12, i16 %x13, i32 13
442 %v14 = insertelement <16 x i16> %v13, i16 %x14, i32 14
443 %v15 = insertelement <16 x i16> %v14, i16 %x15, i32 15
; Input: two consecutive i16 loads starting at %p0 (both marked align 1), each
; zero-extended to i64 and inserted into lanes 0 and 1 of a <2 x i64>.
; Expected output: under the SSE2 prefix the scalar loads and zexts are left
; as they are; under the SLM and AVX prefixes the loads are merged into one
; <2 x i16> load (still align 1) followed by a vector zext to <2 x i64>.
451 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
452 ; SSE2-LABEL: @loadext_2i16_to_2i64(
453 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
454 ; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
455 ; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
456 ; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
457 ; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
458 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
459 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
460 ; SSE2-NEXT: ret <2 x i64> [[V1]]
462 ; SLM-LABEL: @loadext_2i16_to_2i64(
463 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
464 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
465 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
466 ; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
467 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
468 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
469 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
470 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
471 ; SLM-NEXT: ret <2 x i64> [[V1]]
473 ; AVX-LABEL: @loadext_2i16_to_2i64(
474 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
475 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
476 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
477 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
478 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
479 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
480 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
481 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
482 ; AVX-NEXT: ret <2 x i64> [[V1]]
484 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
485 %i0 = load i16, i16* %p0, align 1
486 %i1 = load i16, i16* %p1, align 1
487 %x0 = zext i16 %i0 to i64
488 %x1 = zext i16 %i1 to i64
489 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
490 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Input: four consecutive i16 loads starting at %p0 (all marked align 1), each
; zero-extended to i32 and inserted into lanes 0..3 of a <4 x i32>.
; Expected output (shared CHECK prefix, i.e. every RUN configuration): one
; <4 x i16> load (still align 1) and one vector zext to <4 x i32>, with the
; lanes extracted again to feed the original insertelement chain.
494 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
495 ; CHECK-LABEL: @loadext_4i16_to_4i32(
496 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
497 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
498 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
499 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
500 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
501 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
502 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
503 ; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
504 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
505 ; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
506 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
507 ; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
508 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
509 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
510 ; CHECK-NEXT: ret <4 x i32> [[V3]]
512 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
513 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
514 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
515 %i0 = load i16, i16* %p0, align 1
516 %i1 = load i16, i16* %p1, align 1
517 %i2 = load i16, i16* %p2, align 1
518 %i3 = load i16, i16* %p3, align 1
519 %x0 = zext i16 %i0 to i32
520 %x1 = zext i16 %i1 to i32
521 %x2 = zext i16 %i2 to i32
522 %x3 = zext i16 %i3 to i32
523 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
524 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
525 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
526 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
530 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
531 ; SSE2-LABEL: @loadext_4i16_to_4i64(
532 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
533 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
534 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
535 ; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
536 ; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
537 ; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
538 ; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
539 ; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
540 ; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
541 ; SSE2-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
542 ; SSE2-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
543 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
544 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
545 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
546 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
547 ; SSE2-NEXT: ret <4 x i64> [[V3]]
549 ; SLM-LABEL: @loadext_4i16_to_4i64(
550 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
551 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
552 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
553 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
554 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
555 ; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
556 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
557 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
558 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
559 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
560 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
561 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
562 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
563 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
564 ; SLM-NEXT: ret <4 x i64> [[V3]]
566 ; AVX1-LABEL: @loadext_4i16_to_4i64(
567 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
568 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
569 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
570 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
571 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
572 ; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
573 ; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
574 ; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
575 ; AVX1-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
576 ; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
577 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
578 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
579 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
580 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
581 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
582 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
583 ; AVX1-NEXT: ret <4 x i64> [[V3]]
585 ; AVX2-LABEL: @loadext_4i16_to_4i64(
586 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
587 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
588 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
589 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
590 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
591 ; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
592 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
593 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
594 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
595 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
596 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
597 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
598 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
599 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
600 ; AVX2-NEXT: ret <4 x i64> [[V3]]
602 ; AVX512-LABEL: @loadext_4i16_to_4i64(
603 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
604 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
605 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
606 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
607 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
608 ; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
609 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
610 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
611 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
612 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
613 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
614 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
615 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
616 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
617 ; AVX512-NEXT: ret <4 x i64> [[V3]]
619 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
620 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
621 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
622 %i0 = load i16, i16* %p0, align 1
623 %i1 = load i16, i16* %p1, align 1
624 %i2 = load i16, i16* %p2, align 1
625 %i3 = load i16, i16* %p3, align 1
626 %x0 = zext i16 %i0 to i64
627 %x1 = zext i16 %i1 to i64
628 %x2 = zext i16 %i2 to i64
629 %x3 = zext i16 %i3 to i64
630 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
631 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
632 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
633 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
; Test that eight adjacent i16 loads (align 1) which are each zero-extended to
; i32 and inserted into an <8 x i32> get vectorized. The assertions below use
; the shared CHECK prefix, so the same output is expected from every run line
; in this file - a single <8 x i16> load, one vector zext, and then
; extractelement/insertelement pairs that rebuild the result vector.
637 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
638 ; CHECK-LABEL: @loadext_8i16_to_8i32(
639 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
640 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
641 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
642 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
643 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
644 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
645 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
646 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
647 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
648 ; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
649 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
650 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
651 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
652 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
653 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
654 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
655 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
656 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
657 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
658 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
659 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
660 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
661 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
662 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
663 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
664 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
665 ; CHECK-NEXT: ret <8 x i32> [[V7]]
; Scalar input IR - element addresses p0+0 .. p0+7, one load and one zext per
; element, then an insertelement chain that fills lanes 0 through 7 in order.
667 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
668 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
669 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
670 %p4 = getelementptr inbounds i16, i16* %p0, i64 4
671 %p5 = getelementptr inbounds i16, i16* %p0, i64 5
672 %p6 = getelementptr inbounds i16, i16* %p0, i64 6
673 %p7 = getelementptr inbounds i16, i16* %p0, i64 7
674 %i0 = load i16, i16* %p0, align 1
675 %i1 = load i16, i16* %p1, align 1
676 %i2 = load i16, i16* %p2, align 1
677 %i3 = load i16, i16* %p3, align 1
678 %i4 = load i16, i16* %p4, align 1
679 %i5 = load i16, i16* %p5, align 1
680 %i6 = load i16, i16* %p6, align 1
681 %i7 = load i16, i16* %p7, align 1
682 %x0 = zext i16 %i0 to i32
683 %x1 = zext i16 %i1 to i32
684 %x2 = zext i16 %i2 to i32
685 %x3 = zext i16 %i3 to i32
686 %x4 = zext i16 %i4 to i32
687 %x5 = zext i16 %i5 to i32
688 %x6 = zext i16 %i6 to i32
689 %x7 = zext i16 %i7 to i32
690 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
691 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
692 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
693 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
694 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
695 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
696 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
697 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
; Test two adjacent i32 loads (align 1) that are each zero-extended to i64 and
; inserted into a <2 x i64>. Expected output differs per run line -
;  * the SSE2 run keeps the scalar loads and scalar zexts unchanged,
;  * the SLM run and all runs sharing the AVX prefix use a single <2 x i32>
;    load and one vector zext, followed by extractelement/insertelement pairs
;    that rebuild the result vector.
705 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
706 ; SSE2-LABEL: @loadext_2i32_to_2i64(
707 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
708 ; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
709 ; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
710 ; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
711 ; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
712 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
713 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
714 ; SSE2-NEXT: ret <2 x i64> [[V1]]
716 ; SLM-LABEL: @loadext_2i32_to_2i64(
717 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
718 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
719 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
720 ; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
721 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
722 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
723 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
724 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
725 ; SLM-NEXT: ret <2 x i64> [[V1]]
727 ; AVX-LABEL: @loadext_2i32_to_2i64(
728 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
729 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
730 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
731 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
732 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
733 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
734 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
735 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
736 ; AVX-NEXT: ret <2 x i64> [[V1]]
; Scalar input IR - element addresses p0+0 and p0+1, one load and one zext per
; element, then an insertelement chain that fills lanes 0 and 1 in order.
738 %p1 = getelementptr inbounds i32, i32* %p0, i64 1
739 %i0 = load i32, i32* %p0, align 1
740 %i1 = load i32, i32* %p1, align 1
741 %x0 = zext i32 %i0 to i64
742 %x1 = zext i32 %i1 to i64
743 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
744 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
748 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
749 ; SSE2-LABEL: @loadext_4i32_to_4i64(
750 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
751 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
752 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
753 ; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
754 ; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
755 ; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
756 ; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
757 ; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
758 ; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
759 ; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
760 ; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
761 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
762 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
763 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
764 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
765 ; SSE2-NEXT: ret <4 x i64> [[V3]]
767 ; SLM-LABEL: @loadext_4i32_to_4i64(
768 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
769 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
770 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
771 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
772 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
773 ; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
774 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
775 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
776 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
777 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
778 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
779 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
780 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
781 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
782 ; SLM-NEXT: ret <4 x i64> [[V3]]
784 ; AVX1-LABEL: @loadext_4i32_to_4i64(
785 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
786 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
787 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
788 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
789 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
790 ; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
791 ; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
792 ; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
793 ; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
794 ; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
795 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
796 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
797 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
798 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
799 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
800 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
801 ; AVX1-NEXT: ret <4 x i64> [[V3]]
803 ; AVX2-LABEL: @loadext_4i32_to_4i64(
804 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
805 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
806 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
807 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
808 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
809 ; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
810 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
811 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
812 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
813 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
814 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
815 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
816 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
817 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
818 ; AVX2-NEXT: ret <4 x i64> [[V3]]
820 ; AVX512-LABEL: @loadext_4i32_to_4i64(
821 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
822 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
823 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
824 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
825 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
826 ; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
827 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
828 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
829 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
830 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
831 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
832 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
833 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
834 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
835 ; AVX512-NEXT: ret <4 x i64> [[V3]]
837 %p1 = getelementptr inbounds i32, i32* %p0, i64 1
838 %p2 = getelementptr inbounds i32, i32* %p0, i64 2
839 %p3 = getelementptr inbounds i32, i32* %p0, i64 3
840 %i0 = load i32, i32* %p0, align 1
841 %i1 = load i32, i32* %p1, align 1
842 %i2 = load i32, i32* %p2, align 1
843 %i3 = load i32, i32* %p3, align 1
844 %x0 = zext i32 %i0 to i64
845 %x1 = zext i32 %i1 to i64
846 %x2 = zext i32 %i2 to i64
847 %x3 = zext i32 %i3 to i64
848 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
849 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
850 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
851 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3