1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
7 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
; Two adjacent i8 loads (%p0 and %p0+1), each sign-extended to i64 and
; inserted into lanes 0 and 1 of a <2 x i64> that starts from undef.
; Expected results per run:
;   - SSE2 run (no -mcpu): the scalar loads and sexts are left as they are.
;   - SLM run and all AVX-family runs: one <2 x i8> load plus one vector
;     sext to <2 x i64>; lanes are pulled back out with extractelement to
;     feed the original insertelement chain.
; The getelementptr line is still expected in the vectorized output.
13 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
14 ; SSE2-LABEL: @loadext_2i8_to_2i64(
15 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
16 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
17 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
18 ; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
19 ; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
20 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
21 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
22 ; SSE2-NEXT: ret <2 x i64> [[V1]]
24 ; SLM-LABEL: @loadext_2i8_to_2i64(
25 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
26 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
27 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
28 ; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
29 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
30 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
31 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
32 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
33 ; SLM-NEXT: ret <2 x i64> [[V1]]
35 ; AVX-LABEL: @loadext_2i8_to_2i64(
36 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
37 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
38 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
39 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
40 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
41 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
42 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
43 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
44 ; AVX-NEXT: ret <2 x i64> [[V1]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
46 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
47 %i0 = load i8, i8* %p0, align 1
48 %i1 = load i8, i8* %p1, align 1
49 %x0 = sext i8 %i0 to i64
50 %x1 = sext i8 %i1 to i64
51 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
52 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Four adjacent i8 loads (%p0 .. %p0+3), each sign-extended to i32 and
; inserted into lanes 0..3 of a <4 x i32> that starts from undef.
; Expected results per run:
;   - SSE2 run (no -mcpu): everything stays scalar.
;   - SLM run and all AVX-family runs: one <4 x i8> load plus one vector
;     sext to <4 x i32>, followed by extractelement/insertelement pairs that
;     rebuild the result lane by lane.
; The getelementptr lines are still expected in the vectorized output.
56 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
57 ; SSE2-LABEL: @loadext_4i8_to_4i32(
58 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
59 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
60 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
61 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
62 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
63 ; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
64 ; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
65 ; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32
66 ; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32
67 ; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32
68 ; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32
69 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
70 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
71 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
72 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
73 ; SSE2-NEXT: ret <4 x i32> [[V3]]
75 ; SLM-LABEL: @loadext_4i8_to_4i32(
76 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
77 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
78 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
79 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
80 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
81 ; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
82 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
83 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
84 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
85 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
86 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
87 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
88 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
89 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
90 ; SLM-NEXT: ret <4 x i32> [[V3]]
92 ; AVX-LABEL: @loadext_4i8_to_4i32(
93 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
94 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
95 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
96 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
97 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
98 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
99 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
100 ; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
101 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
102 ; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
103 ; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
104 ; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
105 ; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
106 ; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
107 ; AVX-NEXT: ret <4 x i32> [[V3]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
109 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
110 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
111 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
112 %i0 = load i8, i8* %p0, align 1
113 %i1 = load i8, i8* %p1, align 1
114 %i2 = load i8, i8* %p2, align 1
115 %i3 = load i8, i8* %p3, align 1
116 %x0 = sext i8 %i0 to i32
117 %x1 = sext i8 %i1 to i32
118 %x2 = sext i8 %i2 to i32
119 %x3 = sext i8 %i3 to i32
120 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
121 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
122 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
123 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
; Four adjacent i8 loads (%p0 .. %p0+3), each sign-extended to i64 and
; inserted into lanes 0..3 of a <4 x i64> that starts from undef.
; This is the only case in this group where the AVX runs disagree, so the
; checks use the per-CPU prefixes instead of the shared AVX one.
; Expected results per run:
;   - SSE2 run (no -mcpu): everything stays scalar.
;   - SLM run: one <4 x i8> load plus one vector sext to <4 x i64>.
;   - AVX1 run (corei7-avx): only elements 0 and 1 are vectorized, as a
;     <2 x i8> load and a sext to <2 x i64>; elements 2 and 3 keep their
;     scalar load and sext.
;   - AVX2 run (core-avx2) and the AVX512 runs (knl, skx): one <4 x i8> load
;     plus one vector sext to <4 x i64>.
; The getelementptr lines are still expected in the vectorized output.
127 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
128 ; SSE2-LABEL: @loadext_4i8_to_4i64(
129 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
130 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
131 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
132 ; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
133 ; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
134 ; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
135 ; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
136 ; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
137 ; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
138 ; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
139 ; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
140 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
141 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
142 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
143 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
144 ; SSE2-NEXT: ret <4 x i64> [[V3]]
146 ; SLM-LABEL: @loadext_4i8_to_4i64(
147 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
148 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
149 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
150 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
151 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
152 ; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
153 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
154 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
155 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
156 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
157 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
158 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
159 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
160 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
161 ; SLM-NEXT: ret <4 x i64> [[V3]]
163 ; AVX1-LABEL: @loadext_4i8_to_4i64(
164 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
165 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
166 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
167 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
168 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
169 ; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
170 ; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
171 ; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
172 ; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
173 ; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
174 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
175 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
176 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
177 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
178 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
179 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
180 ; AVX1-NEXT: ret <4 x i64> [[V3]]
182 ; AVX2-LABEL: @loadext_4i8_to_4i64(
183 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
184 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
185 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
186 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
187 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
188 ; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
189 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
190 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
191 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
192 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
193 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
194 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
195 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
196 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
197 ; AVX2-NEXT: ret <4 x i64> [[V3]]
199 ; AVX512-LABEL: @loadext_4i8_to_4i64(
200 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
201 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
202 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
203 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
204 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
205 ; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
206 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
207 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
208 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
209 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
210 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
211 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
212 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
213 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
214 ; AVX512-NEXT: ret <4 x i64> [[V3]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
216 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
217 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
218 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
219 %i0 = load i8, i8* %p0, align 1
220 %i1 = load i8, i8* %p1, align 1
221 %i2 = load i8, i8* %p2, align 1
222 %i3 = load i8, i8* %p3, align 1
223 %x0 = sext i8 %i0 to i64
224 %x1 = sext i8 %i1 to i64
225 %x2 = sext i8 %i2 to i64
226 %x3 = sext i8 %i3 to i64
227 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
228 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
229 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
230 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
; Eight adjacent i8 loads (%p0 .. %p0+7), each sign-extended to i16 and
; inserted into lanes 0..7 of an <8 x i16> that starts from undef.
; All six RUN configurations share one set of checks here: one <8 x i8>
; load plus one vector sext to <8 x i16>, followed by
; extractelement/insertelement pairs that rebuild the result lane by lane.
; The getelementptr lines are still expected in the vectorized output.
234 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
235 ; CHECK-LABEL: @loadext_8i8_to_8i16(
236 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
237 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
238 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
239 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
240 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
241 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
242 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
243 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
244 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
245 ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
246 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
247 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
248 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
249 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
250 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
251 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
252 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
253 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
254 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
255 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
256 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
257 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
258 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
259 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
260 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
261 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
262 ; CHECK-NEXT: ret <8 x i16> [[V7]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
264 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
265 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
266 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
267 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
268 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
269 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
270 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
271 %i0 = load i8, i8* %p0, align 1
272 %i1 = load i8, i8* %p1, align 1
273 %i2 = load i8, i8* %p2, align 1
274 %i3 = load i8, i8* %p3, align 1
275 %i4 = load i8, i8* %p4, align 1
276 %i5 = load i8, i8* %p5, align 1
277 %i6 = load i8, i8* %p6, align 1
278 %i7 = load i8, i8* %p7, align 1
279 %x0 = sext i8 %i0 to i16
280 %x1 = sext i8 %i1 to i16
281 %x2 = sext i8 %i2 to i16
282 %x3 = sext i8 %i3 to i16
283 %x4 = sext i8 %i4 to i16
284 %x5 = sext i8 %i5 to i16
285 %x6 = sext i8 %i6 to i16
286 %x7 = sext i8 %i7 to i16
287 %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
288 %v1 = insertelement <8 x i16> %v0, i16 %x1, i32 1
289 %v2 = insertelement <8 x i16> %v1, i16 %x2, i32 2
290 %v3 = insertelement <8 x i16> %v2, i16 %x3, i32 3
291 %v4 = insertelement <8 x i16> %v3, i16 %x4, i32 4
292 %v5 = insertelement <8 x i16> %v4, i16 %x5, i32 5
293 %v6 = insertelement <8 x i16> %v5, i16 %x6, i32 6
294 %v7 = insertelement <8 x i16> %v6, i16 %x7, i32 7
; Eight adjacent i8 loads (%p0 .. %p0+7), each sign-extended to i32 and
; inserted into lanes 0..7 of an <8 x i32> that starts from undef.
; All six RUN configurations share one set of checks here: one <8 x i8>
; load plus one vector sext to <8 x i32>, followed by
; extractelement/insertelement pairs that rebuild the result lane by lane.
; The getelementptr lines are still expected in the vectorized output.
298 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
299 ; CHECK-LABEL: @loadext_8i8_to_8i32(
300 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
301 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
302 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
303 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
304 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
305 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
306 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
307 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
308 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
309 ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
310 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
311 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
312 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
313 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
314 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
315 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
316 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
317 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
318 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
319 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
320 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
321 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
322 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
323 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
324 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
325 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
326 ; CHECK-NEXT: ret <8 x i32> [[V7]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
328 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
329 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
330 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
331 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
332 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
333 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
334 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
335 %i0 = load i8, i8* %p0, align 1
336 %i1 = load i8, i8* %p1, align 1
337 %i2 = load i8, i8* %p2, align 1
338 %i3 = load i8, i8* %p3, align 1
339 %i4 = load i8, i8* %p4, align 1
340 %i5 = load i8, i8* %p5, align 1
341 %i6 = load i8, i8* %p6, align 1
342 %i7 = load i8, i8* %p7, align 1
343 %x0 = sext i8 %i0 to i32
344 %x1 = sext i8 %i1 to i32
345 %x2 = sext i8 %i2 to i32
346 %x3 = sext i8 %i3 to i32
347 %x4 = sext i8 %i4 to i32
348 %x5 = sext i8 %i5 to i32
349 %x6 = sext i8 %i6 to i32
350 %x7 = sext i8 %i7 to i32
351 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
352 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
353 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
354 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
355 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
356 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
357 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
358 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
; Sixteen adjacent i8 loads (%p0 .. %p0+15), each sign-extended to i16 and
; inserted into lanes 0..15 of a <16 x i16> that starts from undef.
; All six RUN configurations share one set of checks here: one <16 x i8>
; load plus one vector sext to <16 x i16>, followed by
; extractelement/insertelement pairs that rebuild the result lane by lane.
; The getelementptr lines are still expected in the vectorized output.
362 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
363 ; CHECK-LABEL: @loadext_16i8_to_16i16(
364 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
365 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
366 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
367 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
368 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
369 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
370 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
371 ; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
372 ; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
373 ; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
374 ; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
375 ; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
376 ; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
377 ; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
378 ; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
379 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
380 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
381 ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
382 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
383 ; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
384 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
385 ; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
386 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
387 ; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
388 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
389 ; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
390 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
391 ; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
392 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
393 ; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
394 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
395 ; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
396 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
397 ; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
398 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
399 ; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
400 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
401 ; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
402 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
403 ; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
404 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
405 ; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
406 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
407 ; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
408 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
409 ; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
410 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
411 ; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
412 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
413 ; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
414 ; CHECK-NEXT: ret <16 x i16> [[V15]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
416 %p1 = getelementptr inbounds i8, i8* %p0, i64 1
417 %p2 = getelementptr inbounds i8, i8* %p0, i64 2
418 %p3 = getelementptr inbounds i8, i8* %p0, i64 3
419 %p4 = getelementptr inbounds i8, i8* %p0, i64 4
420 %p5 = getelementptr inbounds i8, i8* %p0, i64 5
421 %p6 = getelementptr inbounds i8, i8* %p0, i64 6
422 %p7 = getelementptr inbounds i8, i8* %p0, i64 7
423 %p8 = getelementptr inbounds i8, i8* %p0, i64 8
424 %p9 = getelementptr inbounds i8, i8* %p0, i64 9
425 %p10 = getelementptr inbounds i8, i8* %p0, i64 10
426 %p11 = getelementptr inbounds i8, i8* %p0, i64 11
427 %p12 = getelementptr inbounds i8, i8* %p0, i64 12
428 %p13 = getelementptr inbounds i8, i8* %p0, i64 13
429 %p14 = getelementptr inbounds i8, i8* %p0, i64 14
430 %p15 = getelementptr inbounds i8, i8* %p0, i64 15
431 %i0 = load i8, i8* %p0, align 1
432 %i1 = load i8, i8* %p1, align 1
433 %i2 = load i8, i8* %p2, align 1
434 %i3 = load i8, i8* %p3, align 1
435 %i4 = load i8, i8* %p4, align 1
436 %i5 = load i8, i8* %p5, align 1
437 %i6 = load i8, i8* %p6, align 1
438 %i7 = load i8, i8* %p7, align 1
439 %i8 = load i8, i8* %p8, align 1
440 %i9 = load i8, i8* %p9, align 1
441 %i10 = load i8, i8* %p10, align 1
442 %i11 = load i8, i8* %p11, align 1
443 %i12 = load i8, i8* %p12, align 1
444 %i13 = load i8, i8* %p13, align 1
445 %i14 = load i8, i8* %p14, align 1
446 %i15 = load i8, i8* %p15, align 1
447 %x0 = sext i8 %i0 to i16
448 %x1 = sext i8 %i1 to i16
449 %x2 = sext i8 %i2 to i16
450 %x3 = sext i8 %i3 to i16
451 %x4 = sext i8 %i4 to i16
452 %x5 = sext i8 %i5 to i16
453 %x6 = sext i8 %i6 to i16
454 %x7 = sext i8 %i7 to i16
455 %x8 = sext i8 %i8 to i16
456 %x9 = sext i8 %i9 to i16
457 %x10 = sext i8 %i10 to i16
458 %x11 = sext i8 %i11 to i16
459 %x12 = sext i8 %i12 to i16
460 %x13 = sext i8 %i13 to i16
461 %x14 = sext i8 %i14 to i16
462 %x15 = sext i8 %i15 to i16
463 %v0 = insertelement <16 x i16> undef, i16 %x0, i32 0
464 %v1 = insertelement <16 x i16> %v0, i16 %x1, i32 1
465 %v2 = insertelement <16 x i16> %v1, i16 %x2, i32 2
466 %v3 = insertelement <16 x i16> %v2, i16 %x3, i32 3
467 %v4 = insertelement <16 x i16> %v3, i16 %x4, i32 4
468 %v5 = insertelement <16 x i16> %v4, i16 %x5, i32 5
469 %v6 = insertelement <16 x i16> %v5, i16 %x6, i32 6
470 %v7 = insertelement <16 x i16> %v6, i16 %x7, i32 7
471 %v8 = insertelement <16 x i16> %v7, i16 %x8, i32 8
472 %v9 = insertelement <16 x i16> %v8, i16 %x9, i32 9
473 %v10 = insertelement <16 x i16> %v9, i16 %x10, i32 10
474 %v11 = insertelement <16 x i16> %v10, i16 %x11, i32 11
475 %v12 = insertelement <16 x i16> %v11, i16 %x12, i32 12
476 %v13 = insertelement <16 x i16> %v12, i16 %x13, i32 13
477 %v14 = insertelement <16 x i16> %v13, i16 %x14, i32 14
478 %v15 = insertelement <16 x i16> %v14, i16 %x15, i32 15
; Two adjacent i16 loads (%p0 and %p0+1 element), each sign-extended to i64
; and inserted into lanes 0 and 1 of a <2 x i64> that starts from undef.
; The i16 loads are deliberately given "align 1", and the expected vector
; load carries the same alignment.
; Expected results per run:
;   - SSE2 run (no -mcpu): the scalar loads and sexts are left as they are.
;   - SLM run and all AVX-family runs: one <2 x i16> load plus one vector
;     sext to <2 x i64>; lanes are pulled back out with extractelement to
;     feed the original insertelement chain.
; The getelementptr line is still expected in the vectorized output.
486 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
487 ; SSE2-LABEL: @loadext_2i16_to_2i64(
488 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
489 ; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
490 ; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
491 ; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
492 ; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
493 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
494 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
495 ; SSE2-NEXT: ret <2 x i64> [[V1]]
497 ; SLM-LABEL: @loadext_2i16_to_2i64(
498 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
499 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
500 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
501 ; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
502 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
503 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
504 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
505 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
506 ; SLM-NEXT: ret <2 x i64> [[V1]]
508 ; AVX-LABEL: @loadext_2i16_to_2i64(
509 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
510 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
511 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
512 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
513 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
514 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
515 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
516 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
517 ; AVX-NEXT: ret <2 x i64> [[V1]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
519 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
520 %i0 = load i16, i16* %p0, align 1
521 %i1 = load i16, i16* %p1, align 1
522 %x0 = sext i16 %i0 to i64
523 %x1 = sext i16 %i1 to i64
524 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
525 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Four adjacent i16 loads (%p0 .. %p0+3 elements), each sign-extended to i32
; and inserted into lanes 0..3 of a <4 x i32> that starts from undef.
; The i16 loads use "align 1", and the expected vector load keeps it.
; All six RUN configurations share one set of checks here: one <4 x i16>
; load plus one vector sext to <4 x i32>, followed by
; extractelement/insertelement pairs that rebuild the result lane by lane.
; The getelementptr lines are still expected in the vectorized output.
529 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
530 ; CHECK-LABEL: @loadext_4i16_to_4i32(
531 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
532 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
533 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
534 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
535 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
536 ; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
537 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
538 ; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
539 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
540 ; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
541 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
542 ; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
543 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
544 ; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
545 ; CHECK-NEXT: ret <4 x i32> [[V3]]
; Scalar input IR that the RUN lines feed to the SLP vectorizer.
547 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
548 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
549 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
550 %i0 = load i16, i16* %p0, align 1
551 %i1 = load i16, i16* %p1, align 1
552 %i2 = load i16, i16* %p2, align 1
553 %i3 = load i16, i16* %p3, align 1
554 %x0 = sext i16 %i0 to i32
555 %x1 = sext i16 %i1 to i32
556 %x2 = sext i16 %i2 to i32
557 %x3 = sext i16 %i3 to i32
558 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
559 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
560 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
561 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
; Test loadext_4i16_to_4i64: four consecutive i16 elements starting at %p0 are
; loaded (align 1), each one is sign-extended to i64, and the results are
; inserted into lanes 0-3 of a <4 x i64>.
; Expected SLP vectorizer output per check prefix (prefixes are bound to
; targets by the opt invocations at the top of the file):
;   SSE2 prefix (no -mcpu)           - scalar loads and sexts are expected unchanged.
;   SLM, AVX2 and AVX512 prefixes    - one <4 x i16> load feeding one <4 x i16> to <4 x i64> sext.
;   AVX1 prefix (-mcpu=corei7-avx)   - only lanes 0-1 become a <2 x i16> load + sext;
;                                      lanes 2-3 keep their scalar load/sext.
565 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
566 ; SSE2-LABEL: @loadext_4i16_to_4i64(
567 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
568 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
569 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
570 ; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
571 ; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
572 ; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
573 ; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
574 ; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
575 ; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
576 ; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
577 ; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
578 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
579 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
580 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
581 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
582 ; SSE2-NEXT: ret <4 x i64> [[V3]]
584 ; SLM-LABEL: @loadext_4i16_to_4i64(
585 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
586 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
587 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
588 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
589 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
590 ; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
591 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
592 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
593 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
594 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
595 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
596 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
597 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
598 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
599 ; SLM-NEXT: ret <4 x i64> [[V3]]
601 ; AVX1-LABEL: @loadext_4i16_to_4i64(
602 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
603 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
604 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
605 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
606 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
607 ; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
608 ; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
609 ; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
610 ; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
611 ; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
612 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
613 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
614 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
615 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
616 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
617 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
618 ; AVX1-NEXT: ret <4 x i64> [[V3]]
620 ; AVX2-LABEL: @loadext_4i16_to_4i64(
621 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
622 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
623 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
624 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
625 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
626 ; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
627 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
628 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
629 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
630 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
631 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
632 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
633 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
634 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
635 ; AVX2-NEXT: ret <4 x i64> [[V3]]
637 ; AVX512-LABEL: @loadext_4i16_to_4i64(
638 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
639 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
640 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
641 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
642 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
643 ; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
644 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
645 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
646 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
647 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
648 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
649 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
650 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
651 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
652 ; AVX512-NEXT: ret <4 x i64> [[V3]]
; Scalar input IR fed to the vectorizer: address computation, four byte-aligned
; i16 loads, four i16 to i64 sign extensions, then the <4 x i64> build sequence.
654 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
655 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
656 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
657 %i0 = load i16, i16* %p0, align 1
658 %i1 = load i16, i16* %p1, align 1
659 %i2 = load i16, i16* %p2, align 1
660 %i3 = load i16, i16* %p3, align 1
661 %x0 = sext i16 %i0 to i64
662 %x1 = sext i16 %i1 to i64
663 %x2 = sext i16 %i2 to i64
664 %x3 = sext i16 %i3 to i64
665 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
666 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
667 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
668 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
; Test loadext_8i16_to_8i32: eight consecutive i16 elements starting at %p0 are
; loaded (align 1), each one is sign-extended to i32, and the results are
; inserted into lanes 0-7 of an <8 x i32>.
; The assertions use the prefix shared by every opt invocation in this file, so
; all tested targets are expected to produce the same output: one <8 x i16>
; load, one <8 x i16> to <8 x i32> sext, and an extractelement/insertelement
; pair per lane to rebuild the returned vector.
672 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
673 ; CHECK-LABEL: @loadext_8i16_to_8i32(
674 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
675 ; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
676 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
677 ; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
678 ; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
679 ; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
680 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
681 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
682 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
683 ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
684 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
685 ; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
686 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
687 ; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
688 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
689 ; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
690 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
691 ; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
692 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
693 ; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
694 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
695 ; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
696 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
697 ; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
698 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
699 ; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
700 ; CHECK-NEXT: ret <8 x i32> [[V7]]
; Scalar input IR fed to the vectorizer: address computation, eight byte-aligned
; i16 loads, eight i16 to i32 sign extensions, then the <8 x i32> build sequence.
702 %p1 = getelementptr inbounds i16, i16* %p0, i64 1
703 %p2 = getelementptr inbounds i16, i16* %p0, i64 2
704 %p3 = getelementptr inbounds i16, i16* %p0, i64 3
705 %p4 = getelementptr inbounds i16, i16* %p0, i64 4
706 %p5 = getelementptr inbounds i16, i16* %p0, i64 5
707 %p6 = getelementptr inbounds i16, i16* %p0, i64 6
708 %p7 = getelementptr inbounds i16, i16* %p0, i64 7
709 %i0 = load i16, i16* %p0, align 1
710 %i1 = load i16, i16* %p1, align 1
711 %i2 = load i16, i16* %p2, align 1
712 %i3 = load i16, i16* %p3, align 1
713 %i4 = load i16, i16* %p4, align 1
714 %i5 = load i16, i16* %p5, align 1
715 %i6 = load i16, i16* %p6, align 1
716 %i7 = load i16, i16* %p7, align 1
717 %x0 = sext i16 %i0 to i32
718 %x1 = sext i16 %i1 to i32
719 %x2 = sext i16 %i2 to i32
720 %x3 = sext i16 %i3 to i32
721 %x4 = sext i16 %i4 to i32
722 %x5 = sext i16 %i5 to i32
723 %x6 = sext i16 %i6 to i32
724 %x7 = sext i16 %i7 to i32
725 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
726 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
727 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
728 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
729 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
730 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
731 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
732 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
; Test loadext_2i32_to_2i64: two consecutive i32 elements starting at %p0 are
; loaded (align 1), each one is sign-extended to i64, and the results are
; inserted into lanes 0-1 of a <2 x i64>.
; Expected SLP vectorizer output per check prefix (prefixes are bound to
; targets by the opt invocations at the top of the file):
;   SSE2 prefix (no -mcpu)   - scalar loads and sexts are expected unchanged.
;   SLM and AVX prefixes     - one <2 x i32> load feeding one <2 x i32> to <2 x i64> sext.
740 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
741 ; SSE2-LABEL: @loadext_2i32_to_2i64(
742 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
743 ; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
744 ; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
745 ; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
746 ; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
747 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
748 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
749 ; SSE2-NEXT: ret <2 x i64> [[V1]]
751 ; SLM-LABEL: @loadext_2i32_to_2i64(
752 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
753 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
754 ; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
755 ; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
756 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
757 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
758 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
759 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
760 ; SLM-NEXT: ret <2 x i64> [[V1]]
762 ; AVX-LABEL: @loadext_2i32_to_2i64(
763 ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
764 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
765 ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
766 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
767 ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
768 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
769 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
770 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
771 ; AVX-NEXT: ret <2 x i64> [[V1]]
; Scalar input IR fed to the vectorizer: address computation, two byte-aligned
; i32 loads, two i32 to i64 sign extensions, then the <2 x i64> build sequence.
773 %p1 = getelementptr inbounds i32, i32* %p0, i64 1
774 %i0 = load i32, i32* %p0, align 1
775 %i1 = load i32, i32* %p1, align 1
776 %x0 = sext i32 %i0 to i64
777 %x1 = sext i32 %i1 to i64
778 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
779 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
; Test loadext_4i32_to_4i64: four consecutive i32 elements starting at %p0 are
; loaded (align 1), each one is sign-extended to i64, and the results are
; inserted into lanes 0-3 of a <4 x i64>.
; Expected SLP vectorizer output per check prefix (prefixes are bound to
; targets by the opt invocations at the top of the file):
;   SSE2 prefix (no -mcpu)           - scalar loads and sexts are expected unchanged.
;   SLM, AVX2 and AVX512 prefixes    - one <4 x i32> load feeding one <4 x i32> to <4 x i64> sext.
;   AVX1 prefix (-mcpu=corei7-avx)   - only lanes 0-1 become a <2 x i32> load + sext;
;                                      lanes 2-3 keep their scalar load/sext.
783 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
784 ; SSE2-LABEL: @loadext_4i32_to_4i64(
785 ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
786 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
787 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
788 ; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
789 ; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
790 ; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
791 ; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
792 ; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
793 ; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
794 ; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
795 ; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
796 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
797 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
798 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
799 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
800 ; SSE2-NEXT: ret <4 x i64> [[V3]]
802 ; SLM-LABEL: @loadext_4i32_to_4i64(
803 ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
804 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
805 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
806 ; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
807 ; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
808 ; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
809 ; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
810 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
811 ; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
812 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
813 ; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
814 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
815 ; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
816 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
817 ; SLM-NEXT: ret <4 x i64> [[V3]]
819 ; AVX1-LABEL: @loadext_4i32_to_4i64(
820 ; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
821 ; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
822 ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
823 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
824 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
825 ; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
826 ; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
827 ; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
828 ; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
829 ; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
830 ; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
831 ; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
832 ; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
833 ; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
834 ; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
835 ; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
836 ; AVX1-NEXT: ret <4 x i64> [[V3]]
838 ; AVX2-LABEL: @loadext_4i32_to_4i64(
839 ; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
840 ; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
841 ; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
842 ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
843 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
844 ; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
845 ; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
846 ; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
847 ; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
848 ; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
849 ; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
850 ; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
851 ; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
852 ; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
853 ; AVX2-NEXT: ret <4 x i64> [[V3]]
855 ; AVX512-LABEL: @loadext_4i32_to_4i64(
856 ; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
857 ; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
858 ; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
859 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
860 ; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
861 ; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
862 ; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
863 ; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
864 ; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
865 ; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
866 ; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
867 ; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
868 ; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
869 ; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
870 ; AVX512-NEXT: ret <4 x i64> [[V3]]
; Scalar input IR fed to the vectorizer: address computation, four byte-aligned
; i32 loads, four i32 to i64 sign extensions, then the <4 x i64> build sequence.
872 %p1 = getelementptr inbounds i32, i32* %p0, i64 1
873 %p2 = getelementptr inbounds i32, i32* %p0, i64 2
874 %p3 = getelementptr inbounds i32, i32* %p0, i64 3
875 %i0 = load i32, i32* %p0, align 1
876 %i1 = load i32, i32* %p1, align 1
877 %i2 = load i32, i32* %p2, align 1
878 %i3 = load i32, i32* %p3, align 1
879 %x0 = sext i32 %i0 to i64
880 %x1 = sext i32 %i1 to i64
881 %x2 = sext i32 %i2 to i64
882 %x3 = sext i32 %i3 to i64
883 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
884 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
885 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
886 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3