1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
4 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
8 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
14 define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
15 ; SSE-LABEL: load_v1f64_v1i64:
17 ; SSE-NEXT: testq %rdi, %rdi
18 ; SSE-NEXT: jne LBB0_2
19 ; SSE-NEXT: ## %bb.1: ## %cond.load
20 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
21 ; SSE-NEXT: LBB0_2: ## %else
24 ; AVX-LABEL: load_v1f64_v1i64:
26 ; AVX-NEXT: testq %rdi, %rdi
27 ; AVX-NEXT: jne LBB0_2
28 ; AVX-NEXT: ## %bb.1: ## %cond.load
29 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
30 ; AVX-NEXT: LBB0_2: ## %else
32 %mask = icmp eq <1 x i64> %trigger, zeroinitializer
33 %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1> %mask, <1 x double> %dst)
37 define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
38 ; SSE2-LABEL: load_v2f64_v2i64:
40 ; SSE2-NEXT: pxor %xmm2, %xmm2
41 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
42 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
43 ; SSE2-NEXT: pand %xmm2, %xmm0
44 ; SSE2-NEXT: movmskpd %xmm0, %eax
45 ; SSE2-NEXT: testb $1, %al
46 ; SSE2-NEXT: jne LBB1_1
47 ; SSE2-NEXT: ## %bb.2: ## %else
48 ; SSE2-NEXT: testb $2, %al
49 ; SSE2-NEXT: jne LBB1_3
50 ; SSE2-NEXT: LBB1_4: ## %else2
51 ; SSE2-NEXT: movaps %xmm1, %xmm0
53 ; SSE2-NEXT: LBB1_1: ## %cond.load
54 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
55 ; SSE2-NEXT: testb $2, %al
56 ; SSE2-NEXT: je LBB1_4
57 ; SSE2-NEXT: LBB1_3: ## %cond.load1
58 ; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
59 ; SSE2-NEXT: movaps %xmm1, %xmm0
62 ; SSE42-LABEL: load_v2f64_v2i64:
64 ; SSE42-NEXT: pxor %xmm2, %xmm2
65 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
66 ; SSE42-NEXT: movmskpd %xmm2, %eax
67 ; SSE42-NEXT: testb $1, %al
68 ; SSE42-NEXT: jne LBB1_1
69 ; SSE42-NEXT: ## %bb.2: ## %else
70 ; SSE42-NEXT: testb $2, %al
71 ; SSE42-NEXT: jne LBB1_3
72 ; SSE42-NEXT: LBB1_4: ## %else2
73 ; SSE42-NEXT: movaps %xmm1, %xmm0
75 ; SSE42-NEXT: LBB1_1: ## %cond.load
76 ; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
77 ; SSE42-NEXT: testb $2, %al
78 ; SSE42-NEXT: je LBB1_4
79 ; SSE42-NEXT: LBB1_3: ## %cond.load1
80 ; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
81 ; SSE42-NEXT: movaps %xmm1, %xmm0
84 ; AVX1OR2-LABEL: load_v2f64_v2i64:
86 ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
87 ; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
88 ; AVX1OR2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
89 ; AVX1OR2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
92 ; AVX512F-LABEL: load_v2f64_v2i64:
94 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
95 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
96 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
97 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
98 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
99 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
100 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
101 ; AVX512F-NEXT: vzeroupper
104 ; AVX512VL-LABEL: load_v2f64_v2i64:
105 ; AVX512VL: ## %bb.0:
106 ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
107 ; AVX512VL-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
108 ; AVX512VL-NEXT: retq
109 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
110 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
111 ret <2 x double> %res
114 define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
115 ; SSE-LABEL: load_v4f64_v4i32:
117 ; SSE-NEXT: pxor %xmm3, %xmm3
118 ; SSE-NEXT: pcmpeqd %xmm0, %xmm3
119 ; SSE-NEXT: movmskps %xmm3, %eax
120 ; SSE-NEXT: testb $1, %al
121 ; SSE-NEXT: jne LBB2_1
122 ; SSE-NEXT: ## %bb.2: ## %else
123 ; SSE-NEXT: testb $2, %al
124 ; SSE-NEXT: jne LBB2_3
125 ; SSE-NEXT: LBB2_4: ## %else2
126 ; SSE-NEXT: testb $4, %al
127 ; SSE-NEXT: jne LBB2_5
128 ; SSE-NEXT: LBB2_6: ## %else5
129 ; SSE-NEXT: testb $8, %al
130 ; SSE-NEXT: je LBB2_8
131 ; SSE-NEXT: LBB2_7: ## %cond.load7
132 ; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
133 ; SSE-NEXT: LBB2_8: ## %else8
134 ; SSE-NEXT: movaps %xmm1, %xmm0
135 ; SSE-NEXT: movaps %xmm2, %xmm1
137 ; SSE-NEXT: LBB2_1: ## %cond.load
138 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
139 ; SSE-NEXT: testb $2, %al
140 ; SSE-NEXT: je LBB2_4
141 ; SSE-NEXT: LBB2_3: ## %cond.load1
142 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
143 ; SSE-NEXT: testb $4, %al
144 ; SSE-NEXT: je LBB2_6
145 ; SSE-NEXT: LBB2_5: ## %cond.load4
146 ; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
147 ; SSE-NEXT: testb $8, %al
148 ; SSE-NEXT: jne LBB2_7
149 ; SSE-NEXT: jmp LBB2_8
151 ; AVX1-LABEL: load_v4f64_v4i32:
153 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
154 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
155 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
156 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
157 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
158 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
159 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
160 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
163 ; AVX2-LABEL: load_v4f64_v4i32:
165 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
166 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
167 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
168 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
169 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
172 ; AVX512F-LABEL: load_v4f64_v4i32:
174 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
175 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
176 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
177 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
178 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
179 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
180 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
183 ; AVX512VL-LABEL: load_v4f64_v4i32:
184 ; AVX512VL: ## %bb.0:
185 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
186 ; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
187 ; AVX512VL-NEXT: retq
188 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
189 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
190 ret <4 x double> %res
193 define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %addr) {
194 ; SSE-LABEL: load_v4f64_v4i32_zero:
196 ; SSE-NEXT: movdqa %xmm0, %xmm1
197 ; SSE-NEXT: pxor %xmm0, %xmm0
198 ; SSE-NEXT: pcmpeqd %xmm0, %xmm1
199 ; SSE-NEXT: movmskps %xmm1, %eax
200 ; SSE-NEXT: testb $1, %al
201 ; SSE-NEXT: xorps %xmm1, %xmm1
202 ; SSE-NEXT: jne LBB3_1
203 ; SSE-NEXT: ## %bb.2: ## %else
204 ; SSE-NEXT: testb $2, %al
205 ; SSE-NEXT: jne LBB3_3
206 ; SSE-NEXT: LBB3_4: ## %else2
207 ; SSE-NEXT: testb $4, %al
208 ; SSE-NEXT: jne LBB3_5
209 ; SSE-NEXT: LBB3_6: ## %else5
210 ; SSE-NEXT: testb $8, %al
211 ; SSE-NEXT: jne LBB3_7
212 ; SSE-NEXT: LBB3_8: ## %else8
214 ; SSE-NEXT: LBB3_1: ## %cond.load
215 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
216 ; SSE-NEXT: xorps %xmm1, %xmm1
217 ; SSE-NEXT: testb $2, %al
218 ; SSE-NEXT: je LBB3_4
219 ; SSE-NEXT: LBB3_3: ## %cond.load1
220 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
221 ; SSE-NEXT: testb $4, %al
222 ; SSE-NEXT: je LBB3_6
223 ; SSE-NEXT: LBB3_5: ## %cond.load4
224 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
225 ; SSE-NEXT: testb $8, %al
226 ; SSE-NEXT: je LBB3_8
227 ; SSE-NEXT: LBB3_7: ## %cond.load7
228 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
231 ; AVX1-LABEL: load_v4f64_v4i32_zero:
233 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
234 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
235 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
236 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
237 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
238 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
239 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
242 ; AVX2-LABEL: load_v4f64_v4i32_zero:
244 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
245 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
246 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
247 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
250 ; AVX512F-LABEL: load_v4f64_v4i32_zero:
252 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
253 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
254 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
255 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
256 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
257 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
260 ; AVX512VL-LABEL: load_v4f64_v4i32_zero:
261 ; AVX512VL: ## %bb.0:
262 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
263 ; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
264 ; AVX512VL-NEXT: retq
265 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
266 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer)
267 ret <4 x double> %res
270 define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x double> %dst) {
271 ; SSE2-LABEL: load_v4f64_v4i64:
273 ; SSE2-NEXT: pxor %xmm4, %xmm4
274 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
275 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
276 ; SSE2-NEXT: pand %xmm1, %xmm5
277 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
278 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
279 ; SSE2-NEXT: pand %xmm0, %xmm1
280 ; SSE2-NEXT: packssdw %xmm5, %xmm1
281 ; SSE2-NEXT: movmskps %xmm1, %eax
282 ; SSE2-NEXT: testb $1, %al
283 ; SSE2-NEXT: jne LBB4_1
284 ; SSE2-NEXT: ## %bb.2: ## %else
285 ; SSE2-NEXT: testb $2, %al
286 ; SSE2-NEXT: jne LBB4_3
287 ; SSE2-NEXT: LBB4_4: ## %else2
288 ; SSE2-NEXT: testb $4, %al
289 ; SSE2-NEXT: jne LBB4_5
290 ; SSE2-NEXT: LBB4_6: ## %else5
291 ; SSE2-NEXT: testb $8, %al
292 ; SSE2-NEXT: je LBB4_8
293 ; SSE2-NEXT: LBB4_7: ## %cond.load7
294 ; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
295 ; SSE2-NEXT: LBB4_8: ## %else8
296 ; SSE2-NEXT: movaps %xmm2, %xmm0
297 ; SSE2-NEXT: movaps %xmm3, %xmm1
299 ; SSE2-NEXT: LBB4_1: ## %cond.load
300 ; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
301 ; SSE2-NEXT: testb $2, %al
302 ; SSE2-NEXT: je LBB4_4
303 ; SSE2-NEXT: LBB4_3: ## %cond.load1
304 ; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
305 ; SSE2-NEXT: testb $4, %al
306 ; SSE2-NEXT: je LBB4_6
307 ; SSE2-NEXT: LBB4_5: ## %cond.load4
308 ; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
309 ; SSE2-NEXT: testb $8, %al
310 ; SSE2-NEXT: jne LBB4_7
311 ; SSE2-NEXT: jmp LBB4_8
313 ; SSE42-LABEL: load_v4f64_v4i64:
315 ; SSE42-NEXT: pxor %xmm4, %xmm4
316 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
317 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
318 ; SSE42-NEXT: packssdw %xmm1, %xmm0
319 ; SSE42-NEXT: movmskps %xmm0, %eax
320 ; SSE42-NEXT: testb $1, %al
321 ; SSE42-NEXT: jne LBB4_1
322 ; SSE42-NEXT: ## %bb.2: ## %else
323 ; SSE42-NEXT: testb $2, %al
324 ; SSE42-NEXT: jne LBB4_3
325 ; SSE42-NEXT: LBB4_4: ## %else2
326 ; SSE42-NEXT: testb $4, %al
327 ; SSE42-NEXT: jne LBB4_5
328 ; SSE42-NEXT: LBB4_6: ## %else5
329 ; SSE42-NEXT: testb $8, %al
330 ; SSE42-NEXT: je LBB4_8
331 ; SSE42-NEXT: LBB4_7: ## %cond.load7
332 ; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
333 ; SSE42-NEXT: LBB4_8: ## %else8
334 ; SSE42-NEXT: movaps %xmm2, %xmm0
335 ; SSE42-NEXT: movaps %xmm3, %xmm1
337 ; SSE42-NEXT: LBB4_1: ## %cond.load
338 ; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
339 ; SSE42-NEXT: testb $2, %al
340 ; SSE42-NEXT: je LBB4_4
341 ; SSE42-NEXT: LBB4_3: ## %cond.load1
342 ; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
343 ; SSE42-NEXT: testb $4, %al
344 ; SSE42-NEXT: je LBB4_6
345 ; SSE42-NEXT: LBB4_5: ## %cond.load4
346 ; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
347 ; SSE42-NEXT: testb $8, %al
348 ; SSE42-NEXT: jne LBB4_7
349 ; SSE42-NEXT: jmp LBB4_8
351 ; AVX1-LABEL: load_v4f64_v4i64:
353 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
354 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
355 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
356 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
357 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
358 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
359 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
362 ; AVX2-LABEL: load_v4f64_v4i64:
364 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
365 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
366 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
367 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
370 ; AVX512F-LABEL: load_v4f64_v4i64:
372 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
373 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
374 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
375 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
376 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
377 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
378 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
381 ; AVX512VL-LABEL: load_v4f64_v4i64:
382 ; AVX512VL: ## %bb.0:
383 ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
384 ; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
385 ; AVX512VL-NEXT: retq
386 %mask = icmp eq <4 x i64> %trigger, zeroinitializer
387 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
388 ret <4 x double> %res
391 define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <8 x double> %dst) {
392 ; SSE-LABEL: load_v8f64_v8i16:
394 ; SSE-NEXT: pxor %xmm5, %xmm5
395 ; SSE-NEXT: pcmpeqw %xmm0, %xmm5
396 ; SSE-NEXT: packsswb %xmm0, %xmm5
397 ; SSE-NEXT: pmovmskb %xmm5, %eax
398 ; SSE-NEXT: testb $1, %al
399 ; SSE-NEXT: jne LBB5_1
400 ; SSE-NEXT: ## %bb.2: ## %else
401 ; SSE-NEXT: testb $2, %al
402 ; SSE-NEXT: jne LBB5_3
403 ; SSE-NEXT: LBB5_4: ## %else2
404 ; SSE-NEXT: testb $4, %al
405 ; SSE-NEXT: jne LBB5_5
406 ; SSE-NEXT: LBB5_6: ## %else5
407 ; SSE-NEXT: testb $8, %al
408 ; SSE-NEXT: jne LBB5_7
409 ; SSE-NEXT: LBB5_8: ## %else8
410 ; SSE-NEXT: testb $16, %al
411 ; SSE-NEXT: jne LBB5_9
412 ; SSE-NEXT: LBB5_10: ## %else11
413 ; SSE-NEXT: testb $32, %al
414 ; SSE-NEXT: jne LBB5_11
415 ; SSE-NEXT: LBB5_12: ## %else14
416 ; SSE-NEXT: testb $64, %al
417 ; SSE-NEXT: jne LBB5_13
418 ; SSE-NEXT: LBB5_14: ## %else17
419 ; SSE-NEXT: testb $-128, %al
420 ; SSE-NEXT: je LBB5_16
421 ; SSE-NEXT: LBB5_15: ## %cond.load19
422 ; SSE-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
423 ; SSE-NEXT: LBB5_16: ## %else20
424 ; SSE-NEXT: movaps %xmm1, %xmm0
425 ; SSE-NEXT: movaps %xmm2, %xmm1
426 ; SSE-NEXT: movaps %xmm3, %xmm2
427 ; SSE-NEXT: movaps %xmm4, %xmm3
429 ; SSE-NEXT: LBB5_1: ## %cond.load
430 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
431 ; SSE-NEXT: testb $2, %al
432 ; SSE-NEXT: je LBB5_4
433 ; SSE-NEXT: LBB5_3: ## %cond.load1
434 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
435 ; SSE-NEXT: testb $4, %al
436 ; SSE-NEXT: je LBB5_6
437 ; SSE-NEXT: LBB5_5: ## %cond.load4
438 ; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
439 ; SSE-NEXT: testb $8, %al
440 ; SSE-NEXT: je LBB5_8
441 ; SSE-NEXT: LBB5_7: ## %cond.load7
442 ; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
443 ; SSE-NEXT: testb $16, %al
444 ; SSE-NEXT: je LBB5_10
445 ; SSE-NEXT: LBB5_9: ## %cond.load10
446 ; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
447 ; SSE-NEXT: testb $32, %al
448 ; SSE-NEXT: je LBB5_12
449 ; SSE-NEXT: LBB5_11: ## %cond.load13
450 ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
451 ; SSE-NEXT: testb $64, %al
452 ; SSE-NEXT: je LBB5_14
453 ; SSE-NEXT: LBB5_13: ## %cond.load16
454 ; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
455 ; SSE-NEXT: testb $-128, %al
456 ; SSE-NEXT: jne LBB5_15
457 ; SSE-NEXT: jmp LBB5_16
459 ; AVX1-LABEL: load_v8f64_v8i16:
461 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
462 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
463 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
464 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
465 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
466 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
467 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
468 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
469 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
470 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
471 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
472 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
473 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
474 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
475 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
476 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
477 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
480 ; AVX2-LABEL: load_v8f64_v8i16:
482 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
483 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
484 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
485 ; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
486 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
487 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
488 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
489 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
490 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
491 ; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
492 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
495 ; AVX512F-LABEL: load_v8f64_v8i16:
497 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
498 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
499 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
500 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
501 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
504 ; AVX512VLDQ-LABEL: load_v8f64_v8i16:
505 ; AVX512VLDQ: ## %bb.0:
506 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
507 ; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
508 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
509 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
510 ; AVX512VLDQ-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
511 ; AVX512VLDQ-NEXT: retq
513 ; AVX512VLBW-LABEL: load_v8f64_v8i16:
514 ; AVX512VLBW: ## %bb.0:
515 ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
516 ; AVX512VLBW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
517 ; AVX512VLBW-NEXT: retq
518 %mask = icmp eq <8 x i16> %trigger, zeroinitializer
519 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
520 ret <8 x double> %res
523 define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, <8 x double> %dst) {
524 ; SSE2-LABEL: load_v8f64_v8i64:
526 ; SSE2-NEXT: movdqa %xmm7, %xmm8
527 ; SSE2-NEXT: movaps %xmm6, %xmm9
528 ; SSE2-NEXT: pxor %xmm7, %xmm7
529 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
530 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2]
531 ; SSE2-NEXT: pand %xmm3, %xmm6
532 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
533 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
534 ; SSE2-NEXT: pand %xmm2, %xmm3
535 ; SSE2-NEXT: packssdw %xmm6, %xmm3
536 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm1
537 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
538 ; SSE2-NEXT: pand %xmm1, %xmm2
539 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
540 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
541 ; SSE2-NEXT: pand %xmm0, %xmm1
542 ; SSE2-NEXT: packssdw %xmm2, %xmm1
543 ; SSE2-NEXT: packssdw %xmm3, %xmm1
544 ; SSE2-NEXT: packsswb %xmm0, %xmm1
545 ; SSE2-NEXT: pmovmskb %xmm1, %eax
546 ; SSE2-NEXT: testb $1, %al
547 ; SSE2-NEXT: jne LBB6_1
548 ; SSE2-NEXT: ## %bb.2: ## %else
549 ; SSE2-NEXT: testb $2, %al
550 ; SSE2-NEXT: jne LBB6_3
551 ; SSE2-NEXT: LBB6_4: ## %else2
552 ; SSE2-NEXT: testb $4, %al
553 ; SSE2-NEXT: jne LBB6_5
554 ; SSE2-NEXT: LBB6_6: ## %else5
555 ; SSE2-NEXT: testb $8, %al
556 ; SSE2-NEXT: jne LBB6_7
557 ; SSE2-NEXT: LBB6_8: ## %else8
558 ; SSE2-NEXT: testb $16, %al
559 ; SSE2-NEXT: jne LBB6_9
560 ; SSE2-NEXT: LBB6_10: ## %else11
561 ; SSE2-NEXT: testb $32, %al
562 ; SSE2-NEXT: jne LBB6_11
563 ; SSE2-NEXT: LBB6_12: ## %else14
564 ; SSE2-NEXT: testb $64, %al
565 ; SSE2-NEXT: jne LBB6_13
566 ; SSE2-NEXT: LBB6_14: ## %else17
567 ; SSE2-NEXT: testb $-128, %al
568 ; SSE2-NEXT: je LBB6_16
569 ; SSE2-NEXT: LBB6_15: ## %cond.load19
570 ; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
571 ; SSE2-NEXT: LBB6_16: ## %else20
572 ; SSE2-NEXT: movaps %xmm4, %xmm0
573 ; SSE2-NEXT: movaps %xmm5, %xmm1
574 ; SSE2-NEXT: movaps %xmm9, %xmm2
575 ; SSE2-NEXT: movaps %xmm8, %xmm3
577 ; SSE2-NEXT: LBB6_1: ## %cond.load
578 ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
579 ; SSE2-NEXT: testb $2, %al
580 ; SSE2-NEXT: je LBB6_4
581 ; SSE2-NEXT: LBB6_3: ## %cond.load1
582 ; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
583 ; SSE2-NEXT: testb $4, %al
584 ; SSE2-NEXT: je LBB6_6
585 ; SSE2-NEXT: LBB6_5: ## %cond.load4
586 ; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
587 ; SSE2-NEXT: testb $8, %al
588 ; SSE2-NEXT: je LBB6_8
589 ; SSE2-NEXT: LBB6_7: ## %cond.load7
590 ; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
591 ; SSE2-NEXT: testb $16, %al
592 ; SSE2-NEXT: je LBB6_10
593 ; SSE2-NEXT: LBB6_9: ## %cond.load10
594 ; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3]
595 ; SSE2-NEXT: testb $32, %al
596 ; SSE2-NEXT: je LBB6_12
597 ; SSE2-NEXT: LBB6_11: ## %cond.load13
598 ; SSE2-NEXT: movhps {{.*#+}} xmm9 = xmm9[0,1],mem[0,1]
599 ; SSE2-NEXT: testb $64, %al
600 ; SSE2-NEXT: je LBB6_14
601 ; SSE2-NEXT: LBB6_13: ## %cond.load16
602 ; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
603 ; SSE2-NEXT: testb $-128, %al
604 ; SSE2-NEXT: jne LBB6_15
605 ; SSE2-NEXT: jmp LBB6_16
607 ; SSE42-LABEL: load_v8f64_v8i64:
609 ; SSE42-NEXT: movdqa %xmm7, %xmm8
610 ; SSE42-NEXT: pxor %xmm7, %xmm7
611 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm3
612 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm2
613 ; SSE42-NEXT: packssdw %xmm3, %xmm2
614 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm1
615 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
616 ; SSE42-NEXT: packssdw %xmm1, %xmm0
617 ; SSE42-NEXT: packssdw %xmm2, %xmm0
618 ; SSE42-NEXT: packsswb %xmm0, %xmm0
619 ; SSE42-NEXT: pmovmskb %xmm0, %eax
620 ; SSE42-NEXT: testb $1, %al
621 ; SSE42-NEXT: jne LBB6_1
622 ; SSE42-NEXT: ## %bb.2: ## %else
623 ; SSE42-NEXT: testb $2, %al
624 ; SSE42-NEXT: jne LBB6_3
625 ; SSE42-NEXT: LBB6_4: ## %else2
626 ; SSE42-NEXT: testb $4, %al
627 ; SSE42-NEXT: jne LBB6_5
628 ; SSE42-NEXT: LBB6_6: ## %else5
629 ; SSE42-NEXT: testb $8, %al
630 ; SSE42-NEXT: jne LBB6_7
631 ; SSE42-NEXT: LBB6_8: ## %else8
632 ; SSE42-NEXT: testb $16, %al
633 ; SSE42-NEXT: jne LBB6_9
634 ; SSE42-NEXT: LBB6_10: ## %else11
635 ; SSE42-NEXT: testb $32, %al
636 ; SSE42-NEXT: jne LBB6_11
637 ; SSE42-NEXT: LBB6_12: ## %else14
638 ; SSE42-NEXT: testb $64, %al
639 ; SSE42-NEXT: jne LBB6_13
640 ; SSE42-NEXT: LBB6_14: ## %else17
641 ; SSE42-NEXT: testb $-128, %al
642 ; SSE42-NEXT: je LBB6_16
643 ; SSE42-NEXT: LBB6_15: ## %cond.load19
644 ; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
645 ; SSE42-NEXT: LBB6_16: ## %else20
646 ; SSE42-NEXT: movaps %xmm4, %xmm0
647 ; SSE42-NEXT: movaps %xmm5, %xmm1
648 ; SSE42-NEXT: movaps %xmm6, %xmm2
649 ; SSE42-NEXT: movaps %xmm8, %xmm3
651 ; SSE42-NEXT: LBB6_1: ## %cond.load
652 ; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
653 ; SSE42-NEXT: testb $2, %al
654 ; SSE42-NEXT: je LBB6_4
655 ; SSE42-NEXT: LBB6_3: ## %cond.load1
656 ; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
657 ; SSE42-NEXT: testb $4, %al
658 ; SSE42-NEXT: je LBB6_6
659 ; SSE42-NEXT: LBB6_5: ## %cond.load4
660 ; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
661 ; SSE42-NEXT: testb $8, %al
662 ; SSE42-NEXT: je LBB6_8
663 ; SSE42-NEXT: LBB6_7: ## %cond.load7
664 ; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
665 ; SSE42-NEXT: testb $16, %al
666 ; SSE42-NEXT: je LBB6_10
667 ; SSE42-NEXT: LBB6_9: ## %cond.load10
668 ; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
669 ; SSE42-NEXT: testb $32, %al
670 ; SSE42-NEXT: je LBB6_12
671 ; SSE42-NEXT: LBB6_11: ## %cond.load13
672 ; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
673 ; SSE42-NEXT: testb $64, %al
674 ; SSE42-NEXT: je LBB6_14
675 ; SSE42-NEXT: LBB6_13: ## %cond.load16
676 ; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
677 ; SSE42-NEXT: testb $-128, %al
678 ; SSE42-NEXT: jne LBB6_15
679 ; SSE42-NEXT: jmp LBB6_16
681 ; AVX1-LABEL: load_v8f64_v8i64:
683 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
684 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
685 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
686 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
687 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
688 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
689 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
690 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
691 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
692 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
693 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
694 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
695 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
698 ; AVX2-LABEL: load_v8f64_v8i64:
700 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
701 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
702 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
703 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
704 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
705 ; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
706 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
709 ; AVX512-LABEL: load_v8f64_v8i64:
711 ; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
712 ; AVX512-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
714 %mask = icmp eq <8 x i64> %trigger, zeroinitializer
715 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
716 ret <8 x double> %res
723 define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
724 ; SSE2-LABEL: load_v2f32_v2i32:
726 ; SSE2-NEXT: pxor %xmm2, %xmm2
727 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
728 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
729 ; SSE2-NEXT: movmskpd %xmm0, %eax
730 ; SSE2-NEXT: testb $1, %al
731 ; SSE2-NEXT: jne LBB7_1
732 ; SSE2-NEXT: ## %bb.2: ## %else
733 ; SSE2-NEXT: testb $2, %al
734 ; SSE2-NEXT: jne LBB7_3
735 ; SSE2-NEXT: LBB7_4: ## %else2
736 ; SSE2-NEXT: movaps %xmm1, %xmm0
738 ; SSE2-NEXT: LBB7_1: ## %cond.load
739 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
740 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
741 ; SSE2-NEXT: testb $2, %al
742 ; SSE2-NEXT: je LBB7_4
743 ; SSE2-NEXT: LBB7_3: ## %cond.load1
744 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
745 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
746 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
747 ; SSE2-NEXT: movaps %xmm0, %xmm1
748 ; SSE2-NEXT: movaps %xmm1, %xmm0
751 ; SSE42-LABEL: load_v2f32_v2i32:
753 ; SSE42-NEXT: pxor %xmm2, %xmm2
754 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
755 ; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
756 ; SSE42-NEXT: movmskpd %xmm0, %eax
757 ; SSE42-NEXT: testb $1, %al
758 ; SSE42-NEXT: jne LBB7_1
759 ; SSE42-NEXT: ## %bb.2: ## %else
760 ; SSE42-NEXT: testb $2, %al
761 ; SSE42-NEXT: jne LBB7_3
762 ; SSE42-NEXT: LBB7_4: ## %else2
763 ; SSE42-NEXT: movaps %xmm1, %xmm0
765 ; SSE42-NEXT: LBB7_1: ## %cond.load
766 ; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
767 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
768 ; SSE42-NEXT: testb $2, %al
769 ; SSE42-NEXT: je LBB7_4
770 ; SSE42-NEXT: LBB7_3: ## %cond.load1
771 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
772 ; SSE42-NEXT: movaps %xmm1, %xmm0
775 ; AVX1OR2-LABEL: load_v2f32_v2i32:
777 ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
778 ; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
779 ; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
780 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
781 ; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
784 ; AVX512F-LABEL: load_v2f32_v2i32:
786 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
787 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
788 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
789 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
790 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
791 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
792 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
793 ; AVX512F-NEXT: vzeroupper
796 ; AVX512VLDQ-LABEL: load_v2f32_v2i32:
797 ; AVX512VLDQ: ## %bb.0:
798 ; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
799 ; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
800 ; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
801 ; AVX512VLDQ-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
802 ; AVX512VLDQ-NEXT: retq
804 ; AVX512VLBW-LABEL: load_v2f32_v2i32:
805 ; AVX512VLBW: ## %bb.0:
806 ; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
807 ; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
808 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
809 ; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
810 ; AVX512VLBW-NEXT: retq
811 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
812 %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
816 define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %addr) {
817 ; SSE2-LABEL: load_v2f32_v2i32_undef:
819 ; SSE2-NEXT: pxor %xmm1, %xmm1
820 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
821 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
822 ; SSE2-NEXT: movmskpd %xmm0, %eax
823 ; SSE2-NEXT: testb $1, %al
824 ; SSE2-NEXT: ## implicit-def: $xmm0
825 ; SSE2-NEXT: jne LBB8_1
826 ; SSE2-NEXT: ## %bb.2: ## %else
827 ; SSE2-NEXT: testb $2, %al
828 ; SSE2-NEXT: jne LBB8_3
829 ; SSE2-NEXT: LBB8_4: ## %else2
831 ; SSE2-NEXT: LBB8_1: ## %cond.load
832 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
833 ; SSE2-NEXT: testb $2, %al
834 ; SSE2-NEXT: je LBB8_4
835 ; SSE2-NEXT: LBB8_3: ## %cond.load1
836 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
837 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
838 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
839 ; SSE2-NEXT: movaps %xmm1, %xmm0
842 ; SSE42-LABEL: load_v2f32_v2i32_undef:
844 ; SSE42-NEXT: pxor %xmm1, %xmm1
845 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm1
846 ; SSE42-NEXT: pmovsxdq %xmm1, %xmm0
847 ; SSE42-NEXT: movmskpd %xmm0, %eax
848 ; SSE42-NEXT: testb $1, %al
849 ; SSE42-NEXT: ## implicit-def: $xmm0
850 ; SSE42-NEXT: jne LBB8_1
851 ; SSE42-NEXT: ## %bb.2: ## %else
852 ; SSE42-NEXT: testb $2, %al
853 ; SSE42-NEXT: jne LBB8_3
854 ; SSE42-NEXT: LBB8_4: ## %else2
856 ; SSE42-NEXT: LBB8_1: ## %cond.load
857 ; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
858 ; SSE42-NEXT: testb $2, %al
859 ; SSE42-NEXT: je LBB8_4
860 ; SSE42-NEXT: LBB8_3: ## %cond.load1
861 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
864 ; AVX1OR2-LABEL: load_v2f32_v2i32_undef:
866 ; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
867 ; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
868 ; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
869 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
872 ; AVX512F-LABEL: load_v2f32_v2i32_undef:
874 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
875 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
876 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
877 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
878 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
879 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
880 ; AVX512F-NEXT: vzeroupper
883 ; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef:
884 ; AVX512VLDQ: ## %bb.0:
885 ; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
886 ; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
887 ; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
888 ; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
889 ; AVX512VLDQ-NEXT: retq
891 ; AVX512VLBW-LABEL: load_v2f32_v2i32_undef:
892 ; AVX512VLBW: ## %bb.0:
893 ; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
894 ; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
895 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
896 ; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
897 ; AVX512VLBW-NEXT: retq
898 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
899 %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
903 define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
904 ; SSE2-LABEL: load_v4f32_v4i32:
906 ; SSE2-NEXT: pxor %xmm2, %xmm2
907 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
908 ; SSE2-NEXT: movmskps %xmm2, %eax
909 ; SSE2-NEXT: testb $1, %al
910 ; SSE2-NEXT: jne LBB9_1
911 ; SSE2-NEXT: ## %bb.2: ## %else
912 ; SSE2-NEXT: testb $2, %al
913 ; SSE2-NEXT: jne LBB9_3
914 ; SSE2-NEXT: LBB9_4: ## %else2
915 ; SSE2-NEXT: testb $4, %al
916 ; SSE2-NEXT: jne LBB9_5
917 ; SSE2-NEXT: LBB9_6: ## %else5
918 ; SSE2-NEXT: testb $8, %al
919 ; SSE2-NEXT: jne LBB9_7
920 ; SSE2-NEXT: LBB9_8: ## %else8
921 ; SSE2-NEXT: movaps %xmm1, %xmm0
923 ; SSE2-NEXT: LBB9_1: ## %cond.load
924 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
925 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
926 ; SSE2-NEXT: testb $2, %al
927 ; SSE2-NEXT: je LBB9_4
928 ; SSE2-NEXT: LBB9_3: ## %cond.load1
929 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
930 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
931 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
932 ; SSE2-NEXT: movaps %xmm0, %xmm1
933 ; SSE2-NEXT: testb $4, %al
934 ; SSE2-NEXT: je LBB9_6
935 ; SSE2-NEXT: LBB9_5: ## %cond.load4
936 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
937 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
938 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
939 ; SSE2-NEXT: testb $8, %al
940 ; SSE2-NEXT: je LBB9_8
941 ; SSE2-NEXT: LBB9_7: ## %cond.load7
942 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
943 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
944 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
945 ; SSE2-NEXT: movaps %xmm1, %xmm0
948 ; SSE42-LABEL: load_v4f32_v4i32:
950 ; SSE42-NEXT: pxor %xmm2, %xmm2
951 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
952 ; SSE42-NEXT: movmskps %xmm2, %eax
953 ; SSE42-NEXT: testb $1, %al
954 ; SSE42-NEXT: jne LBB9_1
955 ; SSE42-NEXT: ## %bb.2: ## %else
956 ; SSE42-NEXT: testb $2, %al
957 ; SSE42-NEXT: jne LBB9_3
958 ; SSE42-NEXT: LBB9_4: ## %else2
959 ; SSE42-NEXT: testb $4, %al
960 ; SSE42-NEXT: jne LBB9_5
961 ; SSE42-NEXT: LBB9_6: ## %else5
962 ; SSE42-NEXT: testb $8, %al
963 ; SSE42-NEXT: jne LBB9_7
964 ; SSE42-NEXT: LBB9_8: ## %else8
965 ; SSE42-NEXT: movaps %xmm1, %xmm0
967 ; SSE42-NEXT: LBB9_1: ## %cond.load
968 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
969 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
970 ; SSE42-NEXT: testb $2, %al
971 ; SSE42-NEXT: je LBB9_4
972 ; SSE42-NEXT: LBB9_3: ## %cond.load1
973 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
974 ; SSE42-NEXT: testb $4, %al
975 ; SSE42-NEXT: je LBB9_6
976 ; SSE42-NEXT: LBB9_5: ## %cond.load4
977 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
978 ; SSE42-NEXT: testb $8, %al
979 ; SSE42-NEXT: je LBB9_8
980 ; SSE42-NEXT: LBB9_7: ## %cond.load7
981 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
982 ; SSE42-NEXT: movaps %xmm1, %xmm0
985 ; AVX1OR2-LABEL: load_v4f32_v4i32:
987 ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
988 ; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
989 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
990 ; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
993 ; AVX512F-LABEL: load_v4f32_v4i32:
995 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
996 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
997 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
998 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
999 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1000 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
1001 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
1002 ; AVX512F-NEXT: vzeroupper
1003 ; AVX512F-NEXT: retq
1005 ; AVX512VL-LABEL: load_v4f32_v4i32:
1006 ; AVX512VL: ## %bb.0:
1007 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
1008 ; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
1009 ; AVX512VL-NEXT: retq
1010 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
1011 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
1012 ret <4 x float> %res
1015 define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
1016 ; SSE2-LABEL: load_v8f32_v8i1_zero:
1018 ; SSE2-NEXT: psllw $15, %xmm0
1019 ; SSE2-NEXT: packsswb %xmm0, %xmm0
1020 ; SSE2-NEXT: pmovmskb %xmm0, %eax
1021 ; SSE2-NEXT: pxor %xmm0, %xmm0
1022 ; SSE2-NEXT: testb $1, %al
1023 ; SSE2-NEXT: xorps %xmm1, %xmm1
1024 ; SSE2-NEXT: jne LBB10_1
1025 ; SSE2-NEXT: ## %bb.2: ## %else
1026 ; SSE2-NEXT: testb $2, %al
1027 ; SSE2-NEXT: jne LBB10_3
1028 ; SSE2-NEXT: LBB10_4: ## %else2
1029 ; SSE2-NEXT: testb $4, %al
1030 ; SSE2-NEXT: jne LBB10_5
1031 ; SSE2-NEXT: LBB10_6: ## %else5
1032 ; SSE2-NEXT: testb $8, %al
1033 ; SSE2-NEXT: jne LBB10_7
1034 ; SSE2-NEXT: LBB10_8: ## %else8
1035 ; SSE2-NEXT: testb $16, %al
1036 ; SSE2-NEXT: jne LBB10_9
1037 ; SSE2-NEXT: LBB10_10: ## %else11
1038 ; SSE2-NEXT: testb $32, %al
1039 ; SSE2-NEXT: jne LBB10_11
1040 ; SSE2-NEXT: LBB10_12: ## %else14
1041 ; SSE2-NEXT: testb $64, %al
1042 ; SSE2-NEXT: jne LBB10_13
1043 ; SSE2-NEXT: LBB10_14: ## %else17
1044 ; SSE2-NEXT: testb $-128, %al
1045 ; SSE2-NEXT: jne LBB10_15
1046 ; SSE2-NEXT: LBB10_16: ## %else20
1048 ; SSE2-NEXT: LBB10_1: ## %cond.load
1049 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1050 ; SSE2-NEXT: xorps %xmm1, %xmm1
1051 ; SSE2-NEXT: testb $2, %al
1052 ; SSE2-NEXT: je LBB10_4
1053 ; SSE2-NEXT: LBB10_3: ## %cond.load1
1054 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1055 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
1056 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
1057 ; SSE2-NEXT: movaps %xmm2, %xmm0
1058 ; SSE2-NEXT: testb $4, %al
1059 ; SSE2-NEXT: je LBB10_6
1060 ; SSE2-NEXT: LBB10_5: ## %cond.load4
1061 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1062 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
1063 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
1064 ; SSE2-NEXT: testb $8, %al
1065 ; SSE2-NEXT: je LBB10_8
1066 ; SSE2-NEXT: LBB10_7: ## %cond.load7
1067 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1068 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
1069 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1070 ; SSE2-NEXT: testb $16, %al
1071 ; SSE2-NEXT: je LBB10_10
1072 ; SSE2-NEXT: LBB10_9: ## %cond.load10
1073 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1074 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1075 ; SSE2-NEXT: testb $32, %al
1076 ; SSE2-NEXT: je LBB10_12
1077 ; SSE2-NEXT: LBB10_11: ## %cond.load13
1078 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1079 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
1080 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
1081 ; SSE2-NEXT: movaps %xmm2, %xmm1
1082 ; SSE2-NEXT: testb $64, %al
1083 ; SSE2-NEXT: je LBB10_14
1084 ; SSE2-NEXT: LBB10_13: ## %cond.load16
1085 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1086 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
1087 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
1088 ; SSE2-NEXT: testb $-128, %al
1089 ; SSE2-NEXT: je LBB10_16
1090 ; SSE2-NEXT: LBB10_15: ## %cond.load19
1091 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1092 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
1093 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1096 ; SSE42-LABEL: load_v8f32_v8i1_zero:
1098 ; SSE42-NEXT: psllw $15, %xmm0
1099 ; SSE42-NEXT: packsswb %xmm0, %xmm0
1100 ; SSE42-NEXT: pmovmskb %xmm0, %eax
1101 ; SSE42-NEXT: pxor %xmm0, %xmm0
1102 ; SSE42-NEXT: testb $1, %al
1103 ; SSE42-NEXT: xorps %xmm1, %xmm1
1104 ; SSE42-NEXT: jne LBB10_1
1105 ; SSE42-NEXT: ## %bb.2: ## %else
1106 ; SSE42-NEXT: testb $2, %al
1107 ; SSE42-NEXT: jne LBB10_3
1108 ; SSE42-NEXT: LBB10_4: ## %else2
1109 ; SSE42-NEXT: testb $4, %al
1110 ; SSE42-NEXT: jne LBB10_5
1111 ; SSE42-NEXT: LBB10_6: ## %else5
1112 ; SSE42-NEXT: testb $8, %al
1113 ; SSE42-NEXT: jne LBB10_7
1114 ; SSE42-NEXT: LBB10_8: ## %else8
1115 ; SSE42-NEXT: testb $16, %al
1116 ; SSE42-NEXT: jne LBB10_9
1117 ; SSE42-NEXT: LBB10_10: ## %else11
1118 ; SSE42-NEXT: testb $32, %al
1119 ; SSE42-NEXT: jne LBB10_11
1120 ; SSE42-NEXT: LBB10_12: ## %else14
1121 ; SSE42-NEXT: testb $64, %al
1122 ; SSE42-NEXT: jne LBB10_13
1123 ; SSE42-NEXT: LBB10_14: ## %else17
1124 ; SSE42-NEXT: testb $-128, %al
1125 ; SSE42-NEXT: jne LBB10_15
1126 ; SSE42-NEXT: LBB10_16: ## %else20
1128 ; SSE42-NEXT: LBB10_1: ## %cond.load
1129 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1130 ; SSE42-NEXT: xorps %xmm1, %xmm1
1131 ; SSE42-NEXT: testb $2, %al
1132 ; SSE42-NEXT: je LBB10_4
1133 ; SSE42-NEXT: LBB10_3: ## %cond.load1
1134 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1135 ; SSE42-NEXT: testb $4, %al
1136 ; SSE42-NEXT: je LBB10_6
1137 ; SSE42-NEXT: LBB10_5: ## %cond.load4
1138 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1139 ; SSE42-NEXT: testb $8, %al
1140 ; SSE42-NEXT: je LBB10_8
1141 ; SSE42-NEXT: LBB10_7: ## %cond.load7
1142 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1143 ; SSE42-NEXT: testb $16, %al
1144 ; SSE42-NEXT: je LBB10_10
1145 ; SSE42-NEXT: LBB10_9: ## %cond.load10
1146 ; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1147 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1148 ; SSE42-NEXT: testb $32, %al
1149 ; SSE42-NEXT: je LBB10_12
1150 ; SSE42-NEXT: LBB10_11: ## %cond.load13
1151 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
1152 ; SSE42-NEXT: testb $64, %al
1153 ; SSE42-NEXT: je LBB10_14
1154 ; SSE42-NEXT: LBB10_13: ## %cond.load16
1155 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
1156 ; SSE42-NEXT: testb $-128, %al
1157 ; SSE42-NEXT: je LBB10_16
1158 ; SSE42-NEXT: LBB10_15: ## %cond.load19
1159 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
1162 ; AVX1-LABEL: load_v8f32_v8i1_zero:
1164 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1165 ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
1166 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
1167 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1168 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
1169 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
1170 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1171 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
1174 ; AVX2-LABEL: load_v8f32_v8i1_zero:
1176 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1177 ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
1178 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
1179 ; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
1182 ; AVX512F-LABEL: load_v8f32_v8i1_zero:
1183 ; AVX512F: ## %bb.0:
1184 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
1185 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
1186 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
1187 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
1188 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
1189 ; AVX512F-NEXT: retq
1191 ; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero:
1192 ; AVX512VLDQ: ## %bb.0:
1193 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
1194 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
1195 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
1196 ; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
1197 ; AVX512VLDQ-NEXT: retq
1199 ; AVX512VLBW-LABEL: load_v8f32_v8i1_zero:
1200 ; AVX512VLBW: ## %bb.0:
1201 ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
1202 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
1203 ; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
1204 ; AVX512VLBW-NEXT: retq
1205 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
1206 ret <8 x float> %res
1209 define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
1210 ; SSE2-LABEL: load_v8f32_v8i32:
1212 ; SSE2-NEXT: pxor %xmm4, %xmm4
1213 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
1214 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
1215 ; SSE2-NEXT: packssdw %xmm1, %xmm0
1216 ; SSE2-NEXT: packsswb %xmm0, %xmm0
1217 ; SSE2-NEXT: pmovmskb %xmm0, %eax
1218 ; SSE2-NEXT: testb $1, %al
1219 ; SSE2-NEXT: jne LBB11_1
1220 ; SSE2-NEXT: ## %bb.2: ## %else
1221 ; SSE2-NEXT: testb $2, %al
1222 ; SSE2-NEXT: jne LBB11_3
1223 ; SSE2-NEXT: LBB11_4: ## %else2
1224 ; SSE2-NEXT: testb $4, %al
1225 ; SSE2-NEXT: jne LBB11_5
1226 ; SSE2-NEXT: LBB11_6: ## %else5
1227 ; SSE2-NEXT: testb $8, %al
1228 ; SSE2-NEXT: jne LBB11_7
1229 ; SSE2-NEXT: LBB11_8: ## %else8
1230 ; SSE2-NEXT: testb $16, %al
1231 ; SSE2-NEXT: jne LBB11_9
1232 ; SSE2-NEXT: LBB11_10: ## %else11
1233 ; SSE2-NEXT: testb $32, %al
1234 ; SSE2-NEXT: jne LBB11_11
1235 ; SSE2-NEXT: LBB11_12: ## %else14
1236 ; SSE2-NEXT: testb $64, %al
1237 ; SSE2-NEXT: jne LBB11_13
1238 ; SSE2-NEXT: LBB11_14: ## %else17
1239 ; SSE2-NEXT: testb $-128, %al
1240 ; SSE2-NEXT: je LBB11_16
1241 ; SSE2-NEXT: LBB11_15: ## %cond.load19
1242 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1243 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0]
1244 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
1245 ; SSE2-NEXT: LBB11_16: ## %else20
1246 ; SSE2-NEXT: movaps %xmm2, %xmm0
1247 ; SSE2-NEXT: movaps %xmm3, %xmm1
1249 ; SSE2-NEXT: LBB11_1: ## %cond.load
1250 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1251 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
1252 ; SSE2-NEXT: testb $2, %al
1253 ; SSE2-NEXT: je LBB11_4
1254 ; SSE2-NEXT: LBB11_3: ## %cond.load1
1255 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1256 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
1257 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1258 ; SSE2-NEXT: movaps %xmm0, %xmm2
1259 ; SSE2-NEXT: testb $4, %al
1260 ; SSE2-NEXT: je LBB11_6
1261 ; SSE2-NEXT: LBB11_5: ## %cond.load4
1262 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1263 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
1264 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
1265 ; SSE2-NEXT: testb $8, %al
1266 ; SSE2-NEXT: je LBB11_8
1267 ; SSE2-NEXT: LBB11_7: ## %cond.load7
1268 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1269 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
1270 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
1271 ; SSE2-NEXT: testb $16, %al
1272 ; SSE2-NEXT: je LBB11_10
1273 ; SSE2-NEXT: LBB11_9: ## %cond.load10
1274 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1275 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
1276 ; SSE2-NEXT: testb $32, %al
1277 ; SSE2-NEXT: je LBB11_12
1278 ; SSE2-NEXT: LBB11_11: ## %cond.load13
1279 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1280 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0]
1281 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
1282 ; SSE2-NEXT: movaps %xmm0, %xmm3
1283 ; SSE2-NEXT: testb $64, %al
1284 ; SSE2-NEXT: je LBB11_14
1285 ; SSE2-NEXT: LBB11_13: ## %cond.load16
1286 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1287 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0]
1288 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
1289 ; SSE2-NEXT: testb $-128, %al
1290 ; SSE2-NEXT: jne LBB11_15
1291 ; SSE2-NEXT: jmp LBB11_16
1293 ; SSE42-LABEL: load_v8f32_v8i32:
1295 ; SSE42-NEXT: pxor %xmm4, %xmm4
1296 ; SSE42-NEXT: pcmpeqd %xmm4, %xmm1
1297 ; SSE42-NEXT: pcmpeqd %xmm4, %xmm0
1298 ; SSE42-NEXT: packssdw %xmm1, %xmm0
1299 ; SSE42-NEXT: packsswb %xmm0, %xmm0
1300 ; SSE42-NEXT: pmovmskb %xmm0, %eax
1301 ; SSE42-NEXT: testb $1, %al
1302 ; SSE42-NEXT: jne LBB11_1
1303 ; SSE42-NEXT: ## %bb.2: ## %else
1304 ; SSE42-NEXT: testb $2, %al
1305 ; SSE42-NEXT: jne LBB11_3
1306 ; SSE42-NEXT: LBB11_4: ## %else2
1307 ; SSE42-NEXT: testb $4, %al
1308 ; SSE42-NEXT: jne LBB11_5
1309 ; SSE42-NEXT: LBB11_6: ## %else5
1310 ; SSE42-NEXT: testb $8, %al
1311 ; SSE42-NEXT: jne LBB11_7
1312 ; SSE42-NEXT: LBB11_8: ## %else8
1313 ; SSE42-NEXT: testb $16, %al
1314 ; SSE42-NEXT: jne LBB11_9
1315 ; SSE42-NEXT: LBB11_10: ## %else11
1316 ; SSE42-NEXT: testb $32, %al
1317 ; SSE42-NEXT: jne LBB11_11
1318 ; SSE42-NEXT: LBB11_12: ## %else14
1319 ; SSE42-NEXT: testb $64, %al
1320 ; SSE42-NEXT: jne LBB11_13
1321 ; SSE42-NEXT: LBB11_14: ## %else17
1322 ; SSE42-NEXT: testb $-128, %al
1323 ; SSE42-NEXT: je LBB11_16
1324 ; SSE42-NEXT: LBB11_15: ## %cond.load19
1325 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
1326 ; SSE42-NEXT: LBB11_16: ## %else20
1327 ; SSE42-NEXT: movaps %xmm2, %xmm0
1328 ; SSE42-NEXT: movaps %xmm3, %xmm1
1330 ; SSE42-NEXT: LBB11_1: ## %cond.load
1331 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1332 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
1333 ; SSE42-NEXT: testb $2, %al
1334 ; SSE42-NEXT: je LBB11_4
1335 ; SSE42-NEXT: LBB11_3: ## %cond.load1
1336 ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1337 ; SSE42-NEXT: testb $4, %al
1338 ; SSE42-NEXT: je LBB11_6
1339 ; SSE42-NEXT: LBB11_5: ## %cond.load4
1340 ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
1341 ; SSE42-NEXT: testb $8, %al
1342 ; SSE42-NEXT: je LBB11_8
1343 ; SSE42-NEXT: LBB11_7: ## %cond.load7
1344 ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
1345 ; SSE42-NEXT: testb $16, %al
1346 ; SSE42-NEXT: je LBB11_10
1347 ; SSE42-NEXT: LBB11_9: ## %cond.load10
1348 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1349 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
1350 ; SSE42-NEXT: testb $32, %al
1351 ; SSE42-NEXT: je LBB11_12
1352 ; SSE42-NEXT: LBB11_11: ## %cond.load13
1353 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
1354 ; SSE42-NEXT: testb $64, %al
1355 ; SSE42-NEXT: je LBB11_14
1356 ; SSE42-NEXT: LBB11_13: ## %cond.load16
1357 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
1358 ; SSE42-NEXT: testb $-128, %al
1359 ; SSE42-NEXT: jne LBB11_15
1360 ; SSE42-NEXT: jmp LBB11_16
1362 ; AVX1-LABEL: load_v8f32_v8i32:
1364 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1365 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1366 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1367 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
1368 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1369 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
1370 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
1373 ; AVX2-LABEL: load_v8f32_v8i32:
1375 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1376 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
1377 ; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
1378 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
1381 ; AVX512F-LABEL: load_v8f32_v8i32:
1382 ; AVX512F: ## %bb.0:
1383 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
1384 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
1385 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
1386 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
1387 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
1388 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
1389 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
1390 ; AVX512F-NEXT: retq
1392 ; AVX512VL-LABEL: load_v8f32_v8i32:
1393 ; AVX512VL: ## %bb.0:
1394 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
1395 ; AVX512VL-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
1396 ; AVX512VL-NEXT: retq
1397 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
1398 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
1399 ret <8 x float> %res
1407 define <1 x i64> @load_v1i64_v1i64(<1 x i64> %trigger, <1 x i64>* %addr, <1 x i64> %dst) {
1408 ; SSE-LABEL: load_v1i64_v1i64:
1410 ; SSE-NEXT: testq %rdi, %rdi
1411 ; SSE-NEXT: jne LBB12_1
1412 ; SSE-NEXT: ## %bb.2: ## %cond.load
1413 ; SSE-NEXT: movq (%rsi), %rax
1415 ; SSE-NEXT: LBB12_1:
1416 ; SSE-NEXT: movq %rdx, %rax
1419 ; AVX-LABEL: load_v1i64_v1i64:
1421 ; AVX-NEXT: testq %rdi, %rdi
1422 ; AVX-NEXT: jne LBB12_1
1423 ; AVX-NEXT: ## %bb.2: ## %cond.load
1424 ; AVX-NEXT: movq (%rsi), %rax
1426 ; AVX-NEXT: LBB12_1:
1427 ; AVX-NEXT: movq %rdx, %rax
1429 %mask = icmp eq <1 x i64> %trigger, zeroinitializer
1430 %res = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* %addr, i32 4, <1 x i1> %mask, <1 x i64> %dst)
1434 define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %dst) {
1435 ; SSE2-LABEL: load_v2i64_v2i64:
1437 ; SSE2-NEXT: pxor %xmm2, %xmm2
1438 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
1439 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
1440 ; SSE2-NEXT: pand %xmm2, %xmm0
1441 ; SSE2-NEXT: movmskpd %xmm0, %eax
1442 ; SSE2-NEXT: testb $1, %al
1443 ; SSE2-NEXT: jne LBB13_1
1444 ; SSE2-NEXT: ## %bb.2: ## %else
1445 ; SSE2-NEXT: testb $2, %al
1446 ; SSE2-NEXT: jne LBB13_3
1447 ; SSE2-NEXT: LBB13_4: ## %else2
1448 ; SSE2-NEXT: movaps %xmm1, %xmm0
1450 ; SSE2-NEXT: LBB13_1: ## %cond.load
1451 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
1452 ; SSE2-NEXT: testb $2, %al
1453 ; SSE2-NEXT: je LBB13_4
1454 ; SSE2-NEXT: LBB13_3: ## %cond.load1
1455 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1456 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1457 ; SSE2-NEXT: movaps %xmm1, %xmm0
1460 ; SSE42-LABEL: load_v2i64_v2i64:
1462 ; SSE42-NEXT: pxor %xmm2, %xmm2
1463 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
1464 ; SSE42-NEXT: movmskpd %xmm2, %eax
1465 ; SSE42-NEXT: testb $1, %al
1466 ; SSE42-NEXT: jne LBB13_1
1467 ; SSE42-NEXT: ## %bb.2: ## %else
1468 ; SSE42-NEXT: testb $2, %al
1469 ; SSE42-NEXT: jne LBB13_3
1470 ; SSE42-NEXT: LBB13_4: ## %else2
1471 ; SSE42-NEXT: movdqa %xmm1, %xmm0
1473 ; SSE42-NEXT: LBB13_1: ## %cond.load
1474 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1
1475 ; SSE42-NEXT: testb $2, %al
1476 ; SSE42-NEXT: je LBB13_4
1477 ; SSE42-NEXT: LBB13_3: ## %cond.load1
1478 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1
1479 ; SSE42-NEXT: movdqa %xmm1, %xmm0
1482 ; AVX1-LABEL: load_v2i64_v2i64:
1484 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1485 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
1486 ; AVX1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
1487 ; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
1490 ; AVX2-LABEL: load_v2i64_v2i64:
1492 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1493 ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
1494 ; AVX2-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2
1495 ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
1498 ; AVX512F-LABEL: load_v2i64_v2i64:
1499 ; AVX512F: ## %bb.0:
1500 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1501 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1502 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
1503 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
1504 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
1505 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1506 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
1507 ; AVX512F-NEXT: vzeroupper
1508 ; AVX512F-NEXT: retq
1510 ; AVX512VL-LABEL: load_v2i64_v2i64:
1511 ; AVX512VL: ## %bb.0:
1512 ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
1513 ; AVX512VL-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
1514 ; AVX512VL-NEXT: retq
1515 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
1516 %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst)
1520 define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %dst) {
1521 ; SSE2-LABEL: load_v4i64_v4i64:
1523 ; SSE2-NEXT: pxor %xmm4, %xmm4
1524 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
1525 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
1526 ; SSE2-NEXT: pand %xmm1, %xmm5
1527 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
1528 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
1529 ; SSE2-NEXT: pand %xmm0, %xmm1
1530 ; SSE2-NEXT: packssdw %xmm5, %xmm1
1531 ; SSE2-NEXT: movmskps %xmm1, %eax
1532 ; SSE2-NEXT: testb $1, %al
1533 ; SSE2-NEXT: jne LBB14_1
1534 ; SSE2-NEXT: ## %bb.2: ## %else
1535 ; SSE2-NEXT: testb $2, %al
1536 ; SSE2-NEXT: jne LBB14_3
1537 ; SSE2-NEXT: LBB14_4: ## %else2
1538 ; SSE2-NEXT: testb $4, %al
1539 ; SSE2-NEXT: jne LBB14_5
1540 ; SSE2-NEXT: LBB14_6: ## %else5
1541 ; SSE2-NEXT: testb $8, %al
1542 ; SSE2-NEXT: je LBB14_8
1543 ; SSE2-NEXT: LBB14_7: ## %cond.load7
1544 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1545 ; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1546 ; SSE2-NEXT: LBB14_8: ## %else8
1547 ; SSE2-NEXT: movaps %xmm2, %xmm0
1548 ; SSE2-NEXT: movaps %xmm3, %xmm1
1550 ; SSE2-NEXT: LBB14_1: ## %cond.load
1551 ; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1552 ; SSE2-NEXT: testb $2, %al
1553 ; SSE2-NEXT: je LBB14_4
1554 ; SSE2-NEXT: LBB14_3: ## %cond.load1
1555 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1556 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1557 ; SSE2-NEXT: testb $4, %al
1558 ; SSE2-NEXT: je LBB14_6
1559 ; SSE2-NEXT: LBB14_5: ## %cond.load4
1560 ; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
1561 ; SSE2-NEXT: testb $8, %al
1562 ; SSE2-NEXT: jne LBB14_7
1563 ; SSE2-NEXT: jmp LBB14_8
1565 ; SSE42-LABEL: load_v4i64_v4i64:
1567 ; SSE42-NEXT: pxor %xmm4, %xmm4
1568 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
1569 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
1570 ; SSE42-NEXT: packssdw %xmm1, %xmm0
1571 ; SSE42-NEXT: movmskps %xmm0, %eax
1572 ; SSE42-NEXT: testb $1, %al
1573 ; SSE42-NEXT: jne LBB14_1
1574 ; SSE42-NEXT: ## %bb.2: ## %else
1575 ; SSE42-NEXT: testb $2, %al
1576 ; SSE42-NEXT: jne LBB14_3
1577 ; SSE42-NEXT: LBB14_4: ## %else2
1578 ; SSE42-NEXT: testb $4, %al
1579 ; SSE42-NEXT: jne LBB14_5
1580 ; SSE42-NEXT: LBB14_6: ## %else5
1581 ; SSE42-NEXT: testb $8, %al
1582 ; SSE42-NEXT: je LBB14_8
1583 ; SSE42-NEXT: LBB14_7: ## %cond.load7
1584 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm3
1585 ; SSE42-NEXT: LBB14_8: ## %else8
1586 ; SSE42-NEXT: movdqa %xmm2, %xmm0
1587 ; SSE42-NEXT: movdqa %xmm3, %xmm1
1589 ; SSE42-NEXT: LBB14_1: ## %cond.load
1590 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm2
1591 ; SSE42-NEXT: testb $2, %al
1592 ; SSE42-NEXT: je LBB14_4
1593 ; SSE42-NEXT: LBB14_3: ## %cond.load1
1594 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm2
1595 ; SSE42-NEXT: testb $4, %al
1596 ; SSE42-NEXT: je LBB14_6
1597 ; SSE42-NEXT: LBB14_5: ## %cond.load4
1598 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm3
1599 ; SSE42-NEXT: testb $8, %al
1600 ; SSE42-NEXT: jne LBB14_7
1601 ; SSE42-NEXT: jmp LBB14_8
1603 ; AVX1-LABEL: load_v4i64_v4i64:
1605 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1606 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1607 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1608 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
1609 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1610 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
1611 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
1614 ; AVX2-LABEL: load_v4i64_v4i64:
1616 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1617 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
1618 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2
1619 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
1622 ; AVX512F-LABEL: load_v4i64_v4i64:
1623 ; AVX512F: ## %bb.0:
1624 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
1625 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
1626 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
1627 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1628 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1629 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1630 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
1631 ; AVX512F-NEXT: retq
1633 ; AVX512VL-LABEL: load_v4i64_v4i64:
1634 ; AVX512VL: ## %bb.0:
1635 ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
1636 ; AVX512VL-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
1637 ; AVX512VL-NEXT: retq
1638 %mask = icmp eq <4 x i64> %trigger, zeroinitializer
1639 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst)
1643 define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i64> %dst) {
1644 ; SSE2-LABEL: load_v8i64_v8i16:
1646 ; SSE2-NEXT: pxor %xmm5, %xmm5
1647 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
1648 ; SSE2-NEXT: packsswb %xmm0, %xmm5
1649 ; SSE2-NEXT: pmovmskb %xmm5, %eax
1650 ; SSE2-NEXT: testb $1, %al
1651 ; SSE2-NEXT: jne LBB15_1
1652 ; SSE2-NEXT: ## %bb.2: ## %else
1653 ; SSE2-NEXT: testb $2, %al
1654 ; SSE2-NEXT: jne LBB15_3
1655 ; SSE2-NEXT: LBB15_4: ## %else2
1656 ; SSE2-NEXT: testb $4, %al
1657 ; SSE2-NEXT: jne LBB15_5
1658 ; SSE2-NEXT: LBB15_6: ## %else5
1659 ; SSE2-NEXT: testb $8, %al
1660 ; SSE2-NEXT: jne LBB15_7
1661 ; SSE2-NEXT: LBB15_8: ## %else8
1662 ; SSE2-NEXT: testb $16, %al
1663 ; SSE2-NEXT: jne LBB15_9
1664 ; SSE2-NEXT: LBB15_10: ## %else11
1665 ; SSE2-NEXT: testb $32, %al
1666 ; SSE2-NEXT: jne LBB15_11
1667 ; SSE2-NEXT: LBB15_12: ## %else14
1668 ; SSE2-NEXT: testb $64, %al
1669 ; SSE2-NEXT: jne LBB15_13
1670 ; SSE2-NEXT: LBB15_14: ## %else17
1671 ; SSE2-NEXT: testb $-128, %al
1672 ; SSE2-NEXT: je LBB15_16
1673 ; SSE2-NEXT: LBB15_15: ## %cond.load19
1674 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1675 ; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
1676 ; SSE2-NEXT: LBB15_16: ## %else20
1677 ; SSE2-NEXT: movaps %xmm1, %xmm0
1678 ; SSE2-NEXT: movaps %xmm2, %xmm1
1679 ; SSE2-NEXT: movaps %xmm3, %xmm2
1680 ; SSE2-NEXT: movaps %xmm4, %xmm3
1682 ; SSE2-NEXT: LBB15_1: ## %cond.load
1683 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
1684 ; SSE2-NEXT: testb $2, %al
1685 ; SSE2-NEXT: je LBB15_4
1686 ; SSE2-NEXT: LBB15_3: ## %cond.load1
1687 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1688 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1689 ; SSE2-NEXT: testb $4, %al
1690 ; SSE2-NEXT: je LBB15_6
1691 ; SSE2-NEXT: LBB15_5: ## %cond.load4
1692 ; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1693 ; SSE2-NEXT: testb $8, %al
1694 ; SSE2-NEXT: je LBB15_8
1695 ; SSE2-NEXT: LBB15_7: ## %cond.load7
1696 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1697 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1698 ; SSE2-NEXT: testb $16, %al
1699 ; SSE2-NEXT: je LBB15_10
1700 ; SSE2-NEXT: LBB15_9: ## %cond.load10
1701 ; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
1702 ; SSE2-NEXT: testb $32, %al
1703 ; SSE2-NEXT: je LBB15_12
1704 ; SSE2-NEXT: LBB15_11: ## %cond.load13
1705 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1706 ; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1707 ; SSE2-NEXT: testb $64, %al
1708 ; SSE2-NEXT: je LBB15_14
1709 ; SSE2-NEXT: LBB15_13: ## %cond.load16
1710 ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
1711 ; SSE2-NEXT: testb $-128, %al
1712 ; SSE2-NEXT: jne LBB15_15
1713 ; SSE2-NEXT: jmp LBB15_16
1715 ; SSE42-LABEL: load_v8i64_v8i16:
1717 ; SSE42-NEXT: pxor %xmm5, %xmm5
1718 ; SSE42-NEXT: pcmpeqw %xmm0, %xmm5
1719 ; SSE42-NEXT: packsswb %xmm0, %xmm5
1720 ; SSE42-NEXT: pmovmskb %xmm5, %eax
1721 ; SSE42-NEXT: testb $1, %al
1722 ; SSE42-NEXT: jne LBB15_1
1723 ; SSE42-NEXT: ## %bb.2: ## %else
1724 ; SSE42-NEXT: testb $2, %al
1725 ; SSE42-NEXT: jne LBB15_3
1726 ; SSE42-NEXT: LBB15_4: ## %else2
1727 ; SSE42-NEXT: testb $4, %al
1728 ; SSE42-NEXT: jne LBB15_5
1729 ; SSE42-NEXT: LBB15_6: ## %else5
1730 ; SSE42-NEXT: testb $8, %al
1731 ; SSE42-NEXT: jne LBB15_7
1732 ; SSE42-NEXT: LBB15_8: ## %else8
1733 ; SSE42-NEXT: testb $16, %al
1734 ; SSE42-NEXT: jne LBB15_9
1735 ; SSE42-NEXT: LBB15_10: ## %else11
1736 ; SSE42-NEXT: testb $32, %al
1737 ; SSE42-NEXT: jne LBB15_11
1738 ; SSE42-NEXT: LBB15_12: ## %else14
1739 ; SSE42-NEXT: testb $64, %al
1740 ; SSE42-NEXT: jne LBB15_13
1741 ; SSE42-NEXT: LBB15_14: ## %else17
1742 ; SSE42-NEXT: testb $-128, %al
1743 ; SSE42-NEXT: je LBB15_16
1744 ; SSE42-NEXT: LBB15_15: ## %cond.load19
1745 ; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm4
1746 ; SSE42-NEXT: LBB15_16: ## %else20
1747 ; SSE42-NEXT: movdqa %xmm1, %xmm0
1748 ; SSE42-NEXT: movdqa %xmm2, %xmm1
1749 ; SSE42-NEXT: movdqa %xmm3, %xmm2
1750 ; SSE42-NEXT: movdqa %xmm4, %xmm3
1752 ; SSE42-NEXT: LBB15_1: ## %cond.load
1753 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1
1754 ; SSE42-NEXT: testb $2, %al
1755 ; SSE42-NEXT: je LBB15_4
1756 ; SSE42-NEXT: LBB15_3: ## %cond.load1
1757 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1
1758 ; SSE42-NEXT: testb $4, %al
1759 ; SSE42-NEXT: je LBB15_6
1760 ; SSE42-NEXT: LBB15_5: ## %cond.load4
1761 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm2
1762 ; SSE42-NEXT: testb $8, %al
1763 ; SSE42-NEXT: je LBB15_8
1764 ; SSE42-NEXT: LBB15_7: ## %cond.load7
1765 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm2
1766 ; SSE42-NEXT: testb $16, %al
1767 ; SSE42-NEXT: je LBB15_10
1768 ; SSE42-NEXT: LBB15_9: ## %cond.load10
1769 ; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm3
1770 ; SSE42-NEXT: testb $32, %al
1771 ; SSE42-NEXT: je LBB15_12
1772 ; SSE42-NEXT: LBB15_11: ## %cond.load13
1773 ; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm3
1774 ; SSE42-NEXT: testb $64, %al
1775 ; SSE42-NEXT: je LBB15_14
1776 ; SSE42-NEXT: LBB15_13: ## %cond.load16
1777 ; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm4
1778 ; SSE42-NEXT: testb $-128, %al
1779 ; SSE42-NEXT: jne LBB15_15
1780 ; SSE42-NEXT: jmp LBB15_16
1782 ; AVX1-LABEL: load_v8i64_v8i16:
1784 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
1785 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
1786 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
1787 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
1788 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
1789 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1790 ; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
1791 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1792 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
1793 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
1794 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1795 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
1796 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
1797 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
1798 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
1799 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
1800 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1803 ; AVX2-LABEL: load_v8i64_v8i16:
1805 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
1806 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1807 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
1808 ; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
1809 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
1810 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
1811 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
1812 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
1813 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
1814 ; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1
1815 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1818 ; AVX512F-LABEL: load_v8i64_v8i16:
1819 ; AVX512F: ## %bb.0:
1820 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1821 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
1822 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
1823 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
1824 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1825 ; AVX512F-NEXT: retq
1827 ; AVX512VLDQ-LABEL: load_v8i64_v8i16:
1828 ; AVX512VLDQ: ## %bb.0:
1829 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
1830 ; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
1831 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
1832 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
1833 ; AVX512VLDQ-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1834 ; AVX512VLDQ-NEXT: retq
1836 ; AVX512VLBW-LABEL: load_v8i64_v8i16:
1837 ; AVX512VLBW: ## %bb.0:
1838 ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
1839 ; AVX512VLBW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1840 ; AVX512VLBW-NEXT: retq
1841 %mask = icmp eq <8 x i16> %trigger, zeroinitializer
1842 %res = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
; Masked load of <8 x i64>: lanes where (%trigger == 0) are loaded from %addr,
; other lanes keep the pass-through value %dst. The CHECK lines in this function
; were autogenerated by utils/update_llc_test_checks.py -- do not edit them by
; hand; rerun the script after any codegen change.
1846 define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i64> %dst) {
1847 ; SSE2-LABEL: load_v8i64_v8i64:
1849 ; SSE2-NEXT: movdqa %xmm7, %xmm8
1850 ; SSE2-NEXT: movaps %xmm6, %xmm9
1851 ; SSE2-NEXT: pxor %xmm7, %xmm7
1852 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
1853 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2]
1854 ; SSE2-NEXT: pand %xmm3, %xmm6
1855 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm2
1856 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
1857 ; SSE2-NEXT: pand %xmm2, %xmm3
1858 ; SSE2-NEXT: packssdw %xmm6, %xmm3
1859 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm1
1860 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
1861 ; SSE2-NEXT: pand %xmm1, %xmm2
1862 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm0
1863 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
1864 ; SSE2-NEXT: pand %xmm0, %xmm1
1865 ; SSE2-NEXT: packssdw %xmm2, %xmm1
1866 ; SSE2-NEXT: packssdw %xmm3, %xmm1
1867 ; SSE2-NEXT: packsswb %xmm0, %xmm1
1868 ; SSE2-NEXT: pmovmskb %xmm1, %eax
1869 ; SSE2-NEXT: testb $1, %al
1870 ; SSE2-NEXT: jne LBB16_1
1871 ; SSE2-NEXT: ## %bb.2: ## %else
1872 ; SSE2-NEXT: testb $2, %al
1873 ; SSE2-NEXT: jne LBB16_3
1874 ; SSE2-NEXT: LBB16_4: ## %else2
1875 ; SSE2-NEXT: testb $4, %al
1876 ; SSE2-NEXT: jne LBB16_5
1877 ; SSE2-NEXT: LBB16_6: ## %else5
1878 ; SSE2-NEXT: testb $8, %al
1879 ; SSE2-NEXT: jne LBB16_7
1880 ; SSE2-NEXT: LBB16_8: ## %else8
1881 ; SSE2-NEXT: testb $16, %al
1882 ; SSE2-NEXT: jne LBB16_9
1883 ; SSE2-NEXT: LBB16_10: ## %else11
1884 ; SSE2-NEXT: testb $32, %al
1885 ; SSE2-NEXT: jne LBB16_11
1886 ; SSE2-NEXT: LBB16_12: ## %else14
1887 ; SSE2-NEXT: testb $64, %al
1888 ; SSE2-NEXT: jne LBB16_13
1889 ; SSE2-NEXT: LBB16_14: ## %else17
1890 ; SSE2-NEXT: testb $-128, %al
1891 ; SSE2-NEXT: je LBB16_16
1892 ; SSE2-NEXT: LBB16_15: ## %cond.load19
1893 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1894 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0]
1895 ; SSE2-NEXT: LBB16_16: ## %else20
1896 ; SSE2-NEXT: movaps %xmm4, %xmm0
1897 ; SSE2-NEXT: movaps %xmm5, %xmm1
1898 ; SSE2-NEXT: movaps %xmm9, %xmm2
1899 ; SSE2-NEXT: movdqa %xmm8, %xmm3
1901 ; SSE2-NEXT: LBB16_1: ## %cond.load
1902 ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
1903 ; SSE2-NEXT: testb $2, %al
1904 ; SSE2-NEXT: je LBB16_4
1905 ; SSE2-NEXT: LBB16_3: ## %cond.load1
1906 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1907 ; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
1908 ; SSE2-NEXT: testb $4, %al
1909 ; SSE2-NEXT: je LBB16_6
1910 ; SSE2-NEXT: LBB16_5: ## %cond.load4
1911 ; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
1912 ; SSE2-NEXT: testb $8, %al
1913 ; SSE2-NEXT: je LBB16_8
1914 ; SSE2-NEXT: LBB16_7: ## %cond.load7
1915 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1916 ; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
1917 ; SSE2-NEXT: testb $16, %al
1918 ; SSE2-NEXT: je LBB16_10
1919 ; SSE2-NEXT: LBB16_9: ## %cond.load10
1920 ; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3]
1921 ; SSE2-NEXT: testb $32, %al
1922 ; SSE2-NEXT: je LBB16_12
1923 ; SSE2-NEXT: LBB16_11: ## %cond.load13
1924 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1925 ; SSE2-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
1926 ; SSE2-NEXT: testb $64, %al
1927 ; SSE2-NEXT: je LBB16_14
1928 ; SSE2-NEXT: LBB16_13: ## %cond.load16
1929 ; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
1930 ; SSE2-NEXT: testb $-128, %al
1931 ; SSE2-NEXT: jne LBB16_15
1932 ; SSE2-NEXT: jmp LBB16_16
1934 ; SSE42-LABEL: load_v8i64_v8i64:
1936 ; SSE42-NEXT: movdqa %xmm7, %xmm8
1937 ; SSE42-NEXT: pxor %xmm7, %xmm7
1938 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm3
1939 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm2
1940 ; SSE42-NEXT: packssdw %xmm3, %xmm2
1941 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm1
1942 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
1943 ; SSE42-NEXT: packssdw %xmm1, %xmm0
1944 ; SSE42-NEXT: packssdw %xmm2, %xmm0
1945 ; SSE42-NEXT: packsswb %xmm0, %xmm0
1946 ; SSE42-NEXT: pmovmskb %xmm0, %eax
1947 ; SSE42-NEXT: testb $1, %al
1948 ; SSE42-NEXT: jne LBB16_1
1949 ; SSE42-NEXT: ## %bb.2: ## %else
1950 ; SSE42-NEXT: testb $2, %al
1951 ; SSE42-NEXT: jne LBB16_3
1952 ; SSE42-NEXT: LBB16_4: ## %else2
1953 ; SSE42-NEXT: testb $4, %al
1954 ; SSE42-NEXT: jne LBB16_5
1955 ; SSE42-NEXT: LBB16_6: ## %else5
1956 ; SSE42-NEXT: testb $8, %al
1957 ; SSE42-NEXT: jne LBB16_7
1958 ; SSE42-NEXT: LBB16_8: ## %else8
1959 ; SSE42-NEXT: testb $16, %al
1960 ; SSE42-NEXT: jne LBB16_9
1961 ; SSE42-NEXT: LBB16_10: ## %else11
1962 ; SSE42-NEXT: testb $32, %al
1963 ; SSE42-NEXT: jne LBB16_11
1964 ; SSE42-NEXT: LBB16_12: ## %else14
1965 ; SSE42-NEXT: testb $64, %al
1966 ; SSE42-NEXT: jne LBB16_13
1967 ; SSE42-NEXT: LBB16_14: ## %else17
1968 ; SSE42-NEXT: testb $-128, %al
1969 ; SSE42-NEXT: je LBB16_16
1970 ; SSE42-NEXT: LBB16_15: ## %cond.load19
1971 ; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm8
1972 ; SSE42-NEXT: LBB16_16: ## %else20
1973 ; SSE42-NEXT: movdqa %xmm4, %xmm0
1974 ; SSE42-NEXT: movdqa %xmm5, %xmm1
1975 ; SSE42-NEXT: movdqa %xmm6, %xmm2
1976 ; SSE42-NEXT: movdqa %xmm8, %xmm3
1978 ; SSE42-NEXT: LBB16_1: ## %cond.load
1979 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4
1980 ; SSE42-NEXT: testb $2, %al
1981 ; SSE42-NEXT: je LBB16_4
1982 ; SSE42-NEXT: LBB16_3: ## %cond.load1
1983 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm4
1984 ; SSE42-NEXT: testb $4, %al
1985 ; SSE42-NEXT: je LBB16_6
1986 ; SSE42-NEXT: LBB16_5: ## %cond.load4
1987 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm5
1988 ; SSE42-NEXT: testb $8, %al
1989 ; SSE42-NEXT: je LBB16_8
1990 ; SSE42-NEXT: LBB16_7: ## %cond.load7
1991 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm5
1992 ; SSE42-NEXT: testb $16, %al
1993 ; SSE42-NEXT: je LBB16_10
1994 ; SSE42-NEXT: LBB16_9: ## %cond.load10
1995 ; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm6
1996 ; SSE42-NEXT: testb $32, %al
1997 ; SSE42-NEXT: je LBB16_12
1998 ; SSE42-NEXT: LBB16_11: ## %cond.load13
1999 ; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm6
2000 ; SSE42-NEXT: testb $64, %al
2001 ; SSE42-NEXT: je LBB16_14
2002 ; SSE42-NEXT: LBB16_13: ## %cond.load16
2003 ; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm8
2004 ; SSE42-NEXT: testb $-128, %al
2005 ; SSE42-NEXT: jne LBB16_15
2006 ; SSE42-NEXT: jmp LBB16_16
2008 ; AVX1-LABEL: load_v8i64_v8i64:
2010 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2011 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
2012 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
2013 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
2014 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2015 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
2016 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
2017 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
2018 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
2019 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
2020 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
2021 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
2022 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
2025 ; AVX2-LABEL: load_v8i64_v8i64:
2027 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
2028 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
2029 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
2030 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
2031 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
2032 ; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm2
2033 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
2036 ; AVX512-LABEL: load_v8i64_v8i64:
2038 ; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
2039 ; AVX512-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
2041 %mask = icmp eq <8 x i64> %trigger, zeroinitializer
2042 %res = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
; Masked load of <2 x i32> with mask (%trigger == 0); checks both the scalarized
; pre-AVX lowering and the AVX/AVX512 masked-move lowerings. Autogenerated CHECK
; lines (utils/update_llc_test_checks.py) -- regenerate rather than hand-edit.
2050 define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
2051 ; SSE2-LABEL: load_v2i32_v2i32:
2053 ; SSE2-NEXT: pxor %xmm2, %xmm2
2054 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
2055 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
2056 ; SSE2-NEXT: movmskpd %xmm0, %eax
2057 ; SSE2-NEXT: testb $1, %al
2058 ; SSE2-NEXT: jne LBB17_1
2059 ; SSE2-NEXT: ## %bb.2: ## %else
2060 ; SSE2-NEXT: testb $2, %al
2061 ; SSE2-NEXT: jne LBB17_3
2062 ; SSE2-NEXT: LBB17_4: ## %else2
2063 ; SSE2-NEXT: movaps %xmm1, %xmm0
2065 ; SSE2-NEXT: LBB17_1: ## %cond.load
2066 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2067 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2068 ; SSE2-NEXT: testb $2, %al
2069 ; SSE2-NEXT: je LBB17_4
2070 ; SSE2-NEXT: LBB17_3: ## %cond.load1
2071 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2072 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
2073 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2074 ; SSE2-NEXT: movaps %xmm0, %xmm1
2075 ; SSE2-NEXT: movaps %xmm1, %xmm0
2078 ; SSE42-LABEL: load_v2i32_v2i32:
2080 ; SSE42-NEXT: pxor %xmm2, %xmm2
2081 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
2082 ; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
2083 ; SSE42-NEXT: movmskpd %xmm0, %eax
2084 ; SSE42-NEXT: testb $1, %al
2085 ; SSE42-NEXT: jne LBB17_1
2086 ; SSE42-NEXT: ## %bb.2: ## %else
2087 ; SSE42-NEXT: testb $2, %al
2088 ; SSE42-NEXT: jne LBB17_3
2089 ; SSE42-NEXT: LBB17_4: ## %else2
2090 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2092 ; SSE42-NEXT: LBB17_1: ## %cond.load
2093 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
2094 ; SSE42-NEXT: testb $2, %al
2095 ; SSE42-NEXT: je LBB17_4
2096 ; SSE42-NEXT: LBB17_3: ## %cond.load1
2097 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
2098 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2101 ; AVX1-LABEL: load_v2i32_v2i32:
2103 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2104 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2105 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2106 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
2107 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2110 ; AVX2-LABEL: load_v2i32_v2i32:
2112 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2113 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2114 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2115 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
2116 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2119 ; AVX512F-LABEL: load_v2i32_v2i32:
2120 ; AVX512F: ## %bb.0:
2121 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2122 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2123 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
2124 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
2125 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
2126 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2127 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
2128 ; AVX512F-NEXT: vzeroupper
2129 ; AVX512F-NEXT: retq
2131 ; AVX512VLDQ-LABEL: load_v2i32_v2i32:
2132 ; AVX512VLDQ: ## %bb.0:
2133 ; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
2134 ; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
2135 ; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
2136 ; AVX512VLDQ-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2137 ; AVX512VLDQ-NEXT: retq
2139 ; AVX512VLBW-LABEL: load_v2i32_v2i32:
2140 ; AVX512VLBW: ## %bb.0:
2141 ; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
2142 ; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
2143 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
2144 ; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2145 ; AVX512VLBW-NEXT: retq
2146 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
2147 %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
; Masked load of <4 x i32> with mask (%trigger == 0); scalarized branch ladder
; pre-AVX, vmaskmov/vpmaskmov on AVX1/AVX2, k-register blend on AVX512.
; Autogenerated CHECK lines (utils/update_llc_test_checks.py) -- regenerate
; rather than hand-edit.
2151 define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
2152 ; SSE2-LABEL: load_v4i32_v4i32:
2154 ; SSE2-NEXT: pxor %xmm2, %xmm2
2155 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
2156 ; SSE2-NEXT: movmskps %xmm2, %eax
2157 ; SSE2-NEXT: testb $1, %al
2158 ; SSE2-NEXT: jne LBB18_1
2159 ; SSE2-NEXT: ## %bb.2: ## %else
2160 ; SSE2-NEXT: testb $2, %al
2161 ; SSE2-NEXT: jne LBB18_3
2162 ; SSE2-NEXT: LBB18_4: ## %else2
2163 ; SSE2-NEXT: testb $4, %al
2164 ; SSE2-NEXT: jne LBB18_5
2165 ; SSE2-NEXT: LBB18_6: ## %else5
2166 ; SSE2-NEXT: testb $8, %al
2167 ; SSE2-NEXT: jne LBB18_7
2168 ; SSE2-NEXT: LBB18_8: ## %else8
2169 ; SSE2-NEXT: movaps %xmm1, %xmm0
2171 ; SSE2-NEXT: LBB18_1: ## %cond.load
2172 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2173 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2174 ; SSE2-NEXT: testb $2, %al
2175 ; SSE2-NEXT: je LBB18_4
2176 ; SSE2-NEXT: LBB18_3: ## %cond.load1
2177 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2178 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
2179 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2180 ; SSE2-NEXT: movaps %xmm0, %xmm1
2181 ; SSE2-NEXT: testb $4, %al
2182 ; SSE2-NEXT: je LBB18_6
2183 ; SSE2-NEXT: LBB18_5: ## %cond.load4
2184 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2185 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
2186 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
2187 ; SSE2-NEXT: testb $8, %al
2188 ; SSE2-NEXT: je LBB18_8
2189 ; SSE2-NEXT: LBB18_7: ## %cond.load7
2190 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2191 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
2192 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2193 ; SSE2-NEXT: movaps %xmm1, %xmm0
2196 ; SSE42-LABEL: load_v4i32_v4i32:
2198 ; SSE42-NEXT: pxor %xmm2, %xmm2
2199 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
2200 ; SSE42-NEXT: movmskps %xmm2, %eax
2201 ; SSE42-NEXT: testb $1, %al
2202 ; SSE42-NEXT: jne LBB18_1
2203 ; SSE42-NEXT: ## %bb.2: ## %else
2204 ; SSE42-NEXT: testb $2, %al
2205 ; SSE42-NEXT: jne LBB18_3
2206 ; SSE42-NEXT: LBB18_4: ## %else2
2207 ; SSE42-NEXT: testb $4, %al
2208 ; SSE42-NEXT: jne LBB18_5
2209 ; SSE42-NEXT: LBB18_6: ## %else5
2210 ; SSE42-NEXT: testb $8, %al
2211 ; SSE42-NEXT: jne LBB18_7
2212 ; SSE42-NEXT: LBB18_8: ## %else8
2213 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2215 ; SSE42-NEXT: LBB18_1: ## %cond.load
2216 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
2217 ; SSE42-NEXT: testb $2, %al
2218 ; SSE42-NEXT: je LBB18_4
2219 ; SSE42-NEXT: LBB18_3: ## %cond.load1
2220 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
2221 ; SSE42-NEXT: testb $4, %al
2222 ; SSE42-NEXT: je LBB18_6
2223 ; SSE42-NEXT: LBB18_5: ## %cond.load4
2224 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
2225 ; SSE42-NEXT: testb $8, %al
2226 ; SSE42-NEXT: je LBB18_8
2227 ; SSE42-NEXT: LBB18_7: ## %cond.load7
2228 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
2229 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2232 ; AVX1-LABEL: load_v4i32_v4i32:
2234 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2235 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2236 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
2237 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2240 ; AVX2-LABEL: load_v4i32_v4i32:
2242 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2243 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2244 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
2245 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2248 ; AVX512F-LABEL: load_v4i32_v4i32:
2249 ; AVX512F: ## %bb.0:
2250 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2251 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2252 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
2253 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
2254 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
2255 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2256 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
2257 ; AVX512F-NEXT: vzeroupper
2258 ; AVX512F-NEXT: retq
2260 ; AVX512VL-LABEL: load_v4i32_v4i32:
2261 ; AVX512VL: ## %bb.0:
2262 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
2263 ; AVX512VL-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2264 ; AVX512VL-NEXT: retq
2265 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
2266 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
; Masked load of <8 x i32> where the mask is passed directly as an <8 x i1>
; argument (no compare in the IR), with %dst as pass-through. Autogenerated
; CHECK lines (utils/update_llc_test_checks.py) -- regenerate rather than
; hand-edit.
2270 define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
2271 ; SSE2-LABEL: load_v8i32_v8i1:
2273 ; SSE2-NEXT: psllw $15, %xmm0
2274 ; SSE2-NEXT: packsswb %xmm0, %xmm0
2275 ; SSE2-NEXT: pmovmskb %xmm0, %eax
2276 ; SSE2-NEXT: testb $1, %al
2277 ; SSE2-NEXT: jne LBB19_1
2278 ; SSE2-NEXT: ## %bb.2: ## %else
2279 ; SSE2-NEXT: testb $2, %al
2280 ; SSE2-NEXT: jne LBB19_3
2281 ; SSE2-NEXT: LBB19_4: ## %else2
2282 ; SSE2-NEXT: testb $4, %al
2283 ; SSE2-NEXT: jne LBB19_5
2284 ; SSE2-NEXT: LBB19_6: ## %else5
2285 ; SSE2-NEXT: testb $8, %al
2286 ; SSE2-NEXT: jne LBB19_7
2287 ; SSE2-NEXT: LBB19_8: ## %else8
2288 ; SSE2-NEXT: testb $16, %al
2289 ; SSE2-NEXT: jne LBB19_9
2290 ; SSE2-NEXT: LBB19_10: ## %else11
2291 ; SSE2-NEXT: testb $32, %al
2292 ; SSE2-NEXT: jne LBB19_11
2293 ; SSE2-NEXT: LBB19_12: ## %else14
2294 ; SSE2-NEXT: testb $64, %al
2295 ; SSE2-NEXT: jne LBB19_13
2296 ; SSE2-NEXT: LBB19_14: ## %else17
2297 ; SSE2-NEXT: testb $-128, %al
2298 ; SSE2-NEXT: je LBB19_16
2299 ; SSE2-NEXT: LBB19_15: ## %cond.load19
2300 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2301 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
2302 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
2303 ; SSE2-NEXT: LBB19_16: ## %else20
2304 ; SSE2-NEXT: movaps %xmm1, %xmm0
2305 ; SSE2-NEXT: movaps %xmm2, %xmm1
2307 ; SSE2-NEXT: LBB19_1: ## %cond.load
2308 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2309 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2310 ; SSE2-NEXT: testb $2, %al
2311 ; SSE2-NEXT: je LBB19_4
2312 ; SSE2-NEXT: LBB19_3: ## %cond.load1
2313 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2314 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
2315 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2316 ; SSE2-NEXT: movaps %xmm0, %xmm1
2317 ; SSE2-NEXT: testb $4, %al
2318 ; SSE2-NEXT: je LBB19_6
2319 ; SSE2-NEXT: LBB19_5: ## %cond.load4
2320 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2321 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
2322 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
2323 ; SSE2-NEXT: testb $8, %al
2324 ; SSE2-NEXT: je LBB19_8
2325 ; SSE2-NEXT: LBB19_7: ## %cond.load7
2326 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2327 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
2328 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2329 ; SSE2-NEXT: testb $16, %al
2330 ; SSE2-NEXT: je LBB19_10
2331 ; SSE2-NEXT: LBB19_9: ## %cond.load10
2332 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2333 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2334 ; SSE2-NEXT: testb $32, %al
2335 ; SSE2-NEXT: je LBB19_12
2336 ; SSE2-NEXT: LBB19_11: ## %cond.load13
2337 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2338 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
2339 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
2340 ; SSE2-NEXT: movaps %xmm0, %xmm2
2341 ; SSE2-NEXT: testb $64, %al
2342 ; SSE2-NEXT: je LBB19_14
2343 ; SSE2-NEXT: LBB19_13: ## %cond.load16
2344 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2345 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
2346 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
2347 ; SSE2-NEXT: testb $-128, %al
2348 ; SSE2-NEXT: jne LBB19_15
2349 ; SSE2-NEXT: jmp LBB19_16
2351 ; SSE42-LABEL: load_v8i32_v8i1:
2353 ; SSE42-NEXT: psllw $15, %xmm0
2354 ; SSE42-NEXT: packsswb %xmm0, %xmm0
2355 ; SSE42-NEXT: pmovmskb %xmm0, %eax
2356 ; SSE42-NEXT: testb $1, %al
2357 ; SSE42-NEXT: jne LBB19_1
2358 ; SSE42-NEXT: ## %bb.2: ## %else
2359 ; SSE42-NEXT: testb $2, %al
2360 ; SSE42-NEXT: jne LBB19_3
2361 ; SSE42-NEXT: LBB19_4: ## %else2
2362 ; SSE42-NEXT: testb $4, %al
2363 ; SSE42-NEXT: jne LBB19_5
2364 ; SSE42-NEXT: LBB19_6: ## %else5
2365 ; SSE42-NEXT: testb $8, %al
2366 ; SSE42-NEXT: jne LBB19_7
2367 ; SSE42-NEXT: LBB19_8: ## %else8
2368 ; SSE42-NEXT: testb $16, %al
2369 ; SSE42-NEXT: jne LBB19_9
2370 ; SSE42-NEXT: LBB19_10: ## %else11
2371 ; SSE42-NEXT: testb $32, %al
2372 ; SSE42-NEXT: jne LBB19_11
2373 ; SSE42-NEXT: LBB19_12: ## %else14
2374 ; SSE42-NEXT: testb $64, %al
2375 ; SSE42-NEXT: jne LBB19_13
2376 ; SSE42-NEXT: LBB19_14: ## %else17
2377 ; SSE42-NEXT: testb $-128, %al
2378 ; SSE42-NEXT: je LBB19_16
2379 ; SSE42-NEXT: LBB19_15: ## %cond.load19
2380 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm2
2381 ; SSE42-NEXT: LBB19_16: ## %else20
2382 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2383 ; SSE42-NEXT: movdqa %xmm2, %xmm1
2385 ; SSE42-NEXT: LBB19_1: ## %cond.load
2386 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
2387 ; SSE42-NEXT: testb $2, %al
2388 ; SSE42-NEXT: je LBB19_4
2389 ; SSE42-NEXT: LBB19_3: ## %cond.load1
2390 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
2391 ; SSE42-NEXT: testb $4, %al
2392 ; SSE42-NEXT: je LBB19_6
2393 ; SSE42-NEXT: LBB19_5: ## %cond.load4
2394 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
2395 ; SSE42-NEXT: testb $8, %al
2396 ; SSE42-NEXT: je LBB19_8
2397 ; SSE42-NEXT: LBB19_7: ## %cond.load7
2398 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
2399 ; SSE42-NEXT: testb $16, %al
2400 ; SSE42-NEXT: je LBB19_10
2401 ; SSE42-NEXT: LBB19_9: ## %cond.load10
2402 ; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm2
2403 ; SSE42-NEXT: testb $32, %al
2404 ; SSE42-NEXT: je LBB19_12
2405 ; SSE42-NEXT: LBB19_11: ## %cond.load13
2406 ; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm2
2407 ; SSE42-NEXT: testb $64, %al
2408 ; SSE42-NEXT: je LBB19_14
2409 ; SSE42-NEXT: LBB19_13: ## %cond.load16
2410 ; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm2
2411 ; SSE42-NEXT: testb $-128, %al
2412 ; SSE42-NEXT: jne LBB19_15
2413 ; SSE42-NEXT: jmp LBB19_16
2415 ; AVX1-LABEL: load_v8i32_v8i1:
2417 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2418 ; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
2419 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
2420 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2421 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
2422 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
2423 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2424 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
2425 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
2428 ; AVX2-LABEL: load_v8i32_v8i1:
2430 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2431 ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
2432 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
2433 ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
2434 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
2437 ; AVX512F-LABEL: load_v8i32_v8i1:
2438 ; AVX512F: ## %bb.0:
2439 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
2440 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
2441 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
2442 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
2443 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2444 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2445 ; AVX512F-NEXT: retq
2447 ; AVX512VLDQ-LABEL: load_v8i32_v8i1:
2448 ; AVX512VLDQ: ## %bb.0:
2449 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
2450 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
2451 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
2452 ; AVX512VLDQ-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
2453 ; AVX512VLDQ-NEXT: retq
2455 ; AVX512VLBW-LABEL: load_v8i32_v8i1:
2456 ; AVX512VLBW: ## %bb.0:
2457 ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
2458 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
2459 ; AVX512VLBW-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
2460 ; AVX512VLBW-NEXT: retq
2461 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
2465 define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
2466 ; SSE2-LABEL: load_v8i32_v8i1_zero:
2468 ; SSE2-NEXT: psllw $15, %xmm0
2469 ; SSE2-NEXT: packsswb %xmm0, %xmm0
2470 ; SSE2-NEXT: pmovmskb %xmm0, %eax
2471 ; SSE2-NEXT: pxor %xmm0, %xmm0
2472 ; SSE2-NEXT: testb $1, %al
2473 ; SSE2-NEXT: xorps %xmm1, %xmm1
2474 ; SSE2-NEXT: jne LBB20_1
2475 ; SSE2-NEXT: ## %bb.2: ## %else
2476 ; SSE2-NEXT: testb $2, %al
2477 ; SSE2-NEXT: jne LBB20_3
2478 ; SSE2-NEXT: LBB20_4: ## %else2
2479 ; SSE2-NEXT: testb $4, %al
2480 ; SSE2-NEXT: jne LBB20_5
2481 ; SSE2-NEXT: LBB20_6: ## %else5
2482 ; SSE2-NEXT: testb $8, %al
2483 ; SSE2-NEXT: jne LBB20_7
2484 ; SSE2-NEXT: LBB20_8: ## %else8
2485 ; SSE2-NEXT: testb $16, %al
2486 ; SSE2-NEXT: jne LBB20_9
2487 ; SSE2-NEXT: LBB20_10: ## %else11
2488 ; SSE2-NEXT: testb $32, %al
2489 ; SSE2-NEXT: jne LBB20_11
2490 ; SSE2-NEXT: LBB20_12: ## %else14
2491 ; SSE2-NEXT: testb $64, %al
2492 ; SSE2-NEXT: jne LBB20_13
2493 ; SSE2-NEXT: LBB20_14: ## %else17
2494 ; SSE2-NEXT: testb $-128, %al
2495 ; SSE2-NEXT: jne LBB20_15
2496 ; SSE2-NEXT: LBB20_16: ## %else20
2498 ; SSE2-NEXT: LBB20_1: ## %cond.load
2499 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2500 ; SSE2-NEXT: xorps %xmm1, %xmm1
2501 ; SSE2-NEXT: testb $2, %al
2502 ; SSE2-NEXT: je LBB20_4
2503 ; SSE2-NEXT: LBB20_3: ## %cond.load1
2504 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2505 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
2506 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
2507 ; SSE2-NEXT: movaps %xmm2, %xmm0
2508 ; SSE2-NEXT: testb $4, %al
2509 ; SSE2-NEXT: je LBB20_6
2510 ; SSE2-NEXT: LBB20_5: ## %cond.load4
2511 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2512 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
2513 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
2514 ; SSE2-NEXT: testb $8, %al
2515 ; SSE2-NEXT: je LBB20_8
2516 ; SSE2-NEXT: LBB20_7: ## %cond.load7
2517 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2518 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
2519 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
2520 ; SSE2-NEXT: testb $16, %al
2521 ; SSE2-NEXT: je LBB20_10
2522 ; SSE2-NEXT: LBB20_9: ## %cond.load10
2523 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2524 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2525 ; SSE2-NEXT: testb $32, %al
2526 ; SSE2-NEXT: je LBB20_12
2527 ; SSE2-NEXT: LBB20_11: ## %cond.load13
2528 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2529 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
2530 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
2531 ; SSE2-NEXT: movaps %xmm2, %xmm1
2532 ; SSE2-NEXT: testb $64, %al
2533 ; SSE2-NEXT: je LBB20_14
2534 ; SSE2-NEXT: LBB20_13: ## %cond.load16
2535 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2536 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
2537 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
2538 ; SSE2-NEXT: testb $-128, %al
2539 ; SSE2-NEXT: je LBB20_16
2540 ; SSE2-NEXT: LBB20_15: ## %cond.load19
2541 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2542 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
2543 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
2546 ; SSE42-LABEL: load_v8i32_v8i1_zero:
2548 ; SSE42-NEXT: psllw $15, %xmm0
2549 ; SSE42-NEXT: packsswb %xmm0, %xmm0
2550 ; SSE42-NEXT: pmovmskb %xmm0, %eax
2551 ; SSE42-NEXT: pxor %xmm0, %xmm0
2552 ; SSE42-NEXT: testb $1, %al
2553 ; SSE42-NEXT: pxor %xmm1, %xmm1
2554 ; SSE42-NEXT: jne LBB20_1
2555 ; SSE42-NEXT: ## %bb.2: ## %else
2556 ; SSE42-NEXT: testb $2, %al
2557 ; SSE42-NEXT: jne LBB20_3
2558 ; SSE42-NEXT: LBB20_4: ## %else2
2559 ; SSE42-NEXT: testb $4, %al
2560 ; SSE42-NEXT: jne LBB20_5
2561 ; SSE42-NEXT: LBB20_6: ## %else5
2562 ; SSE42-NEXT: testb $8, %al
2563 ; SSE42-NEXT: jne LBB20_7
2564 ; SSE42-NEXT: LBB20_8: ## %else8
2565 ; SSE42-NEXT: testb $16, %al
2566 ; SSE42-NEXT: jne LBB20_9
2567 ; SSE42-NEXT: LBB20_10: ## %else11
2568 ; SSE42-NEXT: testb $32, %al
2569 ; SSE42-NEXT: jne LBB20_11
2570 ; SSE42-NEXT: LBB20_12: ## %else14
2571 ; SSE42-NEXT: testb $64, %al
2572 ; SSE42-NEXT: jne LBB20_13
2573 ; SSE42-NEXT: LBB20_14: ## %else17
2574 ; SSE42-NEXT: testb $-128, %al
2575 ; SSE42-NEXT: jne LBB20_15
2576 ; SSE42-NEXT: LBB20_16: ## %else20
2578 ; SSE42-NEXT: LBB20_1: ## %cond.load
2579 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2580 ; SSE42-NEXT: pxor %xmm1, %xmm1
2581 ; SSE42-NEXT: testb $2, %al
2582 ; SSE42-NEXT: je LBB20_4
2583 ; SSE42-NEXT: LBB20_3: ## %cond.load1
2584 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
2585 ; SSE42-NEXT: testb $4, %al
2586 ; SSE42-NEXT: je LBB20_6
2587 ; SSE42-NEXT: LBB20_5: ## %cond.load4
2588 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
2589 ; SSE42-NEXT: testb $8, %al
2590 ; SSE42-NEXT: je LBB20_8
2591 ; SSE42-NEXT: LBB20_7: ## %cond.load7
2592 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
2593 ; SSE42-NEXT: testb $16, %al
2594 ; SSE42-NEXT: je LBB20_10
2595 ; SSE42-NEXT: LBB20_9: ## %cond.load10
2596 ; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm1
2597 ; SSE42-NEXT: testb $32, %al
2598 ; SSE42-NEXT: je LBB20_12
2599 ; SSE42-NEXT: LBB20_11: ## %cond.load13
2600 ; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm1
2601 ; SSE42-NEXT: testb $64, %al
2602 ; SSE42-NEXT: je LBB20_14
2603 ; SSE42-NEXT: LBB20_13: ## %cond.load16
2604 ; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm1
2605 ; SSE42-NEXT: testb $-128, %al
2606 ; SSE42-NEXT: je LBB20_16
2607 ; SSE42-NEXT: LBB20_15: ## %cond.load19
2608 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1
2611 ; AVX1-LABEL: load_v8i32_v8i1_zero:
2613 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2614 ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
2615 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
2616 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2617 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
2618 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
2619 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2620 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
2623 ; AVX2-LABEL: load_v8i32_v8i1_zero:
2625 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2626 ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
2627 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
2628 ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
2631 ; AVX512F-LABEL: load_v8i32_v8i1_zero:
2632 ; AVX512F: ## %bb.0:
2633 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
2634 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
2635 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
2636 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
2637 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2638 ; AVX512F-NEXT: retq
2640 ; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero:
2641 ; AVX512VLDQ: ## %bb.0:
2642 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
2643 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
2644 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
2645 ; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
2646 ; AVX512VLDQ-NEXT: retq
2648 ; AVX512VLBW-LABEL: load_v8i32_v8i1_zero:
2649 ; AVX512VLBW: ## %bb.0:
2650 ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
2651 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
2652 ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
2653 ; AVX512VLBW-NEXT: retq
2654 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
2662 define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i16> %dst) {
2663 ; SSE-LABEL: load_v8i16_v8i16:
2665 ; SSE-NEXT: packsswb %xmm0, %xmm0
2666 ; SSE-NEXT: pmovmskb %xmm0, %eax
2667 ; SSE-NEXT: testb $1, %al
2668 ; SSE-NEXT: jne LBB21_1
2669 ; SSE-NEXT: ## %bb.2: ## %else
2670 ; SSE-NEXT: testb $2, %al
2671 ; SSE-NEXT: jne LBB21_3
2672 ; SSE-NEXT: LBB21_4: ## %else2
2673 ; SSE-NEXT: testb $4, %al
2674 ; SSE-NEXT: jne LBB21_5
2675 ; SSE-NEXT: LBB21_6: ## %else5
2676 ; SSE-NEXT: testb $8, %al
2677 ; SSE-NEXT: jne LBB21_7
2678 ; SSE-NEXT: LBB21_8: ## %else8
2679 ; SSE-NEXT: testb $16, %al
2680 ; SSE-NEXT: jne LBB21_9
2681 ; SSE-NEXT: LBB21_10: ## %else11
2682 ; SSE-NEXT: testb $32, %al
2683 ; SSE-NEXT: jne LBB21_11
2684 ; SSE-NEXT: LBB21_12: ## %else14
2685 ; SSE-NEXT: testb $64, %al
2686 ; SSE-NEXT: jne LBB21_13
2687 ; SSE-NEXT: LBB21_14: ## %else17
2688 ; SSE-NEXT: testb $-128, %al
2689 ; SSE-NEXT: jne LBB21_15
2690 ; SSE-NEXT: LBB21_16: ## %else20
2691 ; SSE-NEXT: movdqa %xmm1, %xmm0
2693 ; SSE-NEXT: LBB21_1: ## %cond.load
2694 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm1
2695 ; SSE-NEXT: testb $2, %al
2696 ; SSE-NEXT: je LBB21_4
2697 ; SSE-NEXT: LBB21_3: ## %cond.load1
2698 ; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm1
2699 ; SSE-NEXT: testb $4, %al
2700 ; SSE-NEXT: je LBB21_6
2701 ; SSE-NEXT: LBB21_5: ## %cond.load4
2702 ; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm1
2703 ; SSE-NEXT: testb $8, %al
2704 ; SSE-NEXT: je LBB21_8
2705 ; SSE-NEXT: LBB21_7: ## %cond.load7
2706 ; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm1
2707 ; SSE-NEXT: testb $16, %al
2708 ; SSE-NEXT: je LBB21_10
2709 ; SSE-NEXT: LBB21_9: ## %cond.load10
2710 ; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm1
2711 ; SSE-NEXT: testb $32, %al
2712 ; SSE-NEXT: je LBB21_12
2713 ; SSE-NEXT: LBB21_11: ## %cond.load13
2714 ; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm1
2715 ; SSE-NEXT: testb $64, %al
2716 ; SSE-NEXT: je LBB21_14
2717 ; SSE-NEXT: LBB21_13: ## %cond.load16
2718 ; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm1
2719 ; SSE-NEXT: testb $-128, %al
2720 ; SSE-NEXT: je LBB21_16
2721 ; SSE-NEXT: LBB21_15: ## %cond.load19
2722 ; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm1
2723 ; SSE-NEXT: movdqa %xmm1, %xmm0
2726 ; AVX1OR2-LABEL: load_v8i16_v8i16:
2727 ; AVX1OR2: ## %bb.0:
2728 ; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
2729 ; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
2730 ; AVX1OR2-NEXT: testb $1, %al
2731 ; AVX1OR2-NEXT: jne LBB21_1
2732 ; AVX1OR2-NEXT: ## %bb.2: ## %else
2733 ; AVX1OR2-NEXT: testb $2, %al
2734 ; AVX1OR2-NEXT: jne LBB21_3
2735 ; AVX1OR2-NEXT: LBB21_4: ## %else2
2736 ; AVX1OR2-NEXT: testb $4, %al
2737 ; AVX1OR2-NEXT: jne LBB21_5
2738 ; AVX1OR2-NEXT: LBB21_6: ## %else5
2739 ; AVX1OR2-NEXT: testb $8, %al
2740 ; AVX1OR2-NEXT: jne LBB21_7
2741 ; AVX1OR2-NEXT: LBB21_8: ## %else8
2742 ; AVX1OR2-NEXT: testb $16, %al
2743 ; AVX1OR2-NEXT: jne LBB21_9
2744 ; AVX1OR2-NEXT: LBB21_10: ## %else11
2745 ; AVX1OR2-NEXT: testb $32, %al
2746 ; AVX1OR2-NEXT: jne LBB21_11
2747 ; AVX1OR2-NEXT: LBB21_12: ## %else14
2748 ; AVX1OR2-NEXT: testb $64, %al
2749 ; AVX1OR2-NEXT: jne LBB21_13
2750 ; AVX1OR2-NEXT: LBB21_14: ## %else17
2751 ; AVX1OR2-NEXT: testb $-128, %al
2752 ; AVX1OR2-NEXT: jne LBB21_15
2753 ; AVX1OR2-NEXT: LBB21_16: ## %else20
2754 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
2755 ; AVX1OR2-NEXT: retq
2756 ; AVX1OR2-NEXT: LBB21_1: ## %cond.load
2757 ; AVX1OR2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
2758 ; AVX1OR2-NEXT: testb $2, %al
2759 ; AVX1OR2-NEXT: je LBB21_4
2760 ; AVX1OR2-NEXT: LBB21_3: ## %cond.load1
2761 ; AVX1OR2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2762 ; AVX1OR2-NEXT: testb $4, %al
2763 ; AVX1OR2-NEXT: je LBB21_6
2764 ; AVX1OR2-NEXT: LBB21_5: ## %cond.load4
2765 ; AVX1OR2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2766 ; AVX1OR2-NEXT: testb $8, %al
2767 ; AVX1OR2-NEXT: je LBB21_8
2768 ; AVX1OR2-NEXT: LBB21_7: ## %cond.load7
2769 ; AVX1OR2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2770 ; AVX1OR2-NEXT: testb $16, %al
2771 ; AVX1OR2-NEXT: je LBB21_10
2772 ; AVX1OR2-NEXT: LBB21_9: ## %cond.load10
2773 ; AVX1OR2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2774 ; AVX1OR2-NEXT: testb $32, %al
2775 ; AVX1OR2-NEXT: je LBB21_12
2776 ; AVX1OR2-NEXT: LBB21_11: ## %cond.load13
2777 ; AVX1OR2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2778 ; AVX1OR2-NEXT: testb $64, %al
2779 ; AVX1OR2-NEXT: je LBB21_14
2780 ; AVX1OR2-NEXT: LBB21_13: ## %cond.load16
2781 ; AVX1OR2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2782 ; AVX1OR2-NEXT: testb $-128, %al
2783 ; AVX1OR2-NEXT: je LBB21_16
2784 ; AVX1OR2-NEXT: LBB21_15: ## %cond.load19
2785 ; AVX1OR2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
2786 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
2787 ; AVX1OR2-NEXT: retq
2789 ; AVX512F-LABEL: load_v8i16_v8i16:
2790 ; AVX512F: ## %bb.0:
2791 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
2792 ; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
2793 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
2794 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
2795 ; AVX512F-NEXT: kmovw %k0, %eax
2796 ; AVX512F-NEXT: testb $1, %al
2797 ; AVX512F-NEXT: jne LBB21_1
2798 ; AVX512F-NEXT: ## %bb.2: ## %else
2799 ; AVX512F-NEXT: testb $2, %al
2800 ; AVX512F-NEXT: jne LBB21_3
2801 ; AVX512F-NEXT: LBB21_4: ## %else2
2802 ; AVX512F-NEXT: testb $4, %al
2803 ; AVX512F-NEXT: jne LBB21_5
2804 ; AVX512F-NEXT: LBB21_6: ## %else5
2805 ; AVX512F-NEXT: testb $8, %al
2806 ; AVX512F-NEXT: jne LBB21_7
2807 ; AVX512F-NEXT: LBB21_8: ## %else8
2808 ; AVX512F-NEXT: testb $16, %al
2809 ; AVX512F-NEXT: jne LBB21_9
2810 ; AVX512F-NEXT: LBB21_10: ## %else11
2811 ; AVX512F-NEXT: testb $32, %al
2812 ; AVX512F-NEXT: jne LBB21_11
2813 ; AVX512F-NEXT: LBB21_12: ## %else14
2814 ; AVX512F-NEXT: testb $64, %al
2815 ; AVX512F-NEXT: jne LBB21_13
2816 ; AVX512F-NEXT: LBB21_14: ## %else17
2817 ; AVX512F-NEXT: testb $-128, %al
2818 ; AVX512F-NEXT: jne LBB21_15
2819 ; AVX512F-NEXT: LBB21_16: ## %else20
2820 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
2821 ; AVX512F-NEXT: vzeroupper
2822 ; AVX512F-NEXT: retq
2823 ; AVX512F-NEXT: LBB21_1: ## %cond.load
2824 ; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
2825 ; AVX512F-NEXT: testb $2, %al
2826 ; AVX512F-NEXT: je LBB21_4
2827 ; AVX512F-NEXT: LBB21_3: ## %cond.load1
2828 ; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2829 ; AVX512F-NEXT: testb $4, %al
2830 ; AVX512F-NEXT: je LBB21_6
2831 ; AVX512F-NEXT: LBB21_5: ## %cond.load4
2832 ; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2833 ; AVX512F-NEXT: testb $8, %al
2834 ; AVX512F-NEXT: je LBB21_8
2835 ; AVX512F-NEXT: LBB21_7: ## %cond.load7
2836 ; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2837 ; AVX512F-NEXT: testb $16, %al
2838 ; AVX512F-NEXT: je LBB21_10
2839 ; AVX512F-NEXT: LBB21_9: ## %cond.load10
2840 ; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2841 ; AVX512F-NEXT: testb $32, %al
2842 ; AVX512F-NEXT: je LBB21_12
2843 ; AVX512F-NEXT: LBB21_11: ## %cond.load13
2844 ; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2845 ; AVX512F-NEXT: testb $64, %al
2846 ; AVX512F-NEXT: je LBB21_14
2847 ; AVX512F-NEXT: LBB21_13: ## %cond.load16
2848 ; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2849 ; AVX512F-NEXT: testb $-128, %al
2850 ; AVX512F-NEXT: je LBB21_16
2851 ; AVX512F-NEXT: LBB21_15: ## %cond.load19
2852 ; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
2853 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
2854 ; AVX512F-NEXT: vzeroupper
2855 ; AVX512F-NEXT: retq
2857 ; AVX512VLDQ-LABEL: load_v8i16_v8i16:
2858 ; AVX512VLDQ: ## %bb.0:
2859 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
2860 ; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
2861 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
2862 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0
2863 ; AVX512VLDQ-NEXT: kmovw %k0, %eax
2864 ; AVX512VLDQ-NEXT: testb $1, %al
2865 ; AVX512VLDQ-NEXT: jne LBB21_1
2866 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
2867 ; AVX512VLDQ-NEXT: testb $2, %al
2868 ; AVX512VLDQ-NEXT: jne LBB21_3
2869 ; AVX512VLDQ-NEXT: LBB21_4: ## %else2
2870 ; AVX512VLDQ-NEXT: testb $4, %al
2871 ; AVX512VLDQ-NEXT: jne LBB21_5
2872 ; AVX512VLDQ-NEXT: LBB21_6: ## %else5
2873 ; AVX512VLDQ-NEXT: testb $8, %al
2874 ; AVX512VLDQ-NEXT: jne LBB21_7
2875 ; AVX512VLDQ-NEXT: LBB21_8: ## %else8
2876 ; AVX512VLDQ-NEXT: testb $16, %al
2877 ; AVX512VLDQ-NEXT: jne LBB21_9
2878 ; AVX512VLDQ-NEXT: LBB21_10: ## %else11
2879 ; AVX512VLDQ-NEXT: testb $32, %al
2880 ; AVX512VLDQ-NEXT: jne LBB21_11
2881 ; AVX512VLDQ-NEXT: LBB21_12: ## %else14
2882 ; AVX512VLDQ-NEXT: testb $64, %al
2883 ; AVX512VLDQ-NEXT: jne LBB21_13
2884 ; AVX512VLDQ-NEXT: LBB21_14: ## %else17
2885 ; AVX512VLDQ-NEXT: testb $-128, %al
2886 ; AVX512VLDQ-NEXT: jne LBB21_15
2887 ; AVX512VLDQ-NEXT: LBB21_16: ## %else20
2888 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
2889 ; AVX512VLDQ-NEXT: vzeroupper
2890 ; AVX512VLDQ-NEXT: retq
2891 ; AVX512VLDQ-NEXT: LBB21_1: ## %cond.load
2892 ; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
2893 ; AVX512VLDQ-NEXT: testb $2, %al
2894 ; AVX512VLDQ-NEXT: je LBB21_4
2895 ; AVX512VLDQ-NEXT: LBB21_3: ## %cond.load1
2896 ; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2897 ; AVX512VLDQ-NEXT: testb $4, %al
2898 ; AVX512VLDQ-NEXT: je LBB21_6
2899 ; AVX512VLDQ-NEXT: LBB21_5: ## %cond.load4
2900 ; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2901 ; AVX512VLDQ-NEXT: testb $8, %al
2902 ; AVX512VLDQ-NEXT: je LBB21_8
2903 ; AVX512VLDQ-NEXT: LBB21_7: ## %cond.load7
2904 ; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2905 ; AVX512VLDQ-NEXT: testb $16, %al
2906 ; AVX512VLDQ-NEXT: je LBB21_10
2907 ; AVX512VLDQ-NEXT: LBB21_9: ## %cond.load10
2908 ; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2909 ; AVX512VLDQ-NEXT: testb $32, %al
2910 ; AVX512VLDQ-NEXT: je LBB21_12
2911 ; AVX512VLDQ-NEXT: LBB21_11: ## %cond.load13
2912 ; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2913 ; AVX512VLDQ-NEXT: testb $64, %al
2914 ; AVX512VLDQ-NEXT: je LBB21_14
2915 ; AVX512VLDQ-NEXT: LBB21_13: ## %cond.load16
2916 ; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2917 ; AVX512VLDQ-NEXT: testb $-128, %al
2918 ; AVX512VLDQ-NEXT: je LBB21_16
2919 ; AVX512VLDQ-NEXT: LBB21_15: ## %cond.load19
2920 ; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
2921 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
2922 ; AVX512VLDQ-NEXT: vzeroupper
2923 ; AVX512VLDQ-NEXT: retq
2925 ; AVX512VLBW-LABEL: load_v8i16_v8i16:
2926 ; AVX512VLBW: ## %bb.0:
2927 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
2928 ; AVX512VLBW-NEXT: vpblendmw (%rdi), %xmm1, %xmm0 {%k1}
2929 ; AVX512VLBW-NEXT: retq
2930 %mask = icmp slt <8 x i16> %trigger, zeroinitializer
2931 %res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1> %mask, <8 x i16> %dst)
2935 define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <16 x i16> %dst) {
2936 ; SSE-LABEL: load_v16i16_v16i16:
2938 ; SSE-NEXT: packsswb %xmm1, %xmm0
2939 ; SSE-NEXT: pmovmskb %xmm0, %eax
2940 ; SSE-NEXT: testb $1, %al
2941 ; SSE-NEXT: jne LBB22_1
2942 ; SSE-NEXT: ## %bb.2: ## %else
2943 ; SSE-NEXT: testb $2, %al
2944 ; SSE-NEXT: jne LBB22_3
2945 ; SSE-NEXT: LBB22_4: ## %else2
2946 ; SSE-NEXT: testb $4, %al
2947 ; SSE-NEXT: jne LBB22_5
2948 ; SSE-NEXT: LBB22_6: ## %else5
2949 ; SSE-NEXT: testb $8, %al
2950 ; SSE-NEXT: jne LBB22_7
2951 ; SSE-NEXT: LBB22_8: ## %else8
2952 ; SSE-NEXT: testb $16, %al
2953 ; SSE-NEXT: jne LBB22_9
2954 ; SSE-NEXT: LBB22_10: ## %else11
2955 ; SSE-NEXT: testb $32, %al
2956 ; SSE-NEXT: jne LBB22_11
2957 ; SSE-NEXT: LBB22_12: ## %else14
2958 ; SSE-NEXT: testb $64, %al
2959 ; SSE-NEXT: jne LBB22_13
2960 ; SSE-NEXT: LBB22_14: ## %else17
2961 ; SSE-NEXT: testb $-128, %al
2962 ; SSE-NEXT: jne LBB22_15
2963 ; SSE-NEXT: LBB22_16: ## %else20
2964 ; SSE-NEXT: testl $256, %eax ## imm = 0x100
2965 ; SSE-NEXT: jne LBB22_17
2966 ; SSE-NEXT: LBB22_18: ## %else23
2967 ; SSE-NEXT: testl $512, %eax ## imm = 0x200
2968 ; SSE-NEXT: jne LBB22_19
2969 ; SSE-NEXT: LBB22_20: ## %else26
2970 ; SSE-NEXT: testl $1024, %eax ## imm = 0x400
2971 ; SSE-NEXT: jne LBB22_21
2972 ; SSE-NEXT: LBB22_22: ## %else29
2973 ; SSE-NEXT: testl $2048, %eax ## imm = 0x800
2974 ; SSE-NEXT: jne LBB22_23
2975 ; SSE-NEXT: LBB22_24: ## %else32
2976 ; SSE-NEXT: testl $4096, %eax ## imm = 0x1000
2977 ; SSE-NEXT: jne LBB22_25
2978 ; SSE-NEXT: LBB22_26: ## %else35
2979 ; SSE-NEXT: testl $8192, %eax ## imm = 0x2000
2980 ; SSE-NEXT: jne LBB22_27
2981 ; SSE-NEXT: LBB22_28: ## %else38
2982 ; SSE-NEXT: testl $16384, %eax ## imm = 0x4000
2983 ; SSE-NEXT: jne LBB22_29
2984 ; SSE-NEXT: LBB22_30: ## %else41
2985 ; SSE-NEXT: testl $32768, %eax ## imm = 0x8000
2986 ; SSE-NEXT: je LBB22_32
2987 ; SSE-NEXT: LBB22_31: ## %cond.load43
2988 ; SSE-NEXT: pinsrw $7, 30(%rdi), %xmm3
2989 ; SSE-NEXT: LBB22_32: ## %else44
2990 ; SSE-NEXT: movdqa %xmm2, %xmm0
2991 ; SSE-NEXT: movdqa %xmm3, %xmm1
2993 ; SSE-NEXT: LBB22_1: ## %cond.load
2994 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm2
2995 ; SSE-NEXT: testb $2, %al
2996 ; SSE-NEXT: je LBB22_4
2997 ; SSE-NEXT: LBB22_3: ## %cond.load1
2998 ; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm2
2999 ; SSE-NEXT: testb $4, %al
3000 ; SSE-NEXT: je LBB22_6
3001 ; SSE-NEXT: LBB22_5: ## %cond.load4
3002 ; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm2
3003 ; SSE-NEXT: testb $8, %al
3004 ; SSE-NEXT: je LBB22_8
3005 ; SSE-NEXT: LBB22_7: ## %cond.load7
3006 ; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm2
3007 ; SSE-NEXT: testb $16, %al
3008 ; SSE-NEXT: je LBB22_10
3009 ; SSE-NEXT: LBB22_9: ## %cond.load10
3010 ; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm2
3011 ; SSE-NEXT: testb $32, %al
3012 ; SSE-NEXT: je LBB22_12
3013 ; SSE-NEXT: LBB22_11: ## %cond.load13
3014 ; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm2
3015 ; SSE-NEXT: testb $64, %al
3016 ; SSE-NEXT: je LBB22_14
3017 ; SSE-NEXT: LBB22_13: ## %cond.load16
3018 ; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm2
3019 ; SSE-NEXT: testb $-128, %al
3020 ; SSE-NEXT: je LBB22_16
3021 ; SSE-NEXT: LBB22_15: ## %cond.load19
3022 ; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm2
3023 ; SSE-NEXT: testl $256, %eax ## imm = 0x100
3024 ; SSE-NEXT: je LBB22_18
3025 ; SSE-NEXT: LBB22_17: ## %cond.load22
3026 ; SSE-NEXT: pinsrw $0, 16(%rdi), %xmm3
3027 ; SSE-NEXT: testl $512, %eax ## imm = 0x200
3028 ; SSE-NEXT: je LBB22_20
3029 ; SSE-NEXT: LBB22_19: ## %cond.load25
3030 ; SSE-NEXT: pinsrw $1, 18(%rdi), %xmm3
3031 ; SSE-NEXT: testl $1024, %eax ## imm = 0x400
3032 ; SSE-NEXT: je LBB22_22
3033 ; SSE-NEXT: LBB22_21: ## %cond.load28
3034 ; SSE-NEXT: pinsrw $2, 20(%rdi), %xmm3
3035 ; SSE-NEXT: testl $2048, %eax ## imm = 0x800
3036 ; SSE-NEXT: je LBB22_24
3037 ; SSE-NEXT: LBB22_23: ## %cond.load31
3038 ; SSE-NEXT: pinsrw $3, 22(%rdi), %xmm3
3039 ; SSE-NEXT: testl $4096, %eax ## imm = 0x1000
3040 ; SSE-NEXT: je LBB22_26
3041 ; SSE-NEXT: LBB22_25: ## %cond.load34
3042 ; SSE-NEXT: pinsrw $4, 24(%rdi), %xmm3
3043 ; SSE-NEXT: testl $8192, %eax ## imm = 0x2000
3044 ; SSE-NEXT: je LBB22_28
3045 ; SSE-NEXT: LBB22_27: ## %cond.load37
3046 ; SSE-NEXT: pinsrw $5, 26(%rdi), %xmm3
3047 ; SSE-NEXT: testl $16384, %eax ## imm = 0x4000
3048 ; SSE-NEXT: je LBB22_30
3049 ; SSE-NEXT: LBB22_29: ## %cond.load40
3050 ; SSE-NEXT: pinsrw $6, 28(%rdi), %xmm3
3051 ; SSE-NEXT: testl $32768, %eax ## imm = 0x8000
3052 ; SSE-NEXT: jne LBB22_31
3053 ; SSE-NEXT: jmp LBB22_32
3055 ; AVX1-LABEL: load_v16i16_v16i16:
3057 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3058 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
3059 ; AVX1-NEXT: vpmovmskb %xmm0, %eax
3060 ; AVX1-NEXT: testb $1, %al
3061 ; AVX1-NEXT: jne LBB22_1
3062 ; AVX1-NEXT: ## %bb.2: ## %else
3063 ; AVX1-NEXT: testb $2, %al
3064 ; AVX1-NEXT: jne LBB22_3
3065 ; AVX1-NEXT: LBB22_4: ## %else2
3066 ; AVX1-NEXT: testb $4, %al
3067 ; AVX1-NEXT: jne LBB22_5
3068 ; AVX1-NEXT: LBB22_6: ## %else5
3069 ; AVX1-NEXT: testb $8, %al
3070 ; AVX1-NEXT: jne LBB22_7
3071 ; AVX1-NEXT: LBB22_8: ## %else8
3072 ; AVX1-NEXT: testb $16, %al
3073 ; AVX1-NEXT: jne LBB22_9
3074 ; AVX1-NEXT: LBB22_10: ## %else11
3075 ; AVX1-NEXT: testb $32, %al
3076 ; AVX1-NEXT: jne LBB22_11
3077 ; AVX1-NEXT: LBB22_12: ## %else14
3078 ; AVX1-NEXT: testb $64, %al
3079 ; AVX1-NEXT: jne LBB22_13
3080 ; AVX1-NEXT: LBB22_14: ## %else17
3081 ; AVX1-NEXT: testb $-128, %al
3082 ; AVX1-NEXT: jne LBB22_15
3083 ; AVX1-NEXT: LBB22_16: ## %else20
3084 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
3085 ; AVX1-NEXT: jne LBB22_17
3086 ; AVX1-NEXT: LBB22_18: ## %else23
3087 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
3088 ; AVX1-NEXT: jne LBB22_19
3089 ; AVX1-NEXT: LBB22_20: ## %else26
3090 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
3091 ; AVX1-NEXT: jne LBB22_21
3092 ; AVX1-NEXT: LBB22_22: ## %else29
3093 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
3094 ; AVX1-NEXT: jne LBB22_23
3095 ; AVX1-NEXT: LBB22_24: ## %else32
3096 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
3097 ; AVX1-NEXT: jne LBB22_25
3098 ; AVX1-NEXT: LBB22_26: ## %else35
3099 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
3100 ; AVX1-NEXT: jne LBB22_27
3101 ; AVX1-NEXT: LBB22_28: ## %else38
3102 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
3103 ; AVX1-NEXT: jne LBB22_29
3104 ; AVX1-NEXT: LBB22_30: ## %else41
3105 ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
3106 ; AVX1-NEXT: jne LBB22_31
3107 ; AVX1-NEXT: LBB22_32: ## %else44
3108 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
3110 ; AVX1-NEXT: LBB22_1: ## %cond.load
3111 ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3112 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3113 ; AVX1-NEXT: testb $2, %al
3114 ; AVX1-NEXT: je LBB22_4
3115 ; AVX1-NEXT: LBB22_3: ## %cond.load1
3116 ; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3117 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3118 ; AVX1-NEXT: testb $4, %al
3119 ; AVX1-NEXT: je LBB22_6
3120 ; AVX1-NEXT: LBB22_5: ## %cond.load4
3121 ; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3122 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3123 ; AVX1-NEXT: testb $8, %al
3124 ; AVX1-NEXT: je LBB22_8
3125 ; AVX1-NEXT: LBB22_7: ## %cond.load7
3126 ; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3127 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3128 ; AVX1-NEXT: testb $16, %al
3129 ; AVX1-NEXT: je LBB22_10
3130 ; AVX1-NEXT: LBB22_9: ## %cond.load10
3131 ; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3132 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3133 ; AVX1-NEXT: testb $32, %al
3134 ; AVX1-NEXT: je LBB22_12
3135 ; AVX1-NEXT: LBB22_11: ## %cond.load13
3136 ; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3137 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3138 ; AVX1-NEXT: testb $64, %al
3139 ; AVX1-NEXT: je LBB22_14
3140 ; AVX1-NEXT: LBB22_13: ## %cond.load16
3141 ; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3142 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3143 ; AVX1-NEXT: testb $-128, %al
3144 ; AVX1-NEXT: je LBB22_16
3145 ; AVX1-NEXT: LBB22_15: ## %cond.load19
3146 ; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3147 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3148 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
3149 ; AVX1-NEXT: je LBB22_18
3150 ; AVX1-NEXT: LBB22_17: ## %cond.load22
3151 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3152 ; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
3153 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3154 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
3155 ; AVX1-NEXT: je LBB22_20
3156 ; AVX1-NEXT: LBB22_19: ## %cond.load25
3157 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3158 ; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
3159 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3160 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
3161 ; AVX1-NEXT: je LBB22_22
3162 ; AVX1-NEXT: LBB22_21: ## %cond.load28
3163 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3164 ; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
3165 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3166 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
3167 ; AVX1-NEXT: je LBB22_24
3168 ; AVX1-NEXT: LBB22_23: ## %cond.load31
3169 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3170 ; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
3171 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3172 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
3173 ; AVX1-NEXT: je LBB22_26
3174 ; AVX1-NEXT: LBB22_25: ## %cond.load34
3175 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3176 ; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
3177 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3178 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
3179 ; AVX1-NEXT: je LBB22_28
3180 ; AVX1-NEXT: LBB22_27: ## %cond.load37
3181 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3182 ; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
3183 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3184 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
3185 ; AVX1-NEXT: je LBB22_30
3186 ; AVX1-NEXT: LBB22_29: ## %cond.load40
3187 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3188 ; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
3189 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3190 ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
3191 ; AVX1-NEXT: je LBB22_32
3192 ; AVX1-NEXT: LBB22_31: ## %cond.load43
3193 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3194 ; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
3195 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3196 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
3199 ; AVX2-LABEL: load_v16i16_v16i16:
3201 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3202 ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
3203 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
3204 ; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
3205 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
3206 ; AVX2-NEXT: testb $1, %al
3207 ; AVX2-NEXT: jne LBB22_1
3208 ; AVX2-NEXT: ## %bb.2: ## %else
3209 ; AVX2-NEXT: testb $2, %al
3210 ; AVX2-NEXT: jne LBB22_3
3211 ; AVX2-NEXT: LBB22_4: ## %else2
3212 ; AVX2-NEXT: testb $4, %al
3213 ; AVX2-NEXT: jne LBB22_5
3214 ; AVX2-NEXT: LBB22_6: ## %else5
3215 ; AVX2-NEXT: testb $8, %al
3216 ; AVX2-NEXT: jne LBB22_7
3217 ; AVX2-NEXT: LBB22_8: ## %else8
3218 ; AVX2-NEXT: testb $16, %al
3219 ; AVX2-NEXT: jne LBB22_9
3220 ; AVX2-NEXT: LBB22_10: ## %else11
3221 ; AVX2-NEXT: testb $32, %al
3222 ; AVX2-NEXT: jne LBB22_11
3223 ; AVX2-NEXT: LBB22_12: ## %else14
3224 ; AVX2-NEXT: testb $64, %al
3225 ; AVX2-NEXT: jne LBB22_13
3226 ; AVX2-NEXT: LBB22_14: ## %else17
3227 ; AVX2-NEXT: testb $-128, %al
3228 ; AVX2-NEXT: jne LBB22_15
3229 ; AVX2-NEXT: LBB22_16: ## %else20
3230 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
3231 ; AVX2-NEXT: jne LBB22_17
3232 ; AVX2-NEXT: LBB22_18: ## %else23
3233 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
3234 ; AVX2-NEXT: jne LBB22_19
3235 ; AVX2-NEXT: LBB22_20: ## %else26
3236 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
3237 ; AVX2-NEXT: jne LBB22_21
3238 ; AVX2-NEXT: LBB22_22: ## %else29
3239 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
3240 ; AVX2-NEXT: jne LBB22_23
3241 ; AVX2-NEXT: LBB22_24: ## %else32
3242 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
3243 ; AVX2-NEXT: jne LBB22_25
3244 ; AVX2-NEXT: LBB22_26: ## %else35
3245 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
3246 ; AVX2-NEXT: jne LBB22_27
3247 ; AVX2-NEXT: LBB22_28: ## %else38
3248 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
3249 ; AVX2-NEXT: jne LBB22_29
3250 ; AVX2-NEXT: LBB22_30: ## %else41
3251 ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
3252 ; AVX2-NEXT: jne LBB22_31
3253 ; AVX2-NEXT: LBB22_32: ## %else44
3254 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
3256 ; AVX2-NEXT: LBB22_1: ## %cond.load
3257 ; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3258 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3259 ; AVX2-NEXT: testb $2, %al
3260 ; AVX2-NEXT: je LBB22_4
3261 ; AVX2-NEXT: LBB22_3: ## %cond.load1
3262 ; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3263 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3264 ; AVX2-NEXT: testb $4, %al
3265 ; AVX2-NEXT: je LBB22_6
3266 ; AVX2-NEXT: LBB22_5: ## %cond.load4
3267 ; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3268 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3269 ; AVX2-NEXT: testb $8, %al
3270 ; AVX2-NEXT: je LBB22_8
3271 ; AVX2-NEXT: LBB22_7: ## %cond.load7
3272 ; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3273 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3274 ; AVX2-NEXT: testb $16, %al
3275 ; AVX2-NEXT: je LBB22_10
3276 ; AVX2-NEXT: LBB22_9: ## %cond.load10
3277 ; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3278 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3279 ; AVX2-NEXT: testb $32, %al
3280 ; AVX2-NEXT: je LBB22_12
3281 ; AVX2-NEXT: LBB22_11: ## %cond.load13
3282 ; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3283 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3284 ; AVX2-NEXT: testb $64, %al
3285 ; AVX2-NEXT: je LBB22_14
3286 ; AVX2-NEXT: LBB22_13: ## %cond.load16
3287 ; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3288 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3289 ; AVX2-NEXT: testb $-128, %al
3290 ; AVX2-NEXT: je LBB22_16
3291 ; AVX2-NEXT: LBB22_15: ## %cond.load19
3292 ; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3293 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3294 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
3295 ; AVX2-NEXT: je LBB22_18
3296 ; AVX2-NEXT: LBB22_17: ## %cond.load22
3297 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3298 ; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
3299 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3300 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
3301 ; AVX2-NEXT: je LBB22_20
3302 ; AVX2-NEXT: LBB22_19: ## %cond.load25
3303 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3304 ; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
3305 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3306 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
3307 ; AVX2-NEXT: je LBB22_22
3308 ; AVX2-NEXT: LBB22_21: ## %cond.load28
3309 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3310 ; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
3311 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3312 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
3313 ; AVX2-NEXT: je LBB22_24
3314 ; AVX2-NEXT: LBB22_23: ## %cond.load31
3315 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3316 ; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
3317 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3318 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
3319 ; AVX2-NEXT: je LBB22_26
3320 ; AVX2-NEXT: LBB22_25: ## %cond.load34
3321 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3322 ; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
3323 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3324 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
3325 ; AVX2-NEXT: je LBB22_28
3326 ; AVX2-NEXT: LBB22_27: ## %cond.load37
3327 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3328 ; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
3329 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3330 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
3331 ; AVX2-NEXT: je LBB22_30
3332 ; AVX2-NEXT: LBB22_29: ## %cond.load40
3333 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3334 ; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
3335 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3336 ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
3337 ; AVX2-NEXT: je LBB22_32
3338 ; AVX2-NEXT: LBB22_31: ## %cond.load43
3339 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
3340 ; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
3341 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3342 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
3345 ; AVX512F-LABEL: load_v16i16_v16i16:
3346 ; AVX512F: ## %bb.0:
3347 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
3348 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
3349 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
3350 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
3351 ; AVX512F-NEXT: kmovw %k0, %eax
3352 ; AVX512F-NEXT: testb $1, %al
3353 ; AVX512F-NEXT: jne LBB22_1
3354 ; AVX512F-NEXT: ## %bb.2: ## %else
3355 ; AVX512F-NEXT: testb $2, %al
3356 ; AVX512F-NEXT: jne LBB22_3
3357 ; AVX512F-NEXT: LBB22_4: ## %else2
3358 ; AVX512F-NEXT: testb $4, %al
3359 ; AVX512F-NEXT: jne LBB22_5
3360 ; AVX512F-NEXT: LBB22_6: ## %else5
3361 ; AVX512F-NEXT: testb $8, %al
3362 ; AVX512F-NEXT: jne LBB22_7
3363 ; AVX512F-NEXT: LBB22_8: ## %else8
3364 ; AVX512F-NEXT: testb $16, %al
3365 ; AVX512F-NEXT: jne LBB22_9
3366 ; AVX512F-NEXT: LBB22_10: ## %else11
3367 ; AVX512F-NEXT: testb $32, %al
3368 ; AVX512F-NEXT: jne LBB22_11
3369 ; AVX512F-NEXT: LBB22_12: ## %else14
3370 ; AVX512F-NEXT: testb $64, %al
3371 ; AVX512F-NEXT: jne LBB22_13
3372 ; AVX512F-NEXT: LBB22_14: ## %else17
3373 ; AVX512F-NEXT: testb $-128, %al
3374 ; AVX512F-NEXT: jne LBB22_15
3375 ; AVX512F-NEXT: LBB22_16: ## %else20
3376 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
3377 ; AVX512F-NEXT: jne LBB22_17
3378 ; AVX512F-NEXT: LBB22_18: ## %else23
3379 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
3380 ; AVX512F-NEXT: jne LBB22_19
3381 ; AVX512F-NEXT: LBB22_20: ## %else26
3382 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
3383 ; AVX512F-NEXT: jne LBB22_21
3384 ; AVX512F-NEXT: LBB22_22: ## %else29
3385 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
3386 ; AVX512F-NEXT: jne LBB22_23
3387 ; AVX512F-NEXT: LBB22_24: ## %else32
3388 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
3389 ; AVX512F-NEXT: jne LBB22_25
3390 ; AVX512F-NEXT: LBB22_26: ## %else35
3391 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
3392 ; AVX512F-NEXT: jne LBB22_27
3393 ; AVX512F-NEXT: LBB22_28: ## %else38
3394 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
3395 ; AVX512F-NEXT: jne LBB22_29
3396 ; AVX512F-NEXT: LBB22_30: ## %else41
3397 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
3398 ; AVX512F-NEXT: jne LBB22_31
3399 ; AVX512F-NEXT: LBB22_32: ## %else44
3400 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
3401 ; AVX512F-NEXT: retq
3402 ; AVX512F-NEXT: LBB22_1: ## %cond.load
3403 ; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3404 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3405 ; AVX512F-NEXT: testb $2, %al
3406 ; AVX512F-NEXT: je LBB22_4
3407 ; AVX512F-NEXT: LBB22_3: ## %cond.load1
3408 ; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3409 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3410 ; AVX512F-NEXT: testb $4, %al
3411 ; AVX512F-NEXT: je LBB22_6
3412 ; AVX512F-NEXT: LBB22_5: ## %cond.load4
3413 ; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3414 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3415 ; AVX512F-NEXT: testb $8, %al
3416 ; AVX512F-NEXT: je LBB22_8
3417 ; AVX512F-NEXT: LBB22_7: ## %cond.load7
3418 ; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3419 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3420 ; AVX512F-NEXT: testb $16, %al
3421 ; AVX512F-NEXT: je LBB22_10
3422 ; AVX512F-NEXT: LBB22_9: ## %cond.load10
3423 ; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3424 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3425 ; AVX512F-NEXT: testb $32, %al
3426 ; AVX512F-NEXT: je LBB22_12
3427 ; AVX512F-NEXT: LBB22_11: ## %cond.load13
3428 ; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3429 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3430 ; AVX512F-NEXT: testb $64, %al
3431 ; AVX512F-NEXT: je LBB22_14
3432 ; AVX512F-NEXT: LBB22_13: ## %cond.load16
3433 ; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3434 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3435 ; AVX512F-NEXT: testb $-128, %al
3436 ; AVX512F-NEXT: je LBB22_16
3437 ; AVX512F-NEXT: LBB22_15: ## %cond.load19
3438 ; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3439 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3440 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
3441 ; AVX512F-NEXT: je LBB22_18
3442 ; AVX512F-NEXT: LBB22_17: ## %cond.load22
3443 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3444 ; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
3445 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3446 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
3447 ; AVX512F-NEXT: je LBB22_20
3448 ; AVX512F-NEXT: LBB22_19: ## %cond.load25
3449 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3450 ; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
3451 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3452 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
3453 ; AVX512F-NEXT: je LBB22_22
3454 ; AVX512F-NEXT: LBB22_21: ## %cond.load28
3455 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3456 ; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
3457 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3458 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
3459 ; AVX512F-NEXT: je LBB22_24
3460 ; AVX512F-NEXT: LBB22_23: ## %cond.load31
3461 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3462 ; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
3463 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3464 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
3465 ; AVX512F-NEXT: je LBB22_26
3466 ; AVX512F-NEXT: LBB22_25: ## %cond.load34
3467 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3468 ; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
3469 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3470 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
3471 ; AVX512F-NEXT: je LBB22_28
3472 ; AVX512F-NEXT: LBB22_27: ## %cond.load37
3473 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3474 ; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
3475 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3476 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
3477 ; AVX512F-NEXT: je LBB22_30
3478 ; AVX512F-NEXT: LBB22_29: ## %cond.load40
3479 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3480 ; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
3481 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3482 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
3483 ; AVX512F-NEXT: je LBB22_32
3484 ; AVX512F-NEXT: LBB22_31: ## %cond.load43
3485 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
3486 ; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
3487 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3488 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
3489 ; AVX512F-NEXT: retq
3491 ; AVX512VLDQ-LABEL: load_v16i16_v16i16:
3492 ; AVX512VLDQ: ## %bb.0:
3493 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
3494 ; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
3495 ; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
3496 ; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
3497 ; AVX512VLDQ-NEXT: kmovw %k0, %eax
3498 ; AVX512VLDQ-NEXT: testb $1, %al
3499 ; AVX512VLDQ-NEXT: jne LBB22_1
3500 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
3501 ; AVX512VLDQ-NEXT: testb $2, %al
3502 ; AVX512VLDQ-NEXT: jne LBB22_3
3503 ; AVX512VLDQ-NEXT: LBB22_4: ## %else2
3504 ; AVX512VLDQ-NEXT: testb $4, %al
3505 ; AVX512VLDQ-NEXT: jne LBB22_5
3506 ; AVX512VLDQ-NEXT: LBB22_6: ## %else5
3507 ; AVX512VLDQ-NEXT: testb $8, %al
3508 ; AVX512VLDQ-NEXT: jne LBB22_7
3509 ; AVX512VLDQ-NEXT: LBB22_8: ## %else8
3510 ; AVX512VLDQ-NEXT: testb $16, %al
3511 ; AVX512VLDQ-NEXT: jne LBB22_9
3512 ; AVX512VLDQ-NEXT: LBB22_10: ## %else11
3513 ; AVX512VLDQ-NEXT: testb $32, %al
3514 ; AVX512VLDQ-NEXT: jne LBB22_11
3515 ; AVX512VLDQ-NEXT: LBB22_12: ## %else14
3516 ; AVX512VLDQ-NEXT: testb $64, %al
3517 ; AVX512VLDQ-NEXT: jne LBB22_13
3518 ; AVX512VLDQ-NEXT: LBB22_14: ## %else17
3519 ; AVX512VLDQ-NEXT: testb $-128, %al
3520 ; AVX512VLDQ-NEXT: jne LBB22_15
3521 ; AVX512VLDQ-NEXT: LBB22_16: ## %else20
3522 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
3523 ; AVX512VLDQ-NEXT: jne LBB22_17
3524 ; AVX512VLDQ-NEXT: LBB22_18: ## %else23
3525 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
3526 ; AVX512VLDQ-NEXT: jne LBB22_19
3527 ; AVX512VLDQ-NEXT: LBB22_20: ## %else26
3528 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
3529 ; AVX512VLDQ-NEXT: jne LBB22_21
3530 ; AVX512VLDQ-NEXT: LBB22_22: ## %else29
3531 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
3532 ; AVX512VLDQ-NEXT: jne LBB22_23
3533 ; AVX512VLDQ-NEXT: LBB22_24: ## %else32
3534 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
3535 ; AVX512VLDQ-NEXT: jne LBB22_25
3536 ; AVX512VLDQ-NEXT: LBB22_26: ## %else35
3537 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
3538 ; AVX512VLDQ-NEXT: jne LBB22_27
3539 ; AVX512VLDQ-NEXT: LBB22_28: ## %else38
3540 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
3541 ; AVX512VLDQ-NEXT: jne LBB22_29
3542 ; AVX512VLDQ-NEXT: LBB22_30: ## %else41
3543 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
3544 ; AVX512VLDQ-NEXT: jne LBB22_31
3545 ; AVX512VLDQ-NEXT: LBB22_32: ## %else44
3546 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
3547 ; AVX512VLDQ-NEXT: retq
3548 ; AVX512VLDQ-NEXT: LBB22_1: ## %cond.load
3549 ; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3550 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3551 ; AVX512VLDQ-NEXT: testb $2, %al
3552 ; AVX512VLDQ-NEXT: je LBB22_4
3553 ; AVX512VLDQ-NEXT: LBB22_3: ## %cond.load1
3554 ; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3555 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3556 ; AVX512VLDQ-NEXT: testb $4, %al
3557 ; AVX512VLDQ-NEXT: je LBB22_6
3558 ; AVX512VLDQ-NEXT: LBB22_5: ## %cond.load4
3559 ; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3560 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3561 ; AVX512VLDQ-NEXT: testb $8, %al
3562 ; AVX512VLDQ-NEXT: je LBB22_8
3563 ; AVX512VLDQ-NEXT: LBB22_7: ## %cond.load7
3564 ; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3565 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3566 ; AVX512VLDQ-NEXT: testb $16, %al
3567 ; AVX512VLDQ-NEXT: je LBB22_10
3568 ; AVX512VLDQ-NEXT: LBB22_9: ## %cond.load10
3569 ; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3570 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3571 ; AVX512VLDQ-NEXT: testb $32, %al
3572 ; AVX512VLDQ-NEXT: je LBB22_12
3573 ; AVX512VLDQ-NEXT: LBB22_11: ## %cond.load13
3574 ; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3575 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3576 ; AVX512VLDQ-NEXT: testb $64, %al
3577 ; AVX512VLDQ-NEXT: je LBB22_14
3578 ; AVX512VLDQ-NEXT: LBB22_13: ## %cond.load16
3579 ; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3580 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3581 ; AVX512VLDQ-NEXT: testb $-128, %al
3582 ; AVX512VLDQ-NEXT: je LBB22_16
3583 ; AVX512VLDQ-NEXT: LBB22_15: ## %cond.load19
3584 ; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3585 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3586 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
3587 ; AVX512VLDQ-NEXT: je LBB22_18
3588 ; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22
3589 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3590 ; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
3591 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3592 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
3593 ; AVX512VLDQ-NEXT: je LBB22_20
3594 ; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25
3595 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3596 ; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
3597 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3598 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
3599 ; AVX512VLDQ-NEXT: je LBB22_22
3600 ; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28
3601 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3602 ; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
3603 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3604 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
3605 ; AVX512VLDQ-NEXT: je LBB22_24
3606 ; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31
3607 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3608 ; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
3609 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3610 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
3611 ; AVX512VLDQ-NEXT: je LBB22_26
3612 ; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34
3613 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3614 ; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
3615 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3616 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
3617 ; AVX512VLDQ-NEXT: je LBB22_28
3618 ; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37
3619 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3620 ; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
3621 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3622 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
3623 ; AVX512VLDQ-NEXT: je LBB22_30
3624 ; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40
3625 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3626 ; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
3627 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3628 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
3629 ; AVX512VLDQ-NEXT: je LBB22_32
3630 ; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43
3631 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
3632 ; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
3633 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3634 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
3635 ; AVX512VLDQ-NEXT: retq
3637 ; AVX512VLBW-LABEL: load_v16i16_v16i16:
3638 ; AVX512VLBW: ## %bb.0:
3639 ; AVX512VLBW-NEXT: vpmovw2m %ymm0, %k1
3640 ; AVX512VLBW-NEXT: vpblendmw (%rdi), %ymm1, %ymm0 {%k1}
3641 ; AVX512VLBW-NEXT: retq
3642 %mask = icmp slt <16 x i16> %trigger, zeroinitializer
3643 %res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1> %mask, <16 x i16> %dst)
3651 define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i8> %dst) {
3652 ; SSE2-LABEL: load_v16i8_v16i8:
3654 ; SSE2-NEXT: pmovmskb %xmm0, %eax
3655 ; SSE2-NEXT: testb $1, %al
3656 ; SSE2-NEXT: jne LBB23_1
3657 ; SSE2-NEXT: ## %bb.2: ## %else
3658 ; SSE2-NEXT: testb $2, %al
3659 ; SSE2-NEXT: jne LBB23_3
3660 ; SSE2-NEXT: LBB23_4: ## %else2
3661 ; SSE2-NEXT: testb $4, %al
3662 ; SSE2-NEXT: jne LBB23_5
3663 ; SSE2-NEXT: LBB23_6: ## %else5
3664 ; SSE2-NEXT: testb $8, %al
3665 ; SSE2-NEXT: jne LBB23_7
3666 ; SSE2-NEXT: LBB23_8: ## %else8
3667 ; SSE2-NEXT: testb $16, %al
3668 ; SSE2-NEXT: jne LBB23_9
3669 ; SSE2-NEXT: LBB23_10: ## %else11
3670 ; SSE2-NEXT: testb $32, %al
3671 ; SSE2-NEXT: jne LBB23_11
3672 ; SSE2-NEXT: LBB23_12: ## %else14
3673 ; SSE2-NEXT: testb $64, %al
3674 ; SSE2-NEXT: jne LBB23_13
3675 ; SSE2-NEXT: LBB23_14: ## %else17
3676 ; SSE2-NEXT: testb $-128, %al
3677 ; SSE2-NEXT: jne LBB23_15
3678 ; SSE2-NEXT: LBB23_16: ## %else20
3679 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
3680 ; SSE2-NEXT: jne LBB23_17
3681 ; SSE2-NEXT: LBB23_18: ## %else23
3682 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
3683 ; SSE2-NEXT: jne LBB23_19
3684 ; SSE2-NEXT: LBB23_20: ## %else26
3685 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
3686 ; SSE2-NEXT: jne LBB23_21
3687 ; SSE2-NEXT: LBB23_22: ## %else29
3688 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
3689 ; SSE2-NEXT: jne LBB23_23
3690 ; SSE2-NEXT: LBB23_24: ## %else32
3691 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
3692 ; SSE2-NEXT: jne LBB23_25
3693 ; SSE2-NEXT: LBB23_26: ## %else35
3694 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
3695 ; SSE2-NEXT: jne LBB23_27
3696 ; SSE2-NEXT: LBB23_28: ## %else38
3697 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
3698 ; SSE2-NEXT: jne LBB23_29
3699 ; SSE2-NEXT: LBB23_30: ## %else41
3700 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
3701 ; SSE2-NEXT: jne LBB23_31
3702 ; SSE2-NEXT: LBB23_32: ## %else44
3703 ; SSE2-NEXT: movdqa %xmm1, %xmm0
3705 ; SSE2-NEXT: LBB23_1: ## %cond.load
3706 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3707 ; SSE2-NEXT: pand %xmm0, %xmm1
3708 ; SSE2-NEXT: movzbl (%rdi), %ecx
3709 ; SSE2-NEXT: movd %ecx, %xmm2
3710 ; SSE2-NEXT: pandn %xmm2, %xmm0
3711 ; SSE2-NEXT: por %xmm0, %xmm1
3712 ; SSE2-NEXT: testb $2, %al
3713 ; SSE2-NEXT: je LBB23_4
3714 ; SSE2-NEXT: LBB23_3: ## %cond.load1
3715 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3716 ; SSE2-NEXT: pand %xmm0, %xmm1
3717 ; SSE2-NEXT: movzbl 1(%rdi), %ecx
3718 ; SSE2-NEXT: movd %ecx, %xmm2
3719 ; SSE2-NEXT: psllw $8, %xmm2
3720 ; SSE2-NEXT: pandn %xmm2, %xmm0
3721 ; SSE2-NEXT: por %xmm0, %xmm1
3722 ; SSE2-NEXT: testb $4, %al
3723 ; SSE2-NEXT: je LBB23_6
3724 ; SSE2-NEXT: LBB23_5: ## %cond.load4
3725 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
3726 ; SSE2-NEXT: pand %xmm0, %xmm1
3727 ; SSE2-NEXT: movzbl 2(%rdi), %ecx
3728 ; SSE2-NEXT: movd %ecx, %xmm2
3729 ; SSE2-NEXT: pslld $16, %xmm2
3730 ; SSE2-NEXT: pandn %xmm2, %xmm0
3731 ; SSE2-NEXT: por %xmm0, %xmm1
3732 ; SSE2-NEXT: testb $8, %al
3733 ; SSE2-NEXT: je LBB23_8
3734 ; SSE2-NEXT: LBB23_7: ## %cond.load7
3735 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
3736 ; SSE2-NEXT: pand %xmm0, %xmm1
3737 ; SSE2-NEXT: movzbl 3(%rdi), %ecx
3738 ; SSE2-NEXT: movd %ecx, %xmm2
3739 ; SSE2-NEXT: pslld $24, %xmm2
3740 ; SSE2-NEXT: pandn %xmm2, %xmm0
3741 ; SSE2-NEXT: por %xmm0, %xmm1
3742 ; SSE2-NEXT: testb $16, %al
3743 ; SSE2-NEXT: je LBB23_10
3744 ; SSE2-NEXT: LBB23_9: ## %cond.load10
3745 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
3746 ; SSE2-NEXT: pand %xmm0, %xmm1
3747 ; SSE2-NEXT: movzbl 4(%rdi), %ecx
3748 ; SSE2-NEXT: movd %ecx, %xmm2
3749 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
3750 ; SSE2-NEXT: pandn %xmm2, %xmm0
3751 ; SSE2-NEXT: por %xmm0, %xmm1
3752 ; SSE2-NEXT: testb $32, %al
3753 ; SSE2-NEXT: je LBB23_12
3754 ; SSE2-NEXT: LBB23_11: ## %cond.load13
3755 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
3756 ; SSE2-NEXT: pand %xmm0, %xmm1
3757 ; SSE2-NEXT: movzbl 5(%rdi), %ecx
3758 ; SSE2-NEXT: movd %ecx, %xmm2
3759 ; SSE2-NEXT: psllq $40, %xmm2
3760 ; SSE2-NEXT: pandn %xmm2, %xmm0
3761 ; SSE2-NEXT: por %xmm0, %xmm1
3762 ; SSE2-NEXT: testb $64, %al
3763 ; SSE2-NEXT: je LBB23_14
3764 ; SSE2-NEXT: LBB23_13: ## %cond.load16
3765 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
3766 ; SSE2-NEXT: pand %xmm0, %xmm1
3767 ; SSE2-NEXT: movzbl 6(%rdi), %ecx
3768 ; SSE2-NEXT: movd %ecx, %xmm2
3769 ; SSE2-NEXT: psllq $48, %xmm2
3770 ; SSE2-NEXT: pandn %xmm2, %xmm0
3771 ; SSE2-NEXT: por %xmm0, %xmm1
3772 ; SSE2-NEXT: testb $-128, %al
3773 ; SSE2-NEXT: je LBB23_16
3774 ; SSE2-NEXT: LBB23_15: ## %cond.load19
3775 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
3776 ; SSE2-NEXT: pand %xmm0, %xmm1
3777 ; SSE2-NEXT: movzbl 7(%rdi), %ecx
3778 ; SSE2-NEXT: movd %ecx, %xmm2
3779 ; SSE2-NEXT: psllq $56, %xmm2
3780 ; SSE2-NEXT: pandn %xmm2, %xmm0
3781 ; SSE2-NEXT: por %xmm0, %xmm1
3782 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
3783 ; SSE2-NEXT: je LBB23_18
3784 ; SSE2-NEXT: LBB23_17: ## %cond.load22
3785 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3786 ; SSE2-NEXT: pand %xmm0, %xmm1
3787 ; SSE2-NEXT: movzbl 8(%rdi), %ecx
3788 ; SSE2-NEXT: movd %ecx, %xmm2
3789 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
3790 ; SSE2-NEXT: pandn %xmm2, %xmm0
3791 ; SSE2-NEXT: por %xmm0, %xmm1
3792 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
3793 ; SSE2-NEXT: je LBB23_20
3794 ; SSE2-NEXT: LBB23_19: ## %cond.load25
3795 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
3796 ; SSE2-NEXT: pand %xmm0, %xmm1
3797 ; SSE2-NEXT: movzbl 9(%rdi), %ecx
3798 ; SSE2-NEXT: movd %ecx, %xmm2
3799 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
3800 ; SSE2-NEXT: pandn %xmm2, %xmm0
3801 ; SSE2-NEXT: por %xmm0, %xmm1
3802 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
3803 ; SSE2-NEXT: je LBB23_22
3804 ; SSE2-NEXT: LBB23_21: ## %cond.load28
3805 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
3806 ; SSE2-NEXT: pand %xmm0, %xmm1
3807 ; SSE2-NEXT: movzbl 10(%rdi), %ecx
3808 ; SSE2-NEXT: movd %ecx, %xmm2
3809 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
3810 ; SSE2-NEXT: pandn %xmm2, %xmm0
3811 ; SSE2-NEXT: por %xmm0, %xmm1
3812 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
3813 ; SSE2-NEXT: je LBB23_24
3814 ; SSE2-NEXT: LBB23_23: ## %cond.load31
3815 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
3816 ; SSE2-NEXT: pand %xmm0, %xmm1
3817 ; SSE2-NEXT: movzbl 11(%rdi), %ecx
3818 ; SSE2-NEXT: movd %ecx, %xmm2
3819 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
3820 ; SSE2-NEXT: pandn %xmm2, %xmm0
3821 ; SSE2-NEXT: por %xmm0, %xmm1
3822 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
3823 ; SSE2-NEXT: je LBB23_26
3824 ; SSE2-NEXT: LBB23_25: ## %cond.load34
3825 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3826 ; SSE2-NEXT: pand %xmm0, %xmm1
3827 ; SSE2-NEXT: movzbl 12(%rdi), %ecx
3828 ; SSE2-NEXT: movd %ecx, %xmm2
3829 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
3830 ; SSE2-NEXT: pandn %xmm2, %xmm0
3831 ; SSE2-NEXT: por %xmm0, %xmm1
3832 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
3833 ; SSE2-NEXT: je LBB23_28
3834 ; SSE2-NEXT: LBB23_27: ## %cond.load37
3835 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
3836 ; SSE2-NEXT: pand %xmm0, %xmm1
3837 ; SSE2-NEXT: movzbl 13(%rdi), %ecx
3838 ; SSE2-NEXT: movd %ecx, %xmm2
3839 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3840 ; SSE2-NEXT: pandn %xmm2, %xmm0
3841 ; SSE2-NEXT: por %xmm0, %xmm1
3842 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
3843 ; SSE2-NEXT: je LBB23_30
3844 ; SSE2-NEXT: LBB23_29: ## %cond.load40
3845 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
3846 ; SSE2-NEXT: pand %xmm0, %xmm1
3847 ; SSE2-NEXT: movzbl 14(%rdi), %ecx
3848 ; SSE2-NEXT: movd %ecx, %xmm2
3849 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
3850 ; SSE2-NEXT: pandn %xmm2, %xmm0
3851 ; SSE2-NEXT: por %xmm0, %xmm1
3852 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
3853 ; SSE2-NEXT: je LBB23_32
3854 ; SSE2-NEXT: LBB23_31: ## %cond.load43
3855 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
3856 ; SSE2-NEXT: movzbl 15(%rdi), %eax
3857 ; SSE2-NEXT: movd %eax, %xmm0
3858 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
3859 ; SSE2-NEXT: por %xmm0, %xmm1
3860 ; SSE2-NEXT: movdqa %xmm1, %xmm0
3863 ; SSE42-LABEL: load_v16i8_v16i8:
3865 ; SSE42-NEXT: pmovmskb %xmm0, %eax
3866 ; SSE42-NEXT: testb $1, %al
3867 ; SSE42-NEXT: jne LBB23_1
3868 ; SSE42-NEXT: ## %bb.2: ## %else
3869 ; SSE42-NEXT: testb $2, %al
3870 ; SSE42-NEXT: jne LBB23_3
3871 ; SSE42-NEXT: LBB23_4: ## %else2
3872 ; SSE42-NEXT: testb $4, %al
3873 ; SSE42-NEXT: jne LBB23_5
3874 ; SSE42-NEXT: LBB23_6: ## %else5
3875 ; SSE42-NEXT: testb $8, %al
3876 ; SSE42-NEXT: jne LBB23_7
3877 ; SSE42-NEXT: LBB23_8: ## %else8
3878 ; SSE42-NEXT: testb $16, %al
3879 ; SSE42-NEXT: jne LBB23_9
3880 ; SSE42-NEXT: LBB23_10: ## %else11
3881 ; SSE42-NEXT: testb $32, %al
3882 ; SSE42-NEXT: jne LBB23_11
3883 ; SSE42-NEXT: LBB23_12: ## %else14
3884 ; SSE42-NEXT: testb $64, %al
3885 ; SSE42-NEXT: jne LBB23_13
3886 ; SSE42-NEXT: LBB23_14: ## %else17
3887 ; SSE42-NEXT: testb $-128, %al
3888 ; SSE42-NEXT: jne LBB23_15
3889 ; SSE42-NEXT: LBB23_16: ## %else20
3890 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
3891 ; SSE42-NEXT: jne LBB23_17
3892 ; SSE42-NEXT: LBB23_18: ## %else23
3893 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
3894 ; SSE42-NEXT: jne LBB23_19
3895 ; SSE42-NEXT: LBB23_20: ## %else26
3896 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
3897 ; SSE42-NEXT: jne LBB23_21
3898 ; SSE42-NEXT: LBB23_22: ## %else29
3899 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
3900 ; SSE42-NEXT: jne LBB23_23
3901 ; SSE42-NEXT: LBB23_24: ## %else32
3902 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
3903 ; SSE42-NEXT: jne LBB23_25
3904 ; SSE42-NEXT: LBB23_26: ## %else35
3905 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
3906 ; SSE42-NEXT: jne LBB23_27
3907 ; SSE42-NEXT: LBB23_28: ## %else38
3908 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
3909 ; SSE42-NEXT: jne LBB23_29
3910 ; SSE42-NEXT: LBB23_30: ## %else41
3911 ; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
3912 ; SSE42-NEXT: jne LBB23_31
3913 ; SSE42-NEXT: LBB23_32: ## %else44
3914 ; SSE42-NEXT: movdqa %xmm1, %xmm0
3916 ; SSE42-NEXT: LBB23_1: ## %cond.load
3917 ; SSE42-NEXT: pinsrb $0, (%rdi), %xmm1
3918 ; SSE42-NEXT: testb $2, %al
3919 ; SSE42-NEXT: je LBB23_4
3920 ; SSE42-NEXT: LBB23_3: ## %cond.load1
3921 ; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm1
3922 ; SSE42-NEXT: testb $4, %al
3923 ; SSE42-NEXT: je LBB23_6
3924 ; SSE42-NEXT: LBB23_5: ## %cond.load4
3925 ; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm1
3926 ; SSE42-NEXT: testb $8, %al
3927 ; SSE42-NEXT: je LBB23_8
3928 ; SSE42-NEXT: LBB23_7: ## %cond.load7
3929 ; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm1
3930 ; SSE42-NEXT: testb $16, %al
3931 ; SSE42-NEXT: je LBB23_10
3932 ; SSE42-NEXT: LBB23_9: ## %cond.load10
3933 ; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm1
3934 ; SSE42-NEXT: testb $32, %al
3935 ; SSE42-NEXT: je LBB23_12
3936 ; SSE42-NEXT: LBB23_11: ## %cond.load13
3937 ; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm1
3938 ; SSE42-NEXT: testb $64, %al
3939 ; SSE42-NEXT: je LBB23_14
3940 ; SSE42-NEXT: LBB23_13: ## %cond.load16
3941 ; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm1
3942 ; SSE42-NEXT: testb $-128, %al
3943 ; SSE42-NEXT: je LBB23_16
3944 ; SSE42-NEXT: LBB23_15: ## %cond.load19
3945 ; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm1
3946 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
3947 ; SSE42-NEXT: je LBB23_18
3948 ; SSE42-NEXT: LBB23_17: ## %cond.load22
3949 ; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm1
3950 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
3951 ; SSE42-NEXT: je LBB23_20
3952 ; SSE42-NEXT: LBB23_19: ## %cond.load25
3953 ; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm1
3954 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
3955 ; SSE42-NEXT: je LBB23_22
3956 ; SSE42-NEXT: LBB23_21: ## %cond.load28
3957 ; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm1
3958 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
3959 ; SSE42-NEXT: je LBB23_24
3960 ; SSE42-NEXT: LBB23_23: ## %cond.load31
3961 ; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm1
3962 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
3963 ; SSE42-NEXT: je LBB23_26
3964 ; SSE42-NEXT: LBB23_25: ## %cond.load34
3965 ; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm1
3966 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
3967 ; SSE42-NEXT: je LBB23_28
3968 ; SSE42-NEXT: LBB23_27: ## %cond.load37
3969 ; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm1
3970 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
3971 ; SSE42-NEXT: je LBB23_30
3972 ; SSE42-NEXT: LBB23_29: ## %cond.load40
3973 ; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm1
3974 ; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
3975 ; SSE42-NEXT: je LBB23_32
3976 ; SSE42-NEXT: LBB23_31: ## %cond.load43
3977 ; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm1
3978 ; SSE42-NEXT: movdqa %xmm1, %xmm0
3981 ; AVX1OR2-LABEL: load_v16i8_v16i8:
3982 ; AVX1OR2: ## %bb.0:
3983 ; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
3984 ; AVX1OR2-NEXT: testb $1, %al
3985 ; AVX1OR2-NEXT: jne LBB23_1
3986 ; AVX1OR2-NEXT: ## %bb.2: ## %else
3987 ; AVX1OR2-NEXT: testb $2, %al
3988 ; AVX1OR2-NEXT: jne LBB23_3
3989 ; AVX1OR2-NEXT: LBB23_4: ## %else2
3990 ; AVX1OR2-NEXT: testb $4, %al
3991 ; AVX1OR2-NEXT: jne LBB23_5
3992 ; AVX1OR2-NEXT: LBB23_6: ## %else5
3993 ; AVX1OR2-NEXT: testb $8, %al
3994 ; AVX1OR2-NEXT: jne LBB23_7
3995 ; AVX1OR2-NEXT: LBB23_8: ## %else8
3996 ; AVX1OR2-NEXT: testb $16, %al
3997 ; AVX1OR2-NEXT: jne LBB23_9
3998 ; AVX1OR2-NEXT: LBB23_10: ## %else11
3999 ; AVX1OR2-NEXT: testb $32, %al
4000 ; AVX1OR2-NEXT: jne LBB23_11
4001 ; AVX1OR2-NEXT: LBB23_12: ## %else14
4002 ; AVX1OR2-NEXT: testb $64, %al
4003 ; AVX1OR2-NEXT: jne LBB23_13
4004 ; AVX1OR2-NEXT: LBB23_14: ## %else17
4005 ; AVX1OR2-NEXT: testb $-128, %al
4006 ; AVX1OR2-NEXT: jne LBB23_15
4007 ; AVX1OR2-NEXT: LBB23_16: ## %else20
4008 ; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
4009 ; AVX1OR2-NEXT: jne LBB23_17
4010 ; AVX1OR2-NEXT: LBB23_18: ## %else23
4011 ; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
4012 ; AVX1OR2-NEXT: jne LBB23_19
4013 ; AVX1OR2-NEXT: LBB23_20: ## %else26
4014 ; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
4015 ; AVX1OR2-NEXT: jne LBB23_21
4016 ; AVX1OR2-NEXT: LBB23_22: ## %else29
4017 ; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
4018 ; AVX1OR2-NEXT: jne LBB23_23
4019 ; AVX1OR2-NEXT: LBB23_24: ## %else32
4020 ; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
4021 ; AVX1OR2-NEXT: jne LBB23_25
4022 ; AVX1OR2-NEXT: LBB23_26: ## %else35
4023 ; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
4024 ; AVX1OR2-NEXT: jne LBB23_27
4025 ; AVX1OR2-NEXT: LBB23_28: ## %else38
4026 ; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
4027 ; AVX1OR2-NEXT: jne LBB23_29
4028 ; AVX1OR2-NEXT: LBB23_30: ## %else41
4029 ; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
4030 ; AVX1OR2-NEXT: jne LBB23_31
4031 ; AVX1OR2-NEXT: LBB23_32: ## %else44
4032 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
4033 ; AVX1OR2-NEXT: retq
4034 ; AVX1OR2-NEXT: LBB23_1: ## %cond.load
4035 ; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
4036 ; AVX1OR2-NEXT: testb $2, %al
4037 ; AVX1OR2-NEXT: je LBB23_4
4038 ; AVX1OR2-NEXT: LBB23_3: ## %cond.load1
4039 ; AVX1OR2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
4040 ; AVX1OR2-NEXT: testb $4, %al
4041 ; AVX1OR2-NEXT: je LBB23_6
4042 ; AVX1OR2-NEXT: LBB23_5: ## %cond.load4
4043 ; AVX1OR2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
4044 ; AVX1OR2-NEXT: testb $8, %al
4045 ; AVX1OR2-NEXT: je LBB23_8
4046 ; AVX1OR2-NEXT: LBB23_7: ## %cond.load7
4047 ; AVX1OR2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
4048 ; AVX1OR2-NEXT: testb $16, %al
4049 ; AVX1OR2-NEXT: je LBB23_10
4050 ; AVX1OR2-NEXT: LBB23_9: ## %cond.load10
4051 ; AVX1OR2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
4052 ; AVX1OR2-NEXT: testb $32, %al
4053 ; AVX1OR2-NEXT: je LBB23_12
4054 ; AVX1OR2-NEXT: LBB23_11: ## %cond.load13
4055 ; AVX1OR2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
4056 ; AVX1OR2-NEXT: testb $64, %al
4057 ; AVX1OR2-NEXT: je LBB23_14
4058 ; AVX1OR2-NEXT: LBB23_13: ## %cond.load16
4059 ; AVX1OR2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
4060 ; AVX1OR2-NEXT: testb $-128, %al
4061 ; AVX1OR2-NEXT: je LBB23_16
4062 ; AVX1OR2-NEXT: LBB23_15: ## %cond.load19
4063 ; AVX1OR2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
4064 ; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
4065 ; AVX1OR2-NEXT: je LBB23_18
4066 ; AVX1OR2-NEXT: LBB23_17: ## %cond.load22
4067 ; AVX1OR2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
4068 ; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
4069 ; AVX1OR2-NEXT: je LBB23_20
4070 ; AVX1OR2-NEXT: LBB23_19: ## %cond.load25
4071 ; AVX1OR2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
4072 ; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
4073 ; AVX1OR2-NEXT: je LBB23_22
4074 ; AVX1OR2-NEXT: LBB23_21: ## %cond.load28
4075 ; AVX1OR2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
4076 ; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
4077 ; AVX1OR2-NEXT: je LBB23_24
4078 ; AVX1OR2-NEXT: LBB23_23: ## %cond.load31
4079 ; AVX1OR2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
4080 ; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
4081 ; AVX1OR2-NEXT: je LBB23_26
4082 ; AVX1OR2-NEXT: LBB23_25: ## %cond.load34
4083 ; AVX1OR2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
4084 ; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
4085 ; AVX1OR2-NEXT: je LBB23_28
4086 ; AVX1OR2-NEXT: LBB23_27: ## %cond.load37
4087 ; AVX1OR2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
4088 ; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
4089 ; AVX1OR2-NEXT: je LBB23_30
4090 ; AVX1OR2-NEXT: LBB23_29: ## %cond.load40
4091 ; AVX1OR2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
4092 ; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
4093 ; AVX1OR2-NEXT: je LBB23_32
4094 ; AVX1OR2-NEXT: LBB23_31: ## %cond.load43
4095 ; AVX1OR2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
4096 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
4097 ; AVX1OR2-NEXT: retq
4099 ; AVX512F-LABEL: load_v16i8_v16i8:
4100 ; AVX512F: ## %bb.0:
4101 ; AVX512F-NEXT: vpmovmskb %xmm0, %eax
4102 ; AVX512F-NEXT: testb $1, %al
4103 ; AVX512F-NEXT: jne LBB23_1
4104 ; AVX512F-NEXT: ## %bb.2: ## %else
4105 ; AVX512F-NEXT: testb $2, %al
4106 ; AVX512F-NEXT: jne LBB23_3
4107 ; AVX512F-NEXT: LBB23_4: ## %else2
4108 ; AVX512F-NEXT: testb $4, %al
4109 ; AVX512F-NEXT: jne LBB23_5
4110 ; AVX512F-NEXT: LBB23_6: ## %else5
4111 ; AVX512F-NEXT: testb $8, %al
4112 ; AVX512F-NEXT: jne LBB23_7
4113 ; AVX512F-NEXT: LBB23_8: ## %else8
4114 ; AVX512F-NEXT: testb $16, %al
4115 ; AVX512F-NEXT: jne LBB23_9
4116 ; AVX512F-NEXT: LBB23_10: ## %else11
4117 ; AVX512F-NEXT: testb $32, %al
4118 ; AVX512F-NEXT: jne LBB23_11
4119 ; AVX512F-NEXT: LBB23_12: ## %else14
4120 ; AVX512F-NEXT: testb $64, %al
4121 ; AVX512F-NEXT: jne LBB23_13
4122 ; AVX512F-NEXT: LBB23_14: ## %else17
4123 ; AVX512F-NEXT: testb $-128, %al
4124 ; AVX512F-NEXT: jne LBB23_15
4125 ; AVX512F-NEXT: LBB23_16: ## %else20
4126 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
4127 ; AVX512F-NEXT: jne LBB23_17
4128 ; AVX512F-NEXT: LBB23_18: ## %else23
4129 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
4130 ; AVX512F-NEXT: jne LBB23_19
4131 ; AVX512F-NEXT: LBB23_20: ## %else26
4132 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
4133 ; AVX512F-NEXT: jne LBB23_21
4134 ; AVX512F-NEXT: LBB23_22: ## %else29
4135 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
4136 ; AVX512F-NEXT: jne LBB23_23
4137 ; AVX512F-NEXT: LBB23_24: ## %else32
4138 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
4139 ; AVX512F-NEXT: jne LBB23_25
4140 ; AVX512F-NEXT: LBB23_26: ## %else35
4141 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
4142 ; AVX512F-NEXT: jne LBB23_27
4143 ; AVX512F-NEXT: LBB23_28: ## %else38
4144 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
4145 ; AVX512F-NEXT: jne LBB23_29
4146 ; AVX512F-NEXT: LBB23_30: ## %else41
4147 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
4148 ; AVX512F-NEXT: jne LBB23_31
4149 ; AVX512F-NEXT: LBB23_32: ## %else44
4150 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
4151 ; AVX512F-NEXT: retq
4152 ; AVX512F-NEXT: LBB23_1: ## %cond.load
4153 ; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
4154 ; AVX512F-NEXT: testb $2, %al
4155 ; AVX512F-NEXT: je LBB23_4
4156 ; AVX512F-NEXT: LBB23_3: ## %cond.load1
4157 ; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
4158 ; AVX512F-NEXT: testb $4, %al
4159 ; AVX512F-NEXT: je LBB23_6
4160 ; AVX512F-NEXT: LBB23_5: ## %cond.load4
4161 ; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
4162 ; AVX512F-NEXT: testb $8, %al
4163 ; AVX512F-NEXT: je LBB23_8
4164 ; AVX512F-NEXT: LBB23_7: ## %cond.load7
4165 ; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
4166 ; AVX512F-NEXT: testb $16, %al
4167 ; AVX512F-NEXT: je LBB23_10
4168 ; AVX512F-NEXT: LBB23_9: ## %cond.load10
4169 ; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
4170 ; AVX512F-NEXT: testb $32, %al
4171 ; AVX512F-NEXT: je LBB23_12
4172 ; AVX512F-NEXT: LBB23_11: ## %cond.load13
4173 ; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
4174 ; AVX512F-NEXT: testb $64, %al
4175 ; AVX512F-NEXT: je LBB23_14
4176 ; AVX512F-NEXT: LBB23_13: ## %cond.load16
4177 ; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
4178 ; AVX512F-NEXT: testb $-128, %al
4179 ; AVX512F-NEXT: je LBB23_16
4180 ; AVX512F-NEXT: LBB23_15: ## %cond.load19
4181 ; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
4182 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
4183 ; AVX512F-NEXT: je LBB23_18
4184 ; AVX512F-NEXT: LBB23_17: ## %cond.load22
4185 ; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
4186 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
4187 ; AVX512F-NEXT: je LBB23_20
4188 ; AVX512F-NEXT: LBB23_19: ## %cond.load25
4189 ; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
4190 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
4191 ; AVX512F-NEXT: je LBB23_22
4192 ; AVX512F-NEXT: LBB23_21: ## %cond.load28
4193 ; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
4194 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
4195 ; AVX512F-NEXT: je LBB23_24
4196 ; AVX512F-NEXT: LBB23_23: ## %cond.load31
4197 ; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
4198 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
4199 ; AVX512F-NEXT: je LBB23_26
4200 ; AVX512F-NEXT: LBB23_25: ## %cond.load34
4201 ; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
4202 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
4203 ; AVX512F-NEXT: je LBB23_28
4204 ; AVX512F-NEXT: LBB23_27: ## %cond.load37
4205 ; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
4206 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
4207 ; AVX512F-NEXT: je LBB23_30
4208 ; AVX512F-NEXT: LBB23_29: ## %cond.load40
4209 ; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
4210 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
4211 ; AVX512F-NEXT: je LBB23_32
4212 ; AVX512F-NEXT: LBB23_31: ## %cond.load43
4213 ; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
4214 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
4215 ; AVX512F-NEXT: retq
4217 ; AVX512VLDQ-LABEL: load_v16i8_v16i8:
4218 ; AVX512VLDQ: ## %bb.0:
4219 ; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax
4220 ; AVX512VLDQ-NEXT: testb $1, %al
4221 ; AVX512VLDQ-NEXT: jne LBB23_1
4222 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
4223 ; AVX512VLDQ-NEXT: testb $2, %al
4224 ; AVX512VLDQ-NEXT: jne LBB23_3
4225 ; AVX512VLDQ-NEXT: LBB23_4: ## %else2
4226 ; AVX512VLDQ-NEXT: testb $4, %al
4227 ; AVX512VLDQ-NEXT: jne LBB23_5
4228 ; AVX512VLDQ-NEXT: LBB23_6: ## %else5
4229 ; AVX512VLDQ-NEXT: testb $8, %al
4230 ; AVX512VLDQ-NEXT: jne LBB23_7
4231 ; AVX512VLDQ-NEXT: LBB23_8: ## %else8
4232 ; AVX512VLDQ-NEXT: testb $16, %al
4233 ; AVX512VLDQ-NEXT: jne LBB23_9
4234 ; AVX512VLDQ-NEXT: LBB23_10: ## %else11
4235 ; AVX512VLDQ-NEXT: testb $32, %al
4236 ; AVX512VLDQ-NEXT: jne LBB23_11
4237 ; AVX512VLDQ-NEXT: LBB23_12: ## %else14
4238 ; AVX512VLDQ-NEXT: testb $64, %al
4239 ; AVX512VLDQ-NEXT: jne LBB23_13
4240 ; AVX512VLDQ-NEXT: LBB23_14: ## %else17
4241 ; AVX512VLDQ-NEXT: testb $-128, %al
4242 ; AVX512VLDQ-NEXT: jne LBB23_15
4243 ; AVX512VLDQ-NEXT: LBB23_16: ## %else20
4244 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
4245 ; AVX512VLDQ-NEXT: jne LBB23_17
4246 ; AVX512VLDQ-NEXT: LBB23_18: ## %else23
4247 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
4248 ; AVX512VLDQ-NEXT: jne LBB23_19
4249 ; AVX512VLDQ-NEXT: LBB23_20: ## %else26
4250 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
4251 ; AVX512VLDQ-NEXT: jne LBB23_21
4252 ; AVX512VLDQ-NEXT: LBB23_22: ## %else29
4253 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
4254 ; AVX512VLDQ-NEXT: jne LBB23_23
4255 ; AVX512VLDQ-NEXT: LBB23_24: ## %else32
4256 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
4257 ; AVX512VLDQ-NEXT: jne LBB23_25
4258 ; AVX512VLDQ-NEXT: LBB23_26: ## %else35
4259 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
4260 ; AVX512VLDQ-NEXT: jne LBB23_27
4261 ; AVX512VLDQ-NEXT: LBB23_28: ## %else38
4262 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
4263 ; AVX512VLDQ-NEXT: jne LBB23_29
4264 ; AVX512VLDQ-NEXT: LBB23_30: ## %else41
4265 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
4266 ; AVX512VLDQ-NEXT: jne LBB23_31
4267 ; AVX512VLDQ-NEXT: LBB23_32: ## %else44
4268 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
4269 ; AVX512VLDQ-NEXT: retq
4270 ; AVX512VLDQ-NEXT: LBB23_1: ## %cond.load
4271 ; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
4272 ; AVX512VLDQ-NEXT: testb $2, %al
4273 ; AVX512VLDQ-NEXT: je LBB23_4
4274 ; AVX512VLDQ-NEXT: LBB23_3: ## %cond.load1
4275 ; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
4276 ; AVX512VLDQ-NEXT: testb $4, %al
4277 ; AVX512VLDQ-NEXT: je LBB23_6
4278 ; AVX512VLDQ-NEXT: LBB23_5: ## %cond.load4
4279 ; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
4280 ; AVX512VLDQ-NEXT: testb $8, %al
4281 ; AVX512VLDQ-NEXT: je LBB23_8
4282 ; AVX512VLDQ-NEXT: LBB23_7: ## %cond.load7
4283 ; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
4284 ; AVX512VLDQ-NEXT: testb $16, %al
4285 ; AVX512VLDQ-NEXT: je LBB23_10
4286 ; AVX512VLDQ-NEXT: LBB23_9: ## %cond.load10
4287 ; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
4288 ; AVX512VLDQ-NEXT: testb $32, %al
4289 ; AVX512VLDQ-NEXT: je LBB23_12
4290 ; AVX512VLDQ-NEXT: LBB23_11: ## %cond.load13
4291 ; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
4292 ; AVX512VLDQ-NEXT: testb $64, %al
4293 ; AVX512VLDQ-NEXT: je LBB23_14
4294 ; AVX512VLDQ-NEXT: LBB23_13: ## %cond.load16
4295 ; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
4296 ; AVX512VLDQ-NEXT: testb $-128, %al
4297 ; AVX512VLDQ-NEXT: je LBB23_16
4298 ; AVX512VLDQ-NEXT: LBB23_15: ## %cond.load19
4299 ; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
4300 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
4301 ; AVX512VLDQ-NEXT: je LBB23_18
4302 ; AVX512VLDQ-NEXT: LBB23_17: ## %cond.load22
4303 ; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
4304 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
4305 ; AVX512VLDQ-NEXT: je LBB23_20
4306 ; AVX512VLDQ-NEXT: LBB23_19: ## %cond.load25
4307 ; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
4308 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
4309 ; AVX512VLDQ-NEXT: je LBB23_22
4310 ; AVX512VLDQ-NEXT: LBB23_21: ## %cond.load28
4311 ; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
4312 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
4313 ; AVX512VLDQ-NEXT: je LBB23_24
4314 ; AVX512VLDQ-NEXT: LBB23_23: ## %cond.load31
4315 ; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
4316 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
4317 ; AVX512VLDQ-NEXT: je LBB23_26
4318 ; AVX512VLDQ-NEXT: LBB23_25: ## %cond.load34
4319 ; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
4320 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
4321 ; AVX512VLDQ-NEXT: je LBB23_28
4322 ; AVX512VLDQ-NEXT: LBB23_27: ## %cond.load37
4323 ; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
4324 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
4325 ; AVX512VLDQ-NEXT: je LBB23_30
4326 ; AVX512VLDQ-NEXT: LBB23_29: ## %cond.load40
4327 ; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
4328 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
4329 ; AVX512VLDQ-NEXT: je LBB23_32
4330 ; AVX512VLDQ-NEXT: LBB23_31: ## %cond.load43
4331 ; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
4332 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
4333 ; AVX512VLDQ-NEXT: retq
4335 ; AVX512VLBW-LABEL: load_v16i8_v16i8:
4336 ; AVX512VLBW: ## %bb.0:
4337 ; AVX512VLBW-NEXT: vpmovb2m %xmm0, %k1
4338 ; AVX512VLBW-NEXT: vpblendmb (%rdi), %xmm1, %xmm0 {%k1}
4339 ; AVX512VLBW-NEXT: retq
4340 %mask = icmp slt <16 x i8> %trigger, zeroinitializer
4341 %res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1> %mask, <16 x i8> %dst)
4345 define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i8> %dst) {
4346 ; SSE2-LABEL: load_v32i8_v32i8:
4348 ; SSE2-NEXT: pmovmskb %xmm0, %ecx
4349 ; SSE2-NEXT: pmovmskb %xmm1, %eax
4350 ; SSE2-NEXT: shll $16, %eax
4351 ; SSE2-NEXT: orl %ecx, %eax
4352 ; SSE2-NEXT: testb $1, %al
4353 ; SSE2-NEXT: jne LBB24_1
4354 ; SSE2-NEXT: ## %bb.2: ## %else
4355 ; SSE2-NEXT: testb $2, %al
4356 ; SSE2-NEXT: jne LBB24_3
4357 ; SSE2-NEXT: LBB24_4: ## %else2
4358 ; SSE2-NEXT: testb $4, %al
4359 ; SSE2-NEXT: jne LBB24_5
4360 ; SSE2-NEXT: LBB24_6: ## %else5
4361 ; SSE2-NEXT: testb $8, %al
4362 ; SSE2-NEXT: jne LBB24_7
4363 ; SSE2-NEXT: LBB24_8: ## %else8
4364 ; SSE2-NEXT: testb $16, %al
4365 ; SSE2-NEXT: jne LBB24_9
4366 ; SSE2-NEXT: LBB24_10: ## %else11
4367 ; SSE2-NEXT: testb $32, %al
4368 ; SSE2-NEXT: jne LBB24_11
4369 ; SSE2-NEXT: LBB24_12: ## %else14
4370 ; SSE2-NEXT: testb $64, %al
4371 ; SSE2-NEXT: jne LBB24_13
4372 ; SSE2-NEXT: LBB24_14: ## %else17
4373 ; SSE2-NEXT: testb $-128, %al
4374 ; SSE2-NEXT: jne LBB24_15
4375 ; SSE2-NEXT: LBB24_16: ## %else20
4376 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
4377 ; SSE2-NEXT: jne LBB24_17
4378 ; SSE2-NEXT: LBB24_18: ## %else23
4379 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
4380 ; SSE2-NEXT: jne LBB24_19
4381 ; SSE2-NEXT: LBB24_20: ## %else26
4382 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
4383 ; SSE2-NEXT: jne LBB24_21
4384 ; SSE2-NEXT: LBB24_22: ## %else29
4385 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
4386 ; SSE2-NEXT: jne LBB24_23
4387 ; SSE2-NEXT: LBB24_24: ## %else32
4388 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
4389 ; SSE2-NEXT: jne LBB24_25
4390 ; SSE2-NEXT: LBB24_26: ## %else35
4391 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
4392 ; SSE2-NEXT: jne LBB24_27
4393 ; SSE2-NEXT: LBB24_28: ## %else38
4394 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
4395 ; SSE2-NEXT: jne LBB24_29
4396 ; SSE2-NEXT: LBB24_30: ## %else41
4397 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
4398 ; SSE2-NEXT: jne LBB24_31
4399 ; SSE2-NEXT: LBB24_32: ## %else44
4400 ; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
4401 ; SSE2-NEXT: jne LBB24_33
4402 ; SSE2-NEXT: LBB24_34: ## %else47
4403 ; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
4404 ; SSE2-NEXT: jne LBB24_35
4405 ; SSE2-NEXT: LBB24_36: ## %else50
4406 ; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
4407 ; SSE2-NEXT: jne LBB24_37
4408 ; SSE2-NEXT: LBB24_38: ## %else53
4409 ; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
4410 ; SSE2-NEXT: jne LBB24_39
4411 ; SSE2-NEXT: LBB24_40: ## %else56
4412 ; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
4413 ; SSE2-NEXT: jne LBB24_41
4414 ; SSE2-NEXT: LBB24_42: ## %else59
4415 ; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
4416 ; SSE2-NEXT: jne LBB24_43
4417 ; SSE2-NEXT: LBB24_44: ## %else62
4418 ; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
4419 ; SSE2-NEXT: jne LBB24_45
4420 ; SSE2-NEXT: LBB24_46: ## %else65
4421 ; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
4422 ; SSE2-NEXT: jne LBB24_47
4423 ; SSE2-NEXT: LBB24_48: ## %else68
4424 ; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
4425 ; SSE2-NEXT: jne LBB24_49
4426 ; SSE2-NEXT: LBB24_50: ## %else71
4427 ; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
4428 ; SSE2-NEXT: jne LBB24_51
4429 ; SSE2-NEXT: LBB24_52: ## %else74
4430 ; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
4431 ; SSE2-NEXT: jne LBB24_53
4432 ; SSE2-NEXT: LBB24_54: ## %else77
4433 ; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
4434 ; SSE2-NEXT: jne LBB24_55
4435 ; SSE2-NEXT: LBB24_56: ## %else80
4436 ; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
4437 ; SSE2-NEXT: jne LBB24_57
4438 ; SSE2-NEXT: LBB24_58: ## %else83
4439 ; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
4440 ; SSE2-NEXT: jne LBB24_59
4441 ; SSE2-NEXT: LBB24_60: ## %else86
4442 ; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
4443 ; SSE2-NEXT: jne LBB24_61
4444 ; SSE2-NEXT: LBB24_62: ## %else89
4445 ; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
4446 ; SSE2-NEXT: je LBB24_64
4447 ; SSE2-NEXT: LBB24_63: ## %cond.load91
4448 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
4449 ; SSE2-NEXT: movzbl 31(%rdi), %eax
4450 ; SSE2-NEXT: movd %eax, %xmm0
4451 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4452 ; SSE2-NEXT: por %xmm0, %xmm3
4453 ; SSE2-NEXT: LBB24_64: ## %else92
4454 ; SSE2-NEXT: movdqa %xmm2, %xmm0
4455 ; SSE2-NEXT: movdqa %xmm3, %xmm1
4457 ; SSE2-NEXT: LBB24_1: ## %cond.load
4458 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4459 ; SSE2-NEXT: pand %xmm0, %xmm2
4460 ; SSE2-NEXT: movzbl (%rdi), %ecx
4461 ; SSE2-NEXT: movd %ecx, %xmm1
4462 ; SSE2-NEXT: pandn %xmm1, %xmm0
4463 ; SSE2-NEXT: por %xmm0, %xmm2
4464 ; SSE2-NEXT: testb $2, %al
4465 ; SSE2-NEXT: je LBB24_4
4466 ; SSE2-NEXT: LBB24_3: ## %cond.load1
4467 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4468 ; SSE2-NEXT: pand %xmm0, %xmm2
4469 ; SSE2-NEXT: movzbl 1(%rdi), %ecx
4470 ; SSE2-NEXT: movd %ecx, %xmm1
4471 ; SSE2-NEXT: psllw $8, %xmm1
4472 ; SSE2-NEXT: pandn %xmm1, %xmm0
4473 ; SSE2-NEXT: por %xmm0, %xmm2
4474 ; SSE2-NEXT: testb $4, %al
4475 ; SSE2-NEXT: je LBB24_6
4476 ; SSE2-NEXT: LBB24_5: ## %cond.load4
4477 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
4478 ; SSE2-NEXT: pand %xmm0, %xmm2
4479 ; SSE2-NEXT: movzbl 2(%rdi), %ecx
4480 ; SSE2-NEXT: movd %ecx, %xmm1
4481 ; SSE2-NEXT: pslld $16, %xmm1
4482 ; SSE2-NEXT: pandn %xmm1, %xmm0
4483 ; SSE2-NEXT: por %xmm0, %xmm2
4484 ; SSE2-NEXT: testb $8, %al
4485 ; SSE2-NEXT: je LBB24_8
4486 ; SSE2-NEXT: LBB24_7: ## %cond.load7
4487 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
4488 ; SSE2-NEXT: pand %xmm0, %xmm2
4489 ; SSE2-NEXT: movzbl 3(%rdi), %ecx
4490 ; SSE2-NEXT: movd %ecx, %xmm1
4491 ; SSE2-NEXT: pslld $24, %xmm1
4492 ; SSE2-NEXT: pandn %xmm1, %xmm0
4493 ; SSE2-NEXT: por %xmm0, %xmm2
4494 ; SSE2-NEXT: testb $16, %al
4495 ; SSE2-NEXT: je LBB24_10
4496 ; SSE2-NEXT: LBB24_9: ## %cond.load10
4497 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
4498 ; SSE2-NEXT: pand %xmm0, %xmm2
4499 ; SSE2-NEXT: movzbl 4(%rdi), %ecx
4500 ; SSE2-NEXT: movd %ecx, %xmm1
4501 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4502 ; SSE2-NEXT: pandn %xmm1, %xmm0
4503 ; SSE2-NEXT: por %xmm0, %xmm2
4504 ; SSE2-NEXT: testb $32, %al
4505 ; SSE2-NEXT: je LBB24_12
4506 ; SSE2-NEXT: LBB24_11: ## %cond.load13
4507 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
4508 ; SSE2-NEXT: pand %xmm0, %xmm2
4509 ; SSE2-NEXT: movzbl 5(%rdi), %ecx
4510 ; SSE2-NEXT: movd %ecx, %xmm1
4511 ; SSE2-NEXT: psllq $40, %xmm1
4512 ; SSE2-NEXT: pandn %xmm1, %xmm0
4513 ; SSE2-NEXT: por %xmm0, %xmm2
4514 ; SSE2-NEXT: testb $64, %al
4515 ; SSE2-NEXT: je LBB24_14
4516 ; SSE2-NEXT: LBB24_13: ## %cond.load16
4517 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
4518 ; SSE2-NEXT: pand %xmm0, %xmm2
4519 ; SSE2-NEXT: movzbl 6(%rdi), %ecx
4520 ; SSE2-NEXT: movd %ecx, %xmm1
4521 ; SSE2-NEXT: psllq $48, %xmm1
4522 ; SSE2-NEXT: pandn %xmm1, %xmm0
4523 ; SSE2-NEXT: por %xmm0, %xmm2
4524 ; SSE2-NEXT: testb $-128, %al
4525 ; SSE2-NEXT: je LBB24_16
4526 ; SSE2-NEXT: LBB24_15: ## %cond.load19
4527 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
4528 ; SSE2-NEXT: pand %xmm0, %xmm2
4529 ; SSE2-NEXT: movzbl 7(%rdi), %ecx
4530 ; SSE2-NEXT: movd %ecx, %xmm1
4531 ; SSE2-NEXT: psllq $56, %xmm1
4532 ; SSE2-NEXT: pandn %xmm1, %xmm0
4533 ; SSE2-NEXT: por %xmm0, %xmm2
4534 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
4535 ; SSE2-NEXT: je LBB24_18
4536 ; SSE2-NEXT: LBB24_17: ## %cond.load22
4537 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
4538 ; SSE2-NEXT: pand %xmm0, %xmm2
4539 ; SSE2-NEXT: movzbl 8(%rdi), %ecx
4540 ; SSE2-NEXT: movd %ecx, %xmm1
4541 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4542 ; SSE2-NEXT: pandn %xmm1, %xmm0
4543 ; SSE2-NEXT: por %xmm0, %xmm2
4544 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
4545 ; SSE2-NEXT: je LBB24_20
4546 ; SSE2-NEXT: LBB24_19: ## %cond.load25
4547 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
4548 ; SSE2-NEXT: pand %xmm0, %xmm2
4549 ; SSE2-NEXT: movzbl 9(%rdi), %ecx
4550 ; SSE2-NEXT: movd %ecx, %xmm1
4551 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
4552 ; SSE2-NEXT: pandn %xmm1, %xmm0
4553 ; SSE2-NEXT: por %xmm0, %xmm2
4554 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
4555 ; SSE2-NEXT: je LBB24_22
4556 ; SSE2-NEXT: LBB24_21: ## %cond.load28
4557 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
4558 ; SSE2-NEXT: pand %xmm0, %xmm2
4559 ; SSE2-NEXT: movzbl 10(%rdi), %ecx
4560 ; SSE2-NEXT: movd %ecx, %xmm1
4561 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4562 ; SSE2-NEXT: pandn %xmm1, %xmm0
4563 ; SSE2-NEXT: por %xmm0, %xmm2
4564 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
4565 ; SSE2-NEXT: je LBB24_24
4566 ; SSE2-NEXT: LBB24_23: ## %cond.load31
4567 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
4568 ; SSE2-NEXT: pand %xmm0, %xmm2
4569 ; SSE2-NEXT: movzbl 11(%rdi), %ecx
4570 ; SSE2-NEXT: movd %ecx, %xmm1
4571 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
4572 ; SSE2-NEXT: pandn %xmm1, %xmm0
4573 ; SSE2-NEXT: por %xmm0, %xmm2
4574 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
4575 ; SSE2-NEXT: je LBB24_26
4576 ; SSE2-NEXT: LBB24_25: ## %cond.load34
4577 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
4578 ; SSE2-NEXT: pand %xmm0, %xmm2
4579 ; SSE2-NEXT: movzbl 12(%rdi), %ecx
4580 ; SSE2-NEXT: movd %ecx, %xmm1
4581 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4582 ; SSE2-NEXT: pandn %xmm1, %xmm0
4583 ; SSE2-NEXT: por %xmm0, %xmm2
4584 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
4585 ; SSE2-NEXT: je LBB24_28
4586 ; SSE2-NEXT: LBB24_27: ## %cond.load37
4587 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
4588 ; SSE2-NEXT: pand %xmm0, %xmm2
4589 ; SSE2-NEXT: movzbl 13(%rdi), %ecx
4590 ; SSE2-NEXT: movd %ecx, %xmm1
4591 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
4592 ; SSE2-NEXT: pandn %xmm1, %xmm0
4593 ; SSE2-NEXT: por %xmm0, %xmm2
4594 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
4595 ; SSE2-NEXT: je LBB24_30
4596 ; SSE2-NEXT: LBB24_29: ## %cond.load40
4597 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4598 ; SSE2-NEXT: pand %xmm0, %xmm2
4599 ; SSE2-NEXT: movzbl 14(%rdi), %ecx
4600 ; SSE2-NEXT: movd %ecx, %xmm1
4601 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
4602 ; SSE2-NEXT: pandn %xmm1, %xmm0
4603 ; SSE2-NEXT: por %xmm0, %xmm2
4604 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
4605 ; SSE2-NEXT: je LBB24_32
4606 ; SSE2-NEXT: LBB24_31: ## %cond.load43
4607 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
4608 ; SSE2-NEXT: movzbl 15(%rdi), %ecx
4609 ; SSE2-NEXT: movd %ecx, %xmm0
4610 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4611 ; SSE2-NEXT: por %xmm0, %xmm2
4612 ; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
4613 ; SSE2-NEXT: je LBB24_34
4614 ; SSE2-NEXT: LBB24_33: ## %cond.load46
4615 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4616 ; SSE2-NEXT: pand %xmm0, %xmm3
4617 ; SSE2-NEXT: movzbl 16(%rdi), %ecx
4618 ; SSE2-NEXT: movd %ecx, %xmm1
4619 ; SSE2-NEXT: pandn %xmm1, %xmm0
4620 ; SSE2-NEXT: por %xmm0, %xmm3
4621 ; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
4622 ; SSE2-NEXT: je LBB24_36
4623 ; SSE2-NEXT: LBB24_35: ## %cond.load49
4624 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4625 ; SSE2-NEXT: pand %xmm0, %xmm3
4626 ; SSE2-NEXT: movzbl 17(%rdi), %ecx
4627 ; SSE2-NEXT: movd %ecx, %xmm1
4628 ; SSE2-NEXT: psllw $8, %xmm1
4629 ; SSE2-NEXT: pandn %xmm1, %xmm0
4630 ; SSE2-NEXT: por %xmm0, %xmm3
4631 ; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
4632 ; SSE2-NEXT: je LBB24_38
4633 ; SSE2-NEXT: LBB24_37: ## %cond.load52
4634 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
4635 ; SSE2-NEXT: pand %xmm0, %xmm3
4636 ; SSE2-NEXT: movzbl 18(%rdi), %ecx
4637 ; SSE2-NEXT: movd %ecx, %xmm1
4638 ; SSE2-NEXT: pslld $16, %xmm1
4639 ; SSE2-NEXT: pandn %xmm1, %xmm0
4640 ; SSE2-NEXT: por %xmm0, %xmm3
4641 ; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
4642 ; SSE2-NEXT: je LBB24_40
4643 ; SSE2-NEXT: LBB24_39: ## %cond.load55
4644 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
4645 ; SSE2-NEXT: pand %xmm0, %xmm3
4646 ; SSE2-NEXT: movzbl 19(%rdi), %ecx
4647 ; SSE2-NEXT: movd %ecx, %xmm1
4648 ; SSE2-NEXT: pslld $24, %xmm1
4649 ; SSE2-NEXT: pandn %xmm1, %xmm0
4650 ; SSE2-NEXT: por %xmm0, %xmm3
4651 ; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
4652 ; SSE2-NEXT: je LBB24_42
4653 ; SSE2-NEXT: LBB24_41: ## %cond.load58
4654 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
4655 ; SSE2-NEXT: pand %xmm0, %xmm3
4656 ; SSE2-NEXT: movzbl 20(%rdi), %ecx
4657 ; SSE2-NEXT: movd %ecx, %xmm1
4658 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4659 ; SSE2-NEXT: pandn %xmm1, %xmm0
4660 ; SSE2-NEXT: por %xmm0, %xmm3
4661 ; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
4662 ; SSE2-NEXT: je LBB24_44
4663 ; SSE2-NEXT: LBB24_43: ## %cond.load61
4664 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
4665 ; SSE2-NEXT: pand %xmm0, %xmm3
4666 ; SSE2-NEXT: movzbl 21(%rdi), %ecx
4667 ; SSE2-NEXT: movd %ecx, %xmm1
4668 ; SSE2-NEXT: psllq $40, %xmm1
4669 ; SSE2-NEXT: pandn %xmm1, %xmm0
4670 ; SSE2-NEXT: por %xmm0, %xmm3
4671 ; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
4672 ; SSE2-NEXT: je LBB24_46
4673 ; SSE2-NEXT: LBB24_45: ## %cond.load64
4674 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
4675 ; SSE2-NEXT: pand %xmm0, %xmm3
4676 ; SSE2-NEXT: movzbl 22(%rdi), %ecx
4677 ; SSE2-NEXT: movd %ecx, %xmm1
4678 ; SSE2-NEXT: psllq $48, %xmm1
4679 ; SSE2-NEXT: pandn %xmm1, %xmm0
4680 ; SSE2-NEXT: por %xmm0, %xmm3
4681 ; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
4682 ; SSE2-NEXT: je LBB24_48
4683 ; SSE2-NEXT: LBB24_47: ## %cond.load67
4684 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
4685 ; SSE2-NEXT: pand %xmm0, %xmm3
4686 ; SSE2-NEXT: movzbl 23(%rdi), %ecx
4687 ; SSE2-NEXT: movd %ecx, %xmm1
4688 ; SSE2-NEXT: psllq $56, %xmm1
4689 ; SSE2-NEXT: pandn %xmm1, %xmm0
4690 ; SSE2-NEXT: por %xmm0, %xmm3
4691 ; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
4692 ; SSE2-NEXT: je LBB24_50
4693 ; SSE2-NEXT: LBB24_49: ## %cond.load70
4694 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
4695 ; SSE2-NEXT: pand %xmm0, %xmm3
4696 ; SSE2-NEXT: movzbl 24(%rdi), %ecx
4697 ; SSE2-NEXT: movd %ecx, %xmm1
4698 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4699 ; SSE2-NEXT: pandn %xmm1, %xmm0
4700 ; SSE2-NEXT: por %xmm0, %xmm3
4701 ; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
4702 ; SSE2-NEXT: je LBB24_52
4703 ; SSE2-NEXT: LBB24_51: ## %cond.load73
4704 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
4705 ; SSE2-NEXT: pand %xmm0, %xmm3
4706 ; SSE2-NEXT: movzbl 25(%rdi), %ecx
4707 ; SSE2-NEXT: movd %ecx, %xmm1
4708 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
4709 ; SSE2-NEXT: pandn %xmm1, %xmm0
4710 ; SSE2-NEXT: por %xmm0, %xmm3
4711 ; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
4712 ; SSE2-NEXT: je LBB24_54
4713 ; SSE2-NEXT: LBB24_53: ## %cond.load76
4714 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
4715 ; SSE2-NEXT: pand %xmm0, %xmm3
4716 ; SSE2-NEXT: movzbl 26(%rdi), %ecx
4717 ; SSE2-NEXT: movd %ecx, %xmm1
4718 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4719 ; SSE2-NEXT: pandn %xmm1, %xmm0
4720 ; SSE2-NEXT: por %xmm0, %xmm3
4721 ; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
4722 ; SSE2-NEXT: je LBB24_56
4723 ; SSE2-NEXT: LBB24_55: ## %cond.load79
4724 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
4725 ; SSE2-NEXT: pand %xmm0, %xmm3
4726 ; SSE2-NEXT: movzbl 27(%rdi), %ecx
4727 ; SSE2-NEXT: movd %ecx, %xmm1
4728 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
4729 ; SSE2-NEXT: pandn %xmm1, %xmm0
4730 ; SSE2-NEXT: por %xmm0, %xmm3
4731 ; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
4732 ; SSE2-NEXT: je LBB24_58
4733 ; SSE2-NEXT: LBB24_57: ## %cond.load82
4734 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
4735 ; SSE2-NEXT: pand %xmm0, %xmm3
4736 ; SSE2-NEXT: movzbl 28(%rdi), %ecx
4737 ; SSE2-NEXT: movd %ecx, %xmm1
4738 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4739 ; SSE2-NEXT: pandn %xmm1, %xmm0
4740 ; SSE2-NEXT: por %xmm0, %xmm3
4741 ; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
4742 ; SSE2-NEXT: je LBB24_60
4743 ; SSE2-NEXT: LBB24_59: ## %cond.load85
4744 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
4745 ; SSE2-NEXT: pand %xmm0, %xmm3
4746 ; SSE2-NEXT: movzbl 29(%rdi), %ecx
4747 ; SSE2-NEXT: movd %ecx, %xmm1
4748 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
4749 ; SSE2-NEXT: pandn %xmm1, %xmm0
4750 ; SSE2-NEXT: por %xmm0, %xmm3
4751 ; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
4752 ; SSE2-NEXT: je LBB24_62
4753 ; SSE2-NEXT: LBB24_61: ## %cond.load88
4754 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4755 ; SSE2-NEXT: pand %xmm0, %xmm3
4756 ; SSE2-NEXT: movzbl 30(%rdi), %ecx
4757 ; SSE2-NEXT: movd %ecx, %xmm1
4758 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
4759 ; SSE2-NEXT: pandn %xmm1, %xmm0
4760 ; SSE2-NEXT: por %xmm0, %xmm3
4761 ; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
4762 ; SSE2-NEXT: jne LBB24_63
4763 ; SSE2-NEXT: jmp LBB24_64
4765 ; SSE42-LABEL: load_v32i8_v32i8:
4767 ; SSE42-NEXT: pmovmskb %xmm0, %ecx
4768 ; SSE42-NEXT: pmovmskb %xmm1, %eax
4769 ; SSE42-NEXT: shll $16, %eax
4770 ; SSE42-NEXT: orl %ecx, %eax
4771 ; SSE42-NEXT: testb $1, %al
4772 ; SSE42-NEXT: jne LBB24_1
4773 ; SSE42-NEXT: ## %bb.2: ## %else
4774 ; SSE42-NEXT: testb $2, %al
4775 ; SSE42-NEXT: jne LBB24_3
4776 ; SSE42-NEXT: LBB24_4: ## %else2
4777 ; SSE42-NEXT: testb $4, %al
4778 ; SSE42-NEXT: jne LBB24_5
4779 ; SSE42-NEXT: LBB24_6: ## %else5
4780 ; SSE42-NEXT: testb $8, %al
4781 ; SSE42-NEXT: jne LBB24_7
4782 ; SSE42-NEXT: LBB24_8: ## %else8
4783 ; SSE42-NEXT: testb $16, %al
4784 ; SSE42-NEXT: jne LBB24_9
4785 ; SSE42-NEXT: LBB24_10: ## %else11
4786 ; SSE42-NEXT: testb $32, %al
4787 ; SSE42-NEXT: jne LBB24_11
4788 ; SSE42-NEXT: LBB24_12: ## %else14
4789 ; SSE42-NEXT: testb $64, %al
4790 ; SSE42-NEXT: jne LBB24_13
4791 ; SSE42-NEXT: LBB24_14: ## %else17
4792 ; SSE42-NEXT: testb $-128, %al
4793 ; SSE42-NEXT: jne LBB24_15
4794 ; SSE42-NEXT: LBB24_16: ## %else20
4795 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
4796 ; SSE42-NEXT: jne LBB24_17
4797 ; SSE42-NEXT: LBB24_18: ## %else23
4798 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
4799 ; SSE42-NEXT: jne LBB24_19
4800 ; SSE42-NEXT: LBB24_20: ## %else26
4801 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
4802 ; SSE42-NEXT: jne LBB24_21
4803 ; SSE42-NEXT: LBB24_22: ## %else29
4804 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
4805 ; SSE42-NEXT: jne LBB24_23
4806 ; SSE42-NEXT: LBB24_24: ## %else32
4807 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
4808 ; SSE42-NEXT: jne LBB24_25
4809 ; SSE42-NEXT: LBB24_26: ## %else35
4810 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
4811 ; SSE42-NEXT: jne LBB24_27
4812 ; SSE42-NEXT: LBB24_28: ## %else38
4813 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
4814 ; SSE42-NEXT: jne LBB24_29
4815 ; SSE42-NEXT: LBB24_30: ## %else41
4816 ; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
4817 ; SSE42-NEXT: jne LBB24_31
4818 ; SSE42-NEXT: LBB24_32: ## %else44
4819 ; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000
4820 ; SSE42-NEXT: jne LBB24_33
4821 ; SSE42-NEXT: LBB24_34: ## %else47
4822 ; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000
4823 ; SSE42-NEXT: jne LBB24_35
4824 ; SSE42-NEXT: LBB24_36: ## %else50
4825 ; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000
4826 ; SSE42-NEXT: jne LBB24_37
4827 ; SSE42-NEXT: LBB24_38: ## %else53
4828 ; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000
4829 ; SSE42-NEXT: jne LBB24_39
4830 ; SSE42-NEXT: LBB24_40: ## %else56
4831 ; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000
4832 ; SSE42-NEXT: jne LBB24_41
4833 ; SSE42-NEXT: LBB24_42: ## %else59
4834 ; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000
4835 ; SSE42-NEXT: jne LBB24_43
4836 ; SSE42-NEXT: LBB24_44: ## %else62
4837 ; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000
4838 ; SSE42-NEXT: jne LBB24_45
4839 ; SSE42-NEXT: LBB24_46: ## %else65
4840 ; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000
4841 ; SSE42-NEXT: jne LBB24_47
4842 ; SSE42-NEXT: LBB24_48: ## %else68
4843 ; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000
4844 ; SSE42-NEXT: jne LBB24_49
4845 ; SSE42-NEXT: LBB24_50: ## %else71
4846 ; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000
4847 ; SSE42-NEXT: jne LBB24_51
4848 ; SSE42-NEXT: LBB24_52: ## %else74
4849 ; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000
4850 ; SSE42-NEXT: jne LBB24_53
4851 ; SSE42-NEXT: LBB24_54: ## %else77
4852 ; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000
4853 ; SSE42-NEXT: jne LBB24_55
4854 ; SSE42-NEXT: LBB24_56: ## %else80
4855 ; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000
4856 ; SSE42-NEXT: jne LBB24_57
4857 ; SSE42-NEXT: LBB24_58: ## %else83
4858 ; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000
4859 ; SSE42-NEXT: jne LBB24_59
4860 ; SSE42-NEXT: LBB24_60: ## %else86
4861 ; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000
4862 ; SSE42-NEXT: jne LBB24_61
4863 ; SSE42-NEXT: LBB24_62: ## %else89
4864 ; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
4865 ; SSE42-NEXT: je LBB24_64
4866 ; SSE42-NEXT: LBB24_63: ## %cond.load91
4867 ; SSE42-NEXT: pinsrb $15, 31(%rdi), %xmm3
4868 ; SSE42-NEXT: LBB24_64: ## %else92
4869 ; SSE42-NEXT: movdqa %xmm2, %xmm0
4870 ; SSE42-NEXT: movdqa %xmm3, %xmm1
4872 ; SSE42-NEXT: LBB24_1: ## %cond.load
4873 ; SSE42-NEXT: pinsrb $0, (%rdi), %xmm2
4874 ; SSE42-NEXT: testb $2, %al
4875 ; SSE42-NEXT: je LBB24_4
4876 ; SSE42-NEXT: LBB24_3: ## %cond.load1
4877 ; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm2
4878 ; SSE42-NEXT: testb $4, %al
4879 ; SSE42-NEXT: je LBB24_6
4880 ; SSE42-NEXT: LBB24_5: ## %cond.load4
4881 ; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm2
4882 ; SSE42-NEXT: testb $8, %al
4883 ; SSE42-NEXT: je LBB24_8
4884 ; SSE42-NEXT: LBB24_7: ## %cond.load7
4885 ; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm2
4886 ; SSE42-NEXT: testb $16, %al
4887 ; SSE42-NEXT: je LBB24_10
4888 ; SSE42-NEXT: LBB24_9: ## %cond.load10
4889 ; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm2
4890 ; SSE42-NEXT: testb $32, %al
4891 ; SSE42-NEXT: je LBB24_12
4892 ; SSE42-NEXT: LBB24_11: ## %cond.load13
4893 ; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm2
4894 ; SSE42-NEXT: testb $64, %al
4895 ; SSE42-NEXT: je LBB24_14
4896 ; SSE42-NEXT: LBB24_13: ## %cond.load16
4897 ; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm2
4898 ; SSE42-NEXT: testb $-128, %al
4899 ; SSE42-NEXT: je LBB24_16
4900 ; SSE42-NEXT: LBB24_15: ## %cond.load19
4901 ; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm2
4902 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
4903 ; SSE42-NEXT: je LBB24_18
4904 ; SSE42-NEXT: LBB24_17: ## %cond.load22
4905 ; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm2
4906 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
4907 ; SSE42-NEXT: je LBB24_20
4908 ; SSE42-NEXT: LBB24_19: ## %cond.load25
4909 ; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm2
4910 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
4911 ; SSE42-NEXT: je LBB24_22
4912 ; SSE42-NEXT: LBB24_21: ## %cond.load28
4913 ; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm2
4914 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
4915 ; SSE42-NEXT: je LBB24_24
4916 ; SSE42-NEXT: LBB24_23: ## %cond.load31
4917 ; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm2
4918 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
4919 ; SSE42-NEXT: je LBB24_26
4920 ; SSE42-NEXT: LBB24_25: ## %cond.load34
4921 ; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm2
4922 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
4923 ; SSE42-NEXT: je LBB24_28
4924 ; SSE42-NEXT: LBB24_27: ## %cond.load37
4925 ; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm2
4926 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
4927 ; SSE42-NEXT: je LBB24_30
4928 ; SSE42-NEXT: LBB24_29: ## %cond.load40
4929 ; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm2
4930 ; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
4931 ; SSE42-NEXT: je LBB24_32
4932 ; SSE42-NEXT: LBB24_31: ## %cond.load43
4933 ; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm2
4934 ; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000
4935 ; SSE42-NEXT: je LBB24_34
4936 ; SSE42-NEXT: LBB24_33: ## %cond.load46
4937 ; SSE42-NEXT: pinsrb $0, 16(%rdi), %xmm3
4938 ; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000
4939 ; SSE42-NEXT: je LBB24_36
4940 ; SSE42-NEXT: LBB24_35: ## %cond.load49
4941 ; SSE42-NEXT: pinsrb $1, 17(%rdi), %xmm3
4942 ; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000
4943 ; SSE42-NEXT: je LBB24_38
4944 ; SSE42-NEXT: LBB24_37: ## %cond.load52
4945 ; SSE42-NEXT: pinsrb $2, 18(%rdi), %xmm3
4946 ; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000
4947 ; SSE42-NEXT: je LBB24_40
4948 ; SSE42-NEXT: LBB24_39: ## %cond.load55
4949 ; SSE42-NEXT: pinsrb $3, 19(%rdi), %xmm3
4950 ; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000
4951 ; SSE42-NEXT: je LBB24_42
4952 ; SSE42-NEXT: LBB24_41: ## %cond.load58
4953 ; SSE42-NEXT: pinsrb $4, 20(%rdi), %xmm3
4954 ; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000
4955 ; SSE42-NEXT: je LBB24_44
4956 ; SSE42-NEXT: LBB24_43: ## %cond.load61
4957 ; SSE42-NEXT: pinsrb $5, 21(%rdi), %xmm3
4958 ; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000
4959 ; SSE42-NEXT: je LBB24_46
4960 ; SSE42-NEXT: LBB24_45: ## %cond.load64
4961 ; SSE42-NEXT: pinsrb $6, 22(%rdi), %xmm3
4962 ; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000
4963 ; SSE42-NEXT: je LBB24_48
4964 ; SSE42-NEXT: LBB24_47: ## %cond.load67
4965 ; SSE42-NEXT: pinsrb $7, 23(%rdi), %xmm3
4966 ; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000
4967 ; SSE42-NEXT: je LBB24_50
4968 ; SSE42-NEXT: LBB24_49: ## %cond.load70
4969 ; SSE42-NEXT: pinsrb $8, 24(%rdi), %xmm3
4970 ; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000
4971 ; SSE42-NEXT: je LBB24_52
4972 ; SSE42-NEXT: LBB24_51: ## %cond.load73
4973 ; SSE42-NEXT: pinsrb $9, 25(%rdi), %xmm3
4974 ; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000
4975 ; SSE42-NEXT: je LBB24_54
4976 ; SSE42-NEXT: LBB24_53: ## %cond.load76
4977 ; SSE42-NEXT: pinsrb $10, 26(%rdi), %xmm3
4978 ; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000
4979 ; SSE42-NEXT: je LBB24_56
4980 ; SSE42-NEXT: LBB24_55: ## %cond.load79
4981 ; SSE42-NEXT: pinsrb $11, 27(%rdi), %xmm3
4982 ; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000
4983 ; SSE42-NEXT: je LBB24_58
4984 ; SSE42-NEXT: LBB24_57: ## %cond.load82
4985 ; SSE42-NEXT: pinsrb $12, 28(%rdi), %xmm3
4986 ; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000
4987 ; SSE42-NEXT: je LBB24_60
4988 ; SSE42-NEXT: LBB24_59: ## %cond.load85
4989 ; SSE42-NEXT: pinsrb $13, 29(%rdi), %xmm3
4990 ; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000
4991 ; SSE42-NEXT: je LBB24_62
4992 ; SSE42-NEXT: LBB24_61: ## %cond.load88
4993 ; SSE42-NEXT: pinsrb $14, 30(%rdi), %xmm3
4994 ; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
4995 ; SSE42-NEXT: jne LBB24_63
4996 ; SSE42-NEXT: jmp LBB24_64
4998 ; AVX1-LABEL: load_v32i8_v32i8:
5000 ; AVX1-NEXT: vpmovmskb %xmm0, %ecx
5001 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
5002 ; AVX1-NEXT: vpmovmskb %xmm0, %eax
5003 ; AVX1-NEXT: shll $16, %eax
5004 ; AVX1-NEXT: orl %ecx, %eax
5005 ; AVX1-NEXT: testb $1, %al
5006 ; AVX1-NEXT: jne LBB24_1
5007 ; AVX1-NEXT: ## %bb.2: ## %else
5008 ; AVX1-NEXT: testb $2, %al
5009 ; AVX1-NEXT: jne LBB24_3
5010 ; AVX1-NEXT: LBB24_4: ## %else2
5011 ; AVX1-NEXT: testb $4, %al
5012 ; AVX1-NEXT: jne LBB24_5
5013 ; AVX1-NEXT: LBB24_6: ## %else5
5014 ; AVX1-NEXT: testb $8, %al
5015 ; AVX1-NEXT: jne LBB24_7
5016 ; AVX1-NEXT: LBB24_8: ## %else8
5017 ; AVX1-NEXT: testb $16, %al
5018 ; AVX1-NEXT: jne LBB24_9
5019 ; AVX1-NEXT: LBB24_10: ## %else11
5020 ; AVX1-NEXT: testb $32, %al
5021 ; AVX1-NEXT: jne LBB24_11
5022 ; AVX1-NEXT: LBB24_12: ## %else14
5023 ; AVX1-NEXT: testb $64, %al
5024 ; AVX1-NEXT: jne LBB24_13
5025 ; AVX1-NEXT: LBB24_14: ## %else17
5026 ; AVX1-NEXT: testb $-128, %al
5027 ; AVX1-NEXT: jne LBB24_15
5028 ; AVX1-NEXT: LBB24_16: ## %else20
5029 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
5030 ; AVX1-NEXT: jne LBB24_17
5031 ; AVX1-NEXT: LBB24_18: ## %else23
5032 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
5033 ; AVX1-NEXT: jne LBB24_19
5034 ; AVX1-NEXT: LBB24_20: ## %else26
5035 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
5036 ; AVX1-NEXT: jne LBB24_21
5037 ; AVX1-NEXT: LBB24_22: ## %else29
5038 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
5039 ; AVX1-NEXT: jne LBB24_23
5040 ; AVX1-NEXT: LBB24_24: ## %else32
5041 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
5042 ; AVX1-NEXT: jne LBB24_25
5043 ; AVX1-NEXT: LBB24_26: ## %else35
5044 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
5045 ; AVX1-NEXT: jne LBB24_27
5046 ; AVX1-NEXT: LBB24_28: ## %else38
5047 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
5048 ; AVX1-NEXT: jne LBB24_29
5049 ; AVX1-NEXT: LBB24_30: ## %else41
5050 ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
5051 ; AVX1-NEXT: jne LBB24_31
5052 ; AVX1-NEXT: LBB24_32: ## %else44
5053 ; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
5054 ; AVX1-NEXT: jne LBB24_33
5055 ; AVX1-NEXT: LBB24_34: ## %else47
5056 ; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
5057 ; AVX1-NEXT: jne LBB24_35
5058 ; AVX1-NEXT: LBB24_36: ## %else50
5059 ; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
5060 ; AVX1-NEXT: jne LBB24_37
5061 ; AVX1-NEXT: LBB24_38: ## %else53
5062 ; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
5063 ; AVX1-NEXT: jne LBB24_39
5064 ; AVX1-NEXT: LBB24_40: ## %else56
5065 ; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
5066 ; AVX1-NEXT: jne LBB24_41
5067 ; AVX1-NEXT: LBB24_42: ## %else59
5068 ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
5069 ; AVX1-NEXT: jne LBB24_43
5070 ; AVX1-NEXT: LBB24_44: ## %else62
5071 ; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
5072 ; AVX1-NEXT: jne LBB24_45
5073 ; AVX1-NEXT: LBB24_46: ## %else65
5074 ; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
5075 ; AVX1-NEXT: jne LBB24_47
5076 ; AVX1-NEXT: LBB24_48: ## %else68
5077 ; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
5078 ; AVX1-NEXT: jne LBB24_49
5079 ; AVX1-NEXT: LBB24_50: ## %else71
5080 ; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
5081 ; AVX1-NEXT: jne LBB24_51
5082 ; AVX1-NEXT: LBB24_52: ## %else74
5083 ; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
5084 ; AVX1-NEXT: jne LBB24_53
5085 ; AVX1-NEXT: LBB24_54: ## %else77
5086 ; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
5087 ; AVX1-NEXT: jne LBB24_55
5088 ; AVX1-NEXT: LBB24_56: ## %else80
5089 ; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
5090 ; AVX1-NEXT: jne LBB24_57
5091 ; AVX1-NEXT: LBB24_58: ## %else83
5092 ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
5093 ; AVX1-NEXT: jne LBB24_59
5094 ; AVX1-NEXT: LBB24_60: ## %else86
5095 ; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5096 ; AVX1-NEXT: jne LBB24_61
5097 ; AVX1-NEXT: LBB24_62: ## %else89
5098 ; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5099 ; AVX1-NEXT: jne LBB24_63
5100 ; AVX1-NEXT: LBB24_64: ## %else92
5101 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
5103 ; AVX1-NEXT: LBB24_1: ## %cond.load
5104 ; AVX1-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5105 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5106 ; AVX1-NEXT: testb $2, %al
5107 ; AVX1-NEXT: je LBB24_4
5108 ; AVX1-NEXT: LBB24_3: ## %cond.load1
5109 ; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5110 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5111 ; AVX1-NEXT: testb $4, %al
5112 ; AVX1-NEXT: je LBB24_6
5113 ; AVX1-NEXT: LBB24_5: ## %cond.load4
5114 ; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5115 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5116 ; AVX1-NEXT: testb $8, %al
5117 ; AVX1-NEXT: je LBB24_8
5118 ; AVX1-NEXT: LBB24_7: ## %cond.load7
5119 ; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5120 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5121 ; AVX1-NEXT: testb $16, %al
5122 ; AVX1-NEXT: je LBB24_10
5123 ; AVX1-NEXT: LBB24_9: ## %cond.load10
5124 ; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5125 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5126 ; AVX1-NEXT: testb $32, %al
5127 ; AVX1-NEXT: je LBB24_12
5128 ; AVX1-NEXT: LBB24_11: ## %cond.load13
5129 ; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5130 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5131 ; AVX1-NEXT: testb $64, %al
5132 ; AVX1-NEXT: je LBB24_14
5133 ; AVX1-NEXT: LBB24_13: ## %cond.load16
5134 ; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5135 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5136 ; AVX1-NEXT: testb $-128, %al
5137 ; AVX1-NEXT: je LBB24_16
5138 ; AVX1-NEXT: LBB24_15: ## %cond.load19
5139 ; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5140 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5141 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
5142 ; AVX1-NEXT: je LBB24_18
5143 ; AVX1-NEXT: LBB24_17: ## %cond.load22
5144 ; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5145 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5146 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
5147 ; AVX1-NEXT: je LBB24_20
5148 ; AVX1-NEXT: LBB24_19: ## %cond.load25
5149 ; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5150 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5151 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
5152 ; AVX1-NEXT: je LBB24_22
5153 ; AVX1-NEXT: LBB24_21: ## %cond.load28
5154 ; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5155 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5156 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
5157 ; AVX1-NEXT: je LBB24_24
5158 ; AVX1-NEXT: LBB24_23: ## %cond.load31
5159 ; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5160 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5161 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
5162 ; AVX1-NEXT: je LBB24_26
5163 ; AVX1-NEXT: LBB24_25: ## %cond.load34
5164 ; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5165 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5166 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
5167 ; AVX1-NEXT: je LBB24_28
5168 ; AVX1-NEXT: LBB24_27: ## %cond.load37
5169 ; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5170 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5171 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
5172 ; AVX1-NEXT: je LBB24_30
5173 ; AVX1-NEXT: LBB24_29: ## %cond.load40
5174 ; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5175 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5176 ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
5177 ; AVX1-NEXT: je LBB24_32
5178 ; AVX1-NEXT: LBB24_31: ## %cond.load43
5179 ; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5180 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5181 ; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
5182 ; AVX1-NEXT: je LBB24_34
5183 ; AVX1-NEXT: LBB24_33: ## %cond.load46
5184 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5185 ; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5186 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5187 ; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
5188 ; AVX1-NEXT: je LBB24_36
5189 ; AVX1-NEXT: LBB24_35: ## %cond.load49
5190 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5191 ; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5192 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5193 ; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
5194 ; AVX1-NEXT: je LBB24_38
5195 ; AVX1-NEXT: LBB24_37: ## %cond.load52
5196 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5197 ; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5198 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5199 ; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
5200 ; AVX1-NEXT: je LBB24_40
5201 ; AVX1-NEXT: LBB24_39: ## %cond.load55
5202 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5203 ; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5204 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5205 ; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
5206 ; AVX1-NEXT: je LBB24_42
5207 ; AVX1-NEXT: LBB24_41: ## %cond.load58
5208 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5209 ; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5210 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5211 ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
5212 ; AVX1-NEXT: je LBB24_44
5213 ; AVX1-NEXT: LBB24_43: ## %cond.load61
5214 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5215 ; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5216 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5217 ; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
5218 ; AVX1-NEXT: je LBB24_46
5219 ; AVX1-NEXT: LBB24_45: ## %cond.load64
5220 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5221 ; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5222 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5223 ; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
5224 ; AVX1-NEXT: je LBB24_48
5225 ; AVX1-NEXT: LBB24_47: ## %cond.load67
5226 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5227 ; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5228 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5229 ; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
5230 ; AVX1-NEXT: je LBB24_50
5231 ; AVX1-NEXT: LBB24_49: ## %cond.load70
5232 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5233 ; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5234 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5235 ; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
5236 ; AVX1-NEXT: je LBB24_52
5237 ; AVX1-NEXT: LBB24_51: ## %cond.load73
5238 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5239 ; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5240 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5241 ; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
5242 ; AVX1-NEXT: je LBB24_54
5243 ; AVX1-NEXT: LBB24_53: ## %cond.load76
5244 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5245 ; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5246 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5247 ; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
5248 ; AVX1-NEXT: je LBB24_56
5249 ; AVX1-NEXT: LBB24_55: ## %cond.load79
5250 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5251 ; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5252 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5253 ; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
5254 ; AVX1-NEXT: je LBB24_58
5255 ; AVX1-NEXT: LBB24_57: ## %cond.load82
5256 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5257 ; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5258 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5259 ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
5260 ; AVX1-NEXT: je LBB24_60
5261 ; AVX1-NEXT: LBB24_59: ## %cond.load85
5262 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5263 ; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5264 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5265 ; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5266 ; AVX1-NEXT: je LBB24_62
5267 ; AVX1-NEXT: LBB24_61: ## %cond.load88
5268 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5269 ; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5270 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5271 ; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5272 ; AVX1-NEXT: je LBB24_64
5273 ; AVX1-NEXT: LBB24_63: ## %cond.load91
5274 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5275 ; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5276 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5277 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
5280 ; AVX2-LABEL: load_v32i8_v32i8:
5282 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
5283 ; AVX2-NEXT: testb $1, %al
5284 ; AVX2-NEXT: jne LBB24_1
5285 ; AVX2-NEXT: ## %bb.2: ## %else
5286 ; AVX2-NEXT: testb $2, %al
5287 ; AVX2-NEXT: jne LBB24_3
5288 ; AVX2-NEXT: LBB24_4: ## %else2
5289 ; AVX2-NEXT: testb $4, %al
5290 ; AVX2-NEXT: jne LBB24_5
5291 ; AVX2-NEXT: LBB24_6: ## %else5
5292 ; AVX2-NEXT: testb $8, %al
5293 ; AVX2-NEXT: jne LBB24_7
5294 ; AVX2-NEXT: LBB24_8: ## %else8
5295 ; AVX2-NEXT: testb $16, %al
5296 ; AVX2-NEXT: jne LBB24_9
5297 ; AVX2-NEXT: LBB24_10: ## %else11
5298 ; AVX2-NEXT: testb $32, %al
5299 ; AVX2-NEXT: jne LBB24_11
5300 ; AVX2-NEXT: LBB24_12: ## %else14
5301 ; AVX2-NEXT: testb $64, %al
5302 ; AVX2-NEXT: jne LBB24_13
5303 ; AVX2-NEXT: LBB24_14: ## %else17
5304 ; AVX2-NEXT: testb $-128, %al
5305 ; AVX2-NEXT: jne LBB24_15
5306 ; AVX2-NEXT: LBB24_16: ## %else20
5307 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
5308 ; AVX2-NEXT: jne LBB24_17
5309 ; AVX2-NEXT: LBB24_18: ## %else23
5310 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
5311 ; AVX2-NEXT: jne LBB24_19
5312 ; AVX2-NEXT: LBB24_20: ## %else26
5313 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
5314 ; AVX2-NEXT: jne LBB24_21
5315 ; AVX2-NEXT: LBB24_22: ## %else29
5316 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
5317 ; AVX2-NEXT: jne LBB24_23
5318 ; AVX2-NEXT: LBB24_24: ## %else32
5319 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
5320 ; AVX2-NEXT: jne LBB24_25
5321 ; AVX2-NEXT: LBB24_26: ## %else35
5322 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
5323 ; AVX2-NEXT: jne LBB24_27
5324 ; AVX2-NEXT: LBB24_28: ## %else38
5325 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
5326 ; AVX2-NEXT: jne LBB24_29
5327 ; AVX2-NEXT: LBB24_30: ## %else41
5328 ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
5329 ; AVX2-NEXT: jne LBB24_31
5330 ; AVX2-NEXT: LBB24_32: ## %else44
5331 ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
5332 ; AVX2-NEXT: jne LBB24_33
5333 ; AVX2-NEXT: LBB24_34: ## %else47
5334 ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
5335 ; AVX2-NEXT: jne LBB24_35
5336 ; AVX2-NEXT: LBB24_36: ## %else50
5337 ; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
5338 ; AVX2-NEXT: jne LBB24_37
5339 ; AVX2-NEXT: LBB24_38: ## %else53
5340 ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
5341 ; AVX2-NEXT: jne LBB24_39
5342 ; AVX2-NEXT: LBB24_40: ## %else56
5343 ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
5344 ; AVX2-NEXT: jne LBB24_41
5345 ; AVX2-NEXT: LBB24_42: ## %else59
5346 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
5347 ; AVX2-NEXT: jne LBB24_43
5348 ; AVX2-NEXT: LBB24_44: ## %else62
5349 ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
5350 ; AVX2-NEXT: jne LBB24_45
5351 ; AVX2-NEXT: LBB24_46: ## %else65
5352 ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
5353 ; AVX2-NEXT: jne LBB24_47
5354 ; AVX2-NEXT: LBB24_48: ## %else68
5355 ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
5356 ; AVX2-NEXT: jne LBB24_49
5357 ; AVX2-NEXT: LBB24_50: ## %else71
5358 ; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
5359 ; AVX2-NEXT: jne LBB24_51
5360 ; AVX2-NEXT: LBB24_52: ## %else74
5361 ; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
5362 ; AVX2-NEXT: jne LBB24_53
5363 ; AVX2-NEXT: LBB24_54: ## %else77
5364 ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
5365 ; AVX2-NEXT: jne LBB24_55
5366 ; AVX2-NEXT: LBB24_56: ## %else80
5367 ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
5368 ; AVX2-NEXT: jne LBB24_57
5369 ; AVX2-NEXT: LBB24_58: ## %else83
5370 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
5371 ; AVX2-NEXT: jne LBB24_59
5372 ; AVX2-NEXT: LBB24_60: ## %else86
5373 ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5374 ; AVX2-NEXT: jne LBB24_61
5375 ; AVX2-NEXT: LBB24_62: ## %else89
5376 ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5377 ; AVX2-NEXT: jne LBB24_63
5378 ; AVX2-NEXT: LBB24_64: ## %else92
5379 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
5381 ; AVX2-NEXT: LBB24_1: ## %cond.load
5382 ; AVX2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5383 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5384 ; AVX2-NEXT: testb $2, %al
5385 ; AVX2-NEXT: je LBB24_4
5386 ; AVX2-NEXT: LBB24_3: ## %cond.load1
5387 ; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5388 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5389 ; AVX2-NEXT: testb $4, %al
5390 ; AVX2-NEXT: je LBB24_6
5391 ; AVX2-NEXT: LBB24_5: ## %cond.load4
5392 ; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5393 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5394 ; AVX2-NEXT: testb $8, %al
5395 ; AVX2-NEXT: je LBB24_8
5396 ; AVX2-NEXT: LBB24_7: ## %cond.load7
5397 ; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5398 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5399 ; AVX2-NEXT: testb $16, %al
5400 ; AVX2-NEXT: je LBB24_10
5401 ; AVX2-NEXT: LBB24_9: ## %cond.load10
5402 ; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5403 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5404 ; AVX2-NEXT: testb $32, %al
5405 ; AVX2-NEXT: je LBB24_12
5406 ; AVX2-NEXT: LBB24_11: ## %cond.load13
5407 ; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5408 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5409 ; AVX2-NEXT: testb $64, %al
5410 ; AVX2-NEXT: je LBB24_14
5411 ; AVX2-NEXT: LBB24_13: ## %cond.load16
5412 ; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5413 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5414 ; AVX2-NEXT: testb $-128, %al
5415 ; AVX2-NEXT: je LBB24_16
5416 ; AVX2-NEXT: LBB24_15: ## %cond.load19
5417 ; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5418 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5419 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
5420 ; AVX2-NEXT: je LBB24_18
5421 ; AVX2-NEXT: LBB24_17: ## %cond.load22
5422 ; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5423 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5424 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
5425 ; AVX2-NEXT: je LBB24_20
5426 ; AVX2-NEXT: LBB24_19: ## %cond.load25
5427 ; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5428 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5429 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
5430 ; AVX2-NEXT: je LBB24_22
5431 ; AVX2-NEXT: LBB24_21: ## %cond.load28
5432 ; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5433 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5434 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
5435 ; AVX2-NEXT: je LBB24_24
5436 ; AVX2-NEXT: LBB24_23: ## %cond.load31
5437 ; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5438 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5439 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
5440 ; AVX2-NEXT: je LBB24_26
5441 ; AVX2-NEXT: LBB24_25: ## %cond.load34
5442 ; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5443 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5444 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
5445 ; AVX2-NEXT: je LBB24_28
5446 ; AVX2-NEXT: LBB24_27: ## %cond.load37
5447 ; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5448 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5449 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
5450 ; AVX2-NEXT: je LBB24_30
5451 ; AVX2-NEXT: LBB24_29: ## %cond.load40
5452 ; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5453 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5454 ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
5455 ; AVX2-NEXT: je LBB24_32
5456 ; AVX2-NEXT: LBB24_31: ## %cond.load43
5457 ; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5458 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5459 ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
5460 ; AVX2-NEXT: je LBB24_34
5461 ; AVX2-NEXT: LBB24_33: ## %cond.load46
5462 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5463 ; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5464 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5465 ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
5466 ; AVX2-NEXT: je LBB24_36
5467 ; AVX2-NEXT: LBB24_35: ## %cond.load49
5468 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5469 ; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5470 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5471 ; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
5472 ; AVX2-NEXT: je LBB24_38
5473 ; AVX2-NEXT: LBB24_37: ## %cond.load52
5474 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5475 ; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5476 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5477 ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
5478 ; AVX2-NEXT: je LBB24_40
5479 ; AVX2-NEXT: LBB24_39: ## %cond.load55
5480 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5481 ; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5482 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5483 ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
5484 ; AVX2-NEXT: je LBB24_42
5485 ; AVX2-NEXT: LBB24_41: ## %cond.load58
5486 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5487 ; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5488 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5489 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
5490 ; AVX2-NEXT: je LBB24_44
5491 ; AVX2-NEXT: LBB24_43: ## %cond.load61
5492 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5493 ; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5494 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5495 ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
5496 ; AVX2-NEXT: je LBB24_46
5497 ; AVX2-NEXT: LBB24_45: ## %cond.load64
5498 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5499 ; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5500 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5501 ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
5502 ; AVX2-NEXT: je LBB24_48
5503 ; AVX2-NEXT: LBB24_47: ## %cond.load67
5504 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5505 ; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5506 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5507 ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
5508 ; AVX2-NEXT: je LBB24_50
5509 ; AVX2-NEXT: LBB24_49: ## %cond.load70
5510 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5511 ; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5512 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5513 ; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
5514 ; AVX2-NEXT: je LBB24_52
5515 ; AVX2-NEXT: LBB24_51: ## %cond.load73
5516 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5517 ; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5518 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5519 ; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
5520 ; AVX2-NEXT: je LBB24_54
5521 ; AVX2-NEXT: LBB24_53: ## %cond.load76
5522 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5523 ; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5524 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5525 ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
5526 ; AVX2-NEXT: je LBB24_56
5527 ; AVX2-NEXT: LBB24_55: ## %cond.load79
5528 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5529 ; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5530 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5531 ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
5532 ; AVX2-NEXT: je LBB24_58
5533 ; AVX2-NEXT: LBB24_57: ## %cond.load82
5534 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5535 ; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5536 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5537 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
5538 ; AVX2-NEXT: je LBB24_60
5539 ; AVX2-NEXT: LBB24_59: ## %cond.load85
5540 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5541 ; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5542 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5543 ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5544 ; AVX2-NEXT: je LBB24_62
5545 ; AVX2-NEXT: LBB24_61: ## %cond.load88
5546 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5547 ; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5548 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5549 ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5550 ; AVX2-NEXT: je LBB24_64
5551 ; AVX2-NEXT: LBB24_63: ## %cond.load91
5552 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5553 ; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5554 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5555 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
5558 ; AVX512F-LABEL: load_v32i8_v32i8:
5559 ; AVX512F: ## %bb.0:
5560 ; AVX512F-NEXT: vpmovmskb %ymm0, %eax
5561 ; AVX512F-NEXT: testb $1, %al
5562 ; AVX512F-NEXT: jne LBB24_1
5563 ; AVX512F-NEXT: ## %bb.2: ## %else
5564 ; AVX512F-NEXT: testb $2, %al
5565 ; AVX512F-NEXT: jne LBB24_3
5566 ; AVX512F-NEXT: LBB24_4: ## %else2
5567 ; AVX512F-NEXT: testb $4, %al
5568 ; AVX512F-NEXT: jne LBB24_5
5569 ; AVX512F-NEXT: LBB24_6: ## %else5
5570 ; AVX512F-NEXT: testb $8, %al
5571 ; AVX512F-NEXT: jne LBB24_7
5572 ; AVX512F-NEXT: LBB24_8: ## %else8
5573 ; AVX512F-NEXT: testb $16, %al
5574 ; AVX512F-NEXT: jne LBB24_9
5575 ; AVX512F-NEXT: LBB24_10: ## %else11
5576 ; AVX512F-NEXT: testb $32, %al
5577 ; AVX512F-NEXT: jne LBB24_11
5578 ; AVX512F-NEXT: LBB24_12: ## %else14
5579 ; AVX512F-NEXT: testb $64, %al
5580 ; AVX512F-NEXT: jne LBB24_13
5581 ; AVX512F-NEXT: LBB24_14: ## %else17
5582 ; AVX512F-NEXT: testb $-128, %al
5583 ; AVX512F-NEXT: jne LBB24_15
5584 ; AVX512F-NEXT: LBB24_16: ## %else20
5585 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
5586 ; AVX512F-NEXT: jne LBB24_17
5587 ; AVX512F-NEXT: LBB24_18: ## %else23
5588 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
5589 ; AVX512F-NEXT: jne LBB24_19
5590 ; AVX512F-NEXT: LBB24_20: ## %else26
5591 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
5592 ; AVX512F-NEXT: jne LBB24_21
5593 ; AVX512F-NEXT: LBB24_22: ## %else29
5594 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
5595 ; AVX512F-NEXT: jne LBB24_23
5596 ; AVX512F-NEXT: LBB24_24: ## %else32
5597 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
5598 ; AVX512F-NEXT: jne LBB24_25
5599 ; AVX512F-NEXT: LBB24_26: ## %else35
5600 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
5601 ; AVX512F-NEXT: jne LBB24_27
5602 ; AVX512F-NEXT: LBB24_28: ## %else38
5603 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
5604 ; AVX512F-NEXT: jne LBB24_29
5605 ; AVX512F-NEXT: LBB24_30: ## %else41
5606 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
5607 ; AVX512F-NEXT: jne LBB24_31
5608 ; AVX512F-NEXT: LBB24_32: ## %else44
5609 ; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
5610 ; AVX512F-NEXT: jne LBB24_33
5611 ; AVX512F-NEXT: LBB24_34: ## %else47
5612 ; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
5613 ; AVX512F-NEXT: jne LBB24_35
5614 ; AVX512F-NEXT: LBB24_36: ## %else50
5615 ; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
5616 ; AVX512F-NEXT: jne LBB24_37
5617 ; AVX512F-NEXT: LBB24_38: ## %else53
5618 ; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
5619 ; AVX512F-NEXT: jne LBB24_39
5620 ; AVX512F-NEXT: LBB24_40: ## %else56
5621 ; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
5622 ; AVX512F-NEXT: jne LBB24_41
5623 ; AVX512F-NEXT: LBB24_42: ## %else59
5624 ; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
5625 ; AVX512F-NEXT: jne LBB24_43
5626 ; AVX512F-NEXT: LBB24_44: ## %else62
5627 ; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
5628 ; AVX512F-NEXT: jne LBB24_45
5629 ; AVX512F-NEXT: LBB24_46: ## %else65
5630 ; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
5631 ; AVX512F-NEXT: jne LBB24_47
5632 ; AVX512F-NEXT: LBB24_48: ## %else68
5633 ; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
5634 ; AVX512F-NEXT: jne LBB24_49
5635 ; AVX512F-NEXT: LBB24_50: ## %else71
5636 ; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
5637 ; AVX512F-NEXT: jne LBB24_51
5638 ; AVX512F-NEXT: LBB24_52: ## %else74
5639 ; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
5640 ; AVX512F-NEXT: jne LBB24_53
5641 ; AVX512F-NEXT: LBB24_54: ## %else77
5642 ; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
5643 ; AVX512F-NEXT: jne LBB24_55
5644 ; AVX512F-NEXT: LBB24_56: ## %else80
5645 ; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
5646 ; AVX512F-NEXT: jne LBB24_57
5647 ; AVX512F-NEXT: LBB24_58: ## %else83
5648 ; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
5649 ; AVX512F-NEXT: jne LBB24_59
5650 ; AVX512F-NEXT: LBB24_60: ## %else86
5651 ; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5652 ; AVX512F-NEXT: jne LBB24_61
5653 ; AVX512F-NEXT: LBB24_62: ## %else89
5654 ; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5655 ; AVX512F-NEXT: jne LBB24_63
5656 ; AVX512F-NEXT: LBB24_64: ## %else92
5657 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
5658 ; AVX512F-NEXT: retq
5659 ; AVX512F-NEXT: LBB24_1: ## %cond.load
5660 ; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5661 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5662 ; AVX512F-NEXT: testb $2, %al
5663 ; AVX512F-NEXT: je LBB24_4
5664 ; AVX512F-NEXT: LBB24_3: ## %cond.load1
5665 ; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5666 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5667 ; AVX512F-NEXT: testb $4, %al
5668 ; AVX512F-NEXT: je LBB24_6
5669 ; AVX512F-NEXT: LBB24_5: ## %cond.load4
5670 ; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5671 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5672 ; AVX512F-NEXT: testb $8, %al
5673 ; AVX512F-NEXT: je LBB24_8
5674 ; AVX512F-NEXT: LBB24_7: ## %cond.load7
5675 ; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5676 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5677 ; AVX512F-NEXT: testb $16, %al
5678 ; AVX512F-NEXT: je LBB24_10
5679 ; AVX512F-NEXT: LBB24_9: ## %cond.load10
5680 ; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5681 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5682 ; AVX512F-NEXT: testb $32, %al
5683 ; AVX512F-NEXT: je LBB24_12
5684 ; AVX512F-NEXT: LBB24_11: ## %cond.load13
5685 ; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5686 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5687 ; AVX512F-NEXT: testb $64, %al
5688 ; AVX512F-NEXT: je LBB24_14
5689 ; AVX512F-NEXT: LBB24_13: ## %cond.load16
5690 ; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5691 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5692 ; AVX512F-NEXT: testb $-128, %al
5693 ; AVX512F-NEXT: je LBB24_16
5694 ; AVX512F-NEXT: LBB24_15: ## %cond.load19
5695 ; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5696 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5697 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
5698 ; AVX512F-NEXT: je LBB24_18
5699 ; AVX512F-NEXT: LBB24_17: ## %cond.load22
5700 ; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5701 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5702 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
5703 ; AVX512F-NEXT: je LBB24_20
5704 ; AVX512F-NEXT: LBB24_19: ## %cond.load25
5705 ; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5706 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5707 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
5708 ; AVX512F-NEXT: je LBB24_22
5709 ; AVX512F-NEXT: LBB24_21: ## %cond.load28
5710 ; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5711 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5712 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
5713 ; AVX512F-NEXT: je LBB24_24
5714 ; AVX512F-NEXT: LBB24_23: ## %cond.load31
5715 ; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5716 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5717 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
5718 ; AVX512F-NEXT: je LBB24_26
5719 ; AVX512F-NEXT: LBB24_25: ## %cond.load34
5720 ; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5721 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5722 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
5723 ; AVX512F-NEXT: je LBB24_28
5724 ; AVX512F-NEXT: LBB24_27: ## %cond.load37
5725 ; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5726 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5727 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
5728 ; AVX512F-NEXT: je LBB24_30
5729 ; AVX512F-NEXT: LBB24_29: ## %cond.load40
5730 ; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5731 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5732 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
5733 ; AVX512F-NEXT: je LBB24_32
5734 ; AVX512F-NEXT: LBB24_31: ## %cond.load43
5735 ; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5736 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5737 ; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
5738 ; AVX512F-NEXT: je LBB24_34
5739 ; AVX512F-NEXT: LBB24_33: ## %cond.load46
5740 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5741 ; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5742 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5743 ; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
5744 ; AVX512F-NEXT: je LBB24_36
5745 ; AVX512F-NEXT: LBB24_35: ## %cond.load49
5746 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5747 ; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5748 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5749 ; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
5750 ; AVX512F-NEXT: je LBB24_38
5751 ; AVX512F-NEXT: LBB24_37: ## %cond.load52
5752 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5753 ; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5754 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5755 ; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
5756 ; AVX512F-NEXT: je LBB24_40
5757 ; AVX512F-NEXT: LBB24_39: ## %cond.load55
5758 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5759 ; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5760 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5761 ; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
5762 ; AVX512F-NEXT: je LBB24_42
5763 ; AVX512F-NEXT: LBB24_41: ## %cond.load58
5764 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5765 ; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5766 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5767 ; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
5768 ; AVX512F-NEXT: je LBB24_44
5769 ; AVX512F-NEXT: LBB24_43: ## %cond.load61
5770 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5771 ; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5772 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5773 ; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
5774 ; AVX512F-NEXT: je LBB24_46
5775 ; AVX512F-NEXT: LBB24_45: ## %cond.load64
5776 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5777 ; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5778 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5779 ; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
5780 ; AVX512F-NEXT: je LBB24_48
5781 ; AVX512F-NEXT: LBB24_47: ## %cond.load67
5782 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5783 ; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5784 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5785 ; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
5786 ; AVX512F-NEXT: je LBB24_50
5787 ; AVX512F-NEXT: LBB24_49: ## %cond.load70
5788 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5789 ; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5790 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5791 ; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
5792 ; AVX512F-NEXT: je LBB24_52
5793 ; AVX512F-NEXT: LBB24_51: ## %cond.load73
5794 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5795 ; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5796 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5797 ; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
5798 ; AVX512F-NEXT: je LBB24_54
5799 ; AVX512F-NEXT: LBB24_53: ## %cond.load76
5800 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5801 ; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5802 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5803 ; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
5804 ; AVX512F-NEXT: je LBB24_56
5805 ; AVX512F-NEXT: LBB24_55: ## %cond.load79
5806 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5807 ; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5808 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5809 ; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
5810 ; AVX512F-NEXT: je LBB24_58
5811 ; AVX512F-NEXT: LBB24_57: ## %cond.load82
5812 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5813 ; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5814 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5815 ; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
5816 ; AVX512F-NEXT: je LBB24_60
5817 ; AVX512F-NEXT: LBB24_59: ## %cond.load85
5818 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5819 ; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5820 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5821 ; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5822 ; AVX512F-NEXT: je LBB24_62
5823 ; AVX512F-NEXT: LBB24_61: ## %cond.load88
5824 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5825 ; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5826 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5827 ; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5828 ; AVX512F-NEXT: je LBB24_64
5829 ; AVX512F-NEXT: LBB24_63: ## %cond.load91
5830 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5831 ; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5832 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5833 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
5834 ; AVX512F-NEXT: retq
5836 ; AVX512VLDQ-LABEL: load_v32i8_v32i8:
5837 ; AVX512VLDQ: ## %bb.0:
5838 ; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax
5839 ; AVX512VLDQ-NEXT: testb $1, %al
5840 ; AVX512VLDQ-NEXT: jne LBB24_1
5841 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
5842 ; AVX512VLDQ-NEXT: testb $2, %al
5843 ; AVX512VLDQ-NEXT: jne LBB24_3
5844 ; AVX512VLDQ-NEXT: LBB24_4: ## %else2
5845 ; AVX512VLDQ-NEXT: testb $4, %al
5846 ; AVX512VLDQ-NEXT: jne LBB24_5
5847 ; AVX512VLDQ-NEXT: LBB24_6: ## %else5
5848 ; AVX512VLDQ-NEXT: testb $8, %al
5849 ; AVX512VLDQ-NEXT: jne LBB24_7
5850 ; AVX512VLDQ-NEXT: LBB24_8: ## %else8
5851 ; AVX512VLDQ-NEXT: testb $16, %al
5852 ; AVX512VLDQ-NEXT: jne LBB24_9
5853 ; AVX512VLDQ-NEXT: LBB24_10: ## %else11
5854 ; AVX512VLDQ-NEXT: testb $32, %al
5855 ; AVX512VLDQ-NEXT: jne LBB24_11
5856 ; AVX512VLDQ-NEXT: LBB24_12: ## %else14
5857 ; AVX512VLDQ-NEXT: testb $64, %al
5858 ; AVX512VLDQ-NEXT: jne LBB24_13
5859 ; AVX512VLDQ-NEXT: LBB24_14: ## %else17
5860 ; AVX512VLDQ-NEXT: testb $-128, %al
5861 ; AVX512VLDQ-NEXT: jne LBB24_15
5862 ; AVX512VLDQ-NEXT: LBB24_16: ## %else20
5863 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
5864 ; AVX512VLDQ-NEXT: jne LBB24_17
5865 ; AVX512VLDQ-NEXT: LBB24_18: ## %else23
5866 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
5867 ; AVX512VLDQ-NEXT: jne LBB24_19
5868 ; AVX512VLDQ-NEXT: LBB24_20: ## %else26
5869 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
5870 ; AVX512VLDQ-NEXT: jne LBB24_21
5871 ; AVX512VLDQ-NEXT: LBB24_22: ## %else29
5872 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
5873 ; AVX512VLDQ-NEXT: jne LBB24_23
5874 ; AVX512VLDQ-NEXT: LBB24_24: ## %else32
5875 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
5876 ; AVX512VLDQ-NEXT: jne LBB24_25
5877 ; AVX512VLDQ-NEXT: LBB24_26: ## %else35
5878 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
5879 ; AVX512VLDQ-NEXT: jne LBB24_27
5880 ; AVX512VLDQ-NEXT: LBB24_28: ## %else38
5881 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
5882 ; AVX512VLDQ-NEXT: jne LBB24_29
5883 ; AVX512VLDQ-NEXT: LBB24_30: ## %else41
5884 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
5885 ; AVX512VLDQ-NEXT: jne LBB24_31
5886 ; AVX512VLDQ-NEXT: LBB24_32: ## %else44
5887 ; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
5888 ; AVX512VLDQ-NEXT: jne LBB24_33
5889 ; AVX512VLDQ-NEXT: LBB24_34: ## %else47
5890 ; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
5891 ; AVX512VLDQ-NEXT: jne LBB24_35
5892 ; AVX512VLDQ-NEXT: LBB24_36: ## %else50
5893 ; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
5894 ; AVX512VLDQ-NEXT: jne LBB24_37
5895 ; AVX512VLDQ-NEXT: LBB24_38: ## %else53
5896 ; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
5897 ; AVX512VLDQ-NEXT: jne LBB24_39
5898 ; AVX512VLDQ-NEXT: LBB24_40: ## %else56
5899 ; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
5900 ; AVX512VLDQ-NEXT: jne LBB24_41
5901 ; AVX512VLDQ-NEXT: LBB24_42: ## %else59
5902 ; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
5903 ; AVX512VLDQ-NEXT: jne LBB24_43
5904 ; AVX512VLDQ-NEXT: LBB24_44: ## %else62
5905 ; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
5906 ; AVX512VLDQ-NEXT: jne LBB24_45
5907 ; AVX512VLDQ-NEXT: LBB24_46: ## %else65
5908 ; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
5909 ; AVX512VLDQ-NEXT: jne LBB24_47
5910 ; AVX512VLDQ-NEXT: LBB24_48: ## %else68
5911 ; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
5912 ; AVX512VLDQ-NEXT: jne LBB24_49
5913 ; AVX512VLDQ-NEXT: LBB24_50: ## %else71
5914 ; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
5915 ; AVX512VLDQ-NEXT: jne LBB24_51
5916 ; AVX512VLDQ-NEXT: LBB24_52: ## %else74
5917 ; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
5918 ; AVX512VLDQ-NEXT: jne LBB24_53
5919 ; AVX512VLDQ-NEXT: LBB24_54: ## %else77
5920 ; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
5921 ; AVX512VLDQ-NEXT: jne LBB24_55
5922 ; AVX512VLDQ-NEXT: LBB24_56: ## %else80
5923 ; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
5924 ; AVX512VLDQ-NEXT: jne LBB24_57
5925 ; AVX512VLDQ-NEXT: LBB24_58: ## %else83
5926 ; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
5927 ; AVX512VLDQ-NEXT: jne LBB24_59
5928 ; AVX512VLDQ-NEXT: LBB24_60: ## %else86
5929 ; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5930 ; AVX512VLDQ-NEXT: jne LBB24_61
5931 ; AVX512VLDQ-NEXT: LBB24_62: ## %else89
5932 ; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5933 ; AVX512VLDQ-NEXT: jne LBB24_63
5934 ; AVX512VLDQ-NEXT: LBB24_64: ## %else92
5935 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
5936 ; AVX512VLDQ-NEXT: retq
5937 ; AVX512VLDQ-NEXT: LBB24_1: ## %cond.load
5938 ; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5939 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5940 ; AVX512VLDQ-NEXT: testb $2, %al
5941 ; AVX512VLDQ-NEXT: je LBB24_4
5942 ; AVX512VLDQ-NEXT: LBB24_3: ## %cond.load1
5943 ; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5944 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5945 ; AVX512VLDQ-NEXT: testb $4, %al
5946 ; AVX512VLDQ-NEXT: je LBB24_6
5947 ; AVX512VLDQ-NEXT: LBB24_5: ## %cond.load4
5948 ; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5949 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5950 ; AVX512VLDQ-NEXT: testb $8, %al
5951 ; AVX512VLDQ-NEXT: je LBB24_8
5952 ; AVX512VLDQ-NEXT: LBB24_7: ## %cond.load7
5953 ; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5954 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5955 ; AVX512VLDQ-NEXT: testb $16, %al
5956 ; AVX512VLDQ-NEXT: je LBB24_10
5957 ; AVX512VLDQ-NEXT: LBB24_9: ## %cond.load10
5958 ; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5959 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5960 ; AVX512VLDQ-NEXT: testb $32, %al
5961 ; AVX512VLDQ-NEXT: je LBB24_12
5962 ; AVX512VLDQ-NEXT: LBB24_11: ## %cond.load13
5963 ; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5964 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5965 ; AVX512VLDQ-NEXT: testb $64, %al
5966 ; AVX512VLDQ-NEXT: je LBB24_14
5967 ; AVX512VLDQ-NEXT: LBB24_13: ## %cond.load16
5968 ; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5969 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5970 ; AVX512VLDQ-NEXT: testb $-128, %al
5971 ; AVX512VLDQ-NEXT: je LBB24_16
5972 ; AVX512VLDQ-NEXT: LBB24_15: ## %cond.load19
5973 ; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5974 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5975 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
5976 ; AVX512VLDQ-NEXT: je LBB24_18
5977 ; AVX512VLDQ-NEXT: LBB24_17: ## %cond.load22
5978 ; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5979 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5980 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
5981 ; AVX512VLDQ-NEXT: je LBB24_20
5982 ; AVX512VLDQ-NEXT: LBB24_19: ## %cond.load25
5983 ; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5984 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5985 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
5986 ; AVX512VLDQ-NEXT: je LBB24_22
5987 ; AVX512VLDQ-NEXT: LBB24_21: ## %cond.load28
5988 ; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5989 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5990 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
5991 ; AVX512VLDQ-NEXT: je LBB24_24
5992 ; AVX512VLDQ-NEXT: LBB24_23: ## %cond.load31
5993 ; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5994 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5995 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
5996 ; AVX512VLDQ-NEXT: je LBB24_26
5997 ; AVX512VLDQ-NEXT: LBB24_25: ## %cond.load34
5998 ; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5999 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6000 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
6001 ; AVX512VLDQ-NEXT: je LBB24_28
6002 ; AVX512VLDQ-NEXT: LBB24_27: ## %cond.load37
6003 ; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
6004 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6005 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
6006 ; AVX512VLDQ-NEXT: je LBB24_30
6007 ; AVX512VLDQ-NEXT: LBB24_29: ## %cond.load40
6008 ; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
6009 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6010 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
6011 ; AVX512VLDQ-NEXT: je LBB24_32
6012 ; AVX512VLDQ-NEXT: LBB24_31: ## %cond.load43
6013 ; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
6014 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6015 ; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
6016 ; AVX512VLDQ-NEXT: je LBB24_34
6017 ; AVX512VLDQ-NEXT: LBB24_33: ## %cond.load46
6018 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6019 ; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
6020 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6021 ; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
6022 ; AVX512VLDQ-NEXT: je LBB24_36
6023 ; AVX512VLDQ-NEXT: LBB24_35: ## %cond.load49
6024 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6025 ; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
6026 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6027 ; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
6028 ; AVX512VLDQ-NEXT: je LBB24_38
6029 ; AVX512VLDQ-NEXT: LBB24_37: ## %cond.load52
6030 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6031 ; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
6032 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6033 ; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
6034 ; AVX512VLDQ-NEXT: je LBB24_40
6035 ; AVX512VLDQ-NEXT: LBB24_39: ## %cond.load55
6036 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6037 ; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
6038 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6039 ; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
6040 ; AVX512VLDQ-NEXT: je LBB24_42
6041 ; AVX512VLDQ-NEXT: LBB24_41: ## %cond.load58
6042 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6043 ; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
6044 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6045 ; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
6046 ; AVX512VLDQ-NEXT: je LBB24_44
6047 ; AVX512VLDQ-NEXT: LBB24_43: ## %cond.load61
6048 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6049 ; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
6050 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6051 ; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
6052 ; AVX512VLDQ-NEXT: je LBB24_46
6053 ; AVX512VLDQ-NEXT: LBB24_45: ## %cond.load64
6054 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6055 ; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
6056 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6057 ; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
6058 ; AVX512VLDQ-NEXT: je LBB24_48
6059 ; AVX512VLDQ-NEXT: LBB24_47: ## %cond.load67
6060 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6061 ; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
6062 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6063 ; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
6064 ; AVX512VLDQ-NEXT: je LBB24_50
6065 ; AVX512VLDQ-NEXT: LBB24_49: ## %cond.load70
6066 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6067 ; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
6068 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6069 ; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
6070 ; AVX512VLDQ-NEXT: je LBB24_52
6071 ; AVX512VLDQ-NEXT: LBB24_51: ## %cond.load73
6072 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6073 ; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
6074 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6075 ; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
6076 ; AVX512VLDQ-NEXT: je LBB24_54
6077 ; AVX512VLDQ-NEXT: LBB24_53: ## %cond.load76
6078 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6079 ; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
6080 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6081 ; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
6082 ; AVX512VLDQ-NEXT: je LBB24_56
6083 ; AVX512VLDQ-NEXT: LBB24_55: ## %cond.load79
6084 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6085 ; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
6086 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6087 ; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
6088 ; AVX512VLDQ-NEXT: je LBB24_58
6089 ; AVX512VLDQ-NEXT: LBB24_57: ## %cond.load82
6090 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6091 ; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
6092 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6093 ; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
6094 ; AVX512VLDQ-NEXT: je LBB24_60
6095 ; AVX512VLDQ-NEXT: LBB24_59: ## %cond.load85
6096 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6097 ; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
6098 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6099 ; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
6100 ; AVX512VLDQ-NEXT: je LBB24_62
6101 ; AVX512VLDQ-NEXT: LBB24_61: ## %cond.load88
6102 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6103 ; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
6104 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6105 ; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
6106 ; AVX512VLDQ-NEXT: je LBB24_64
6107 ; AVX512VLDQ-NEXT: LBB24_63: ## %cond.load91
6108 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6109 ; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
6110 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6111 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
6112 ; AVX512VLDQ-NEXT: retq
6114 ; AVX512VLBW-LABEL: load_v32i8_v32i8:
6115 ; AVX512VLBW: ## %bb.0:
6116 ; AVX512VLBW-NEXT: vpmovb2m %ymm0, %k1
6117 ; AVX512VLBW-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
6118 ; AVX512VLBW-NEXT: retq
6119 %mask = icmp slt <32 x i8> %trigger, zeroinitializer
6120 %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1> %mask, <32 x i8> %dst)
6124 ;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
6126 ; 128-bit FP vectors are supported with AVX.
6128 define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
6129 ; SSE2-LABEL: mload_constmask_v4f32:
6131 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6132 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6133 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6134 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6135 ; SSE2-NEXT: movaps %xmm0, %xmm1
6136 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
6137 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0]
6138 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
6141 ; SSE42-LABEL: mload_constmask_v4f32:
6143 ; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6144 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6145 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
6146 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
6149 ; AVX1OR2-LABEL: mload_constmask_v4f32:
6150 ; AVX1OR2: ## %bb.0:
6151 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
6152 ; AVX1OR2-NEXT: retq
6154 ; AVX512F-LABEL: mload_constmask_v4f32:
6155 ; AVX512F: ## %bb.0:
6156 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
6157 ; AVX512F-NEXT: movw $13, %ax
6158 ; AVX512F-NEXT: kmovw %eax, %k1
6159 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
6160 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
6161 ; AVX512F-NEXT: vzeroupper
6162 ; AVX512F-NEXT: retq
6164 ; AVX512VLDQ-LABEL: mload_constmask_v4f32:
6165 ; AVX512VLDQ: ## %bb.0:
6166 ; AVX512VLDQ-NEXT: movb $13, %al
6167 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6168 ; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1}
6169 ; AVX512VLDQ-NEXT: retq
6171 ; AVX512VLBW-LABEL: mload_constmask_v4f32:
6172 ; AVX512VLBW: ## %bb.0:
6173 ; AVX512VLBW-NEXT: movb $13, %al
6174 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6175 ; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1}
6176 ; AVX512VLBW-NEXT: retq
; Constant mask <1,0,1,1> (bit pattern 0b1101 = 13, matching the movw/movb $13
; mask-register immediates in the AVX512 lowerings above): lanes 0, 2, 3 are
; loaded from %addr and lane 1 is passed through from %dst.
6177 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
6178 ret <4 x float> %res
6181 define <4 x float> @mload_constmask_v4f32_all(<4 x float>* %addr) {
6182 ; SSE-LABEL: mload_constmask_v4f32_all:
6184 ; SSE-NEXT: movups (%rdi), %xmm0
6187 ; AVX1OR2-LABEL: mload_constmask_v4f32_all:
6188 ; AVX1OR2: ## %bb.0:
6189 ; AVX1OR2-NEXT: vmovups (%rdi), %xmm0
6190 ; AVX1OR2-NEXT: retq
6192 ; AVX512F-LABEL: mload_constmask_v4f32_all:
6193 ; AVX512F: ## %bb.0:
6194 ; AVX512F-NEXT: movw $15, %ax
6195 ; AVX512F-NEXT: kmovw %eax, %k1
6196 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
6197 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
6198 ; AVX512F-NEXT: vzeroupper
6199 ; AVX512F-NEXT: retq
6201 ; AVX512VL-LABEL: mload_constmask_v4f32_all:
6202 ; AVX512VL: ## %bb.0:
6203 ; AVX512VL-NEXT: kxnorw %k0, %k0, %k1
6204 ; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
6205 ; AVX512VL-NEXT: retq
; All-true mask with an undef passthru: on SSE/AVX this degenerates to a plain
; unaligned vector load (movups/vmovups) with no blend, per the checks above.
6206 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
6207 ret <4 x float> %res
6210 define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
6211 ; SSE-LABEL: mload_constmask_v2f64:
6213 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
6216 ; AVX-LABEL: mload_constmask_v2f64:
6218 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; Constant mask <0,1>: only the high element is loaded, so the masked load
; folds to a (v)movhps that merges the loaded double into the upper half of %dst.
6220 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
6221 ret <2 x double> %res
6224 ; 128-bit integer vectors are supported with AVX2.
6226 define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
6227 ; SSE2-LABEL: mload_constmask_v4i32:
6229 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6230 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
6231 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6232 ; SSE2-NEXT: movaps %xmm1, %xmm2
6233 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2]
6234 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6235 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
6236 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
6237 ; SSE2-NEXT: movaps %xmm1, %xmm0
6240 ; SSE42-LABEL: mload_constmask_v4i32:
6242 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
6243 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
6244 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
6247 ; AVX1-LABEL: mload_constmask_v4i32:
6249 ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
6250 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
6251 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
6254 ; AVX2-LABEL: mload_constmask_v4i32:
6256 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
6257 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
6258 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
6261 ; AVX512F-LABEL: mload_constmask_v4i32:
6262 ; AVX512F: ## %bb.0:
6263 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
6264 ; AVX512F-NEXT: movw $14, %ax
6265 ; AVX512F-NEXT: kmovw %eax, %k1
6266 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
6267 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
6268 ; AVX512F-NEXT: vzeroupper
6269 ; AVX512F-NEXT: retq
6271 ; AVX512VLDQ-LABEL: mload_constmask_v4i32:
6272 ; AVX512VLDQ: ## %bb.0:
6273 ; AVX512VLDQ-NEXT: movb $14, %al
6274 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6275 ; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
6276 ; AVX512VLDQ-NEXT: retq
6278 ; AVX512VLBW-LABEL: mload_constmask_v4i32:
6279 ; AVX512VLBW: ## %bb.0:
6280 ; AVX512VLBW-NEXT: movb $14, %al
6281 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6282 ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
6283 ; AVX512VLBW-NEXT: retq
; Constant mask <0,1,1,1> (bit pattern 0b1110 = 14, matching the movw/movb $14
; mask-register immediates in the AVX512 lowerings above): lanes 1-3 are loaded
; from %addr and lane 0 is passed through from %dst.
6284 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
6288 define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
6289 ; SSE2-LABEL: mload_constmask_v2i64:
6291 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
6292 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6295 ; SSE42-LABEL: mload_constmask_v2i64:
6297 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm0
6300 ; AVX-LABEL: mload_constmask_v2i64:
6302 ; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
6304 %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
6308 ; 256-bit FP vectors are supported with AVX.
6310 define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
6311 ; SSE2-LABEL: mload_constmask_v8f32:
6313 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6314 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6315 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
6316 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6317 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
6318 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
6319 ; SSE2-NEXT: movaps %xmm2, %xmm0
6322 ; SSE42-LABEL: mload_constmask_v8f32:
6324 ; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6325 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
6326 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
6327 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
6330 ; AVX1OR2-LABEL: mload_constmask_v8f32:
6331 ; AVX1OR2: ## %bb.0:
6332 ; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
6333 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
6334 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
6335 ; AVX1OR2-NEXT: retq
6337 ; AVX512F-LABEL: mload_constmask_v8f32:
6338 ; AVX512F: ## %bb.0:
6339 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6340 ; AVX512F-NEXT: movw $7, %ax
6341 ; AVX512F-NEXT: kmovw %eax, %k1
6342 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
6343 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6344 ; AVX512F-NEXT: retq
6346 ; AVX512VLDQ-LABEL: mload_constmask_v8f32:
6347 ; AVX512VLDQ: ## %bb.0:
6348 ; AVX512VLDQ-NEXT: movb $7, %al
6349 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6350 ; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1}
6351 ; AVX512VLDQ-NEXT: retq
6353 ; AVX512VLBW-LABEL: mload_constmask_v8f32:
6354 ; AVX512VLBW: ## %bb.0:
6355 ; AVX512VLBW-NEXT: movb $7, %al
6356 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6357 ; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1}
6358 ; AVX512VLBW-NEXT: retq
6359 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
6360 ret <8 x float> %res
6363 define <8 x float> @mload_constmask_v8f32_zero(<8 x float>* %addr, <8 x float> %dst) {
6364 ; SSE2-LABEL: mload_constmask_v8f32_zero:
6366 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6367 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
6368 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6369 ; SSE2-NEXT: xorps %xmm1, %xmm1
6372 ; SSE42-LABEL: mload_constmask_v8f32_zero:
6374 ; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
6375 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero
6376 ; SSE42-NEXT: xorps %xmm1, %xmm1
6379 ; AVX1OR2-LABEL: mload_constmask_v8f32_zero:
6380 ; AVX1OR2: ## %bb.0:
6381 ; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0]
6382 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
6383 ; AVX1OR2-NEXT: retq
6385 ; AVX512F-LABEL: mload_constmask_v8f32_zero:
6386 ; AVX512F: ## %bb.0:
6387 ; AVX512F-NEXT: movw $7, %ax
6388 ; AVX512F-NEXT: kmovw %eax, %k1
6389 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
6390 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6391 ; AVX512F-NEXT: retq
6393 ; AVX512VLDQ-LABEL: mload_constmask_v8f32_zero:
6394 ; AVX512VLDQ: ## %bb.0:
6395 ; AVX512VLDQ-NEXT: movb $7, %al
6396 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6397 ; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
6398 ; AVX512VLDQ-NEXT: retq
6400 ; AVX512VLBW-LABEL: mload_constmask_v8f32_zero:
6401 ; AVX512VLBW: ## %bb.0:
6402 ; AVX512VLBW-NEXT: movb $7, %al
6403 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6404 ; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
6405 ; AVX512VLBW-NEXT: retq
6406 %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
6407 ret <8 x float> %res
6410 define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
6411 ; SSE-LABEL: mload_constmask_v4f64:
6413 ; SSE-NEXT: movups (%rdi), %xmm0
6414 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
6417 ; AVX1OR2-LABEL: mload_constmask_v4f64:
6418 ; AVX1OR2: ## %bb.0:
6419 ; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
6420 ; AVX1OR2-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
6421 ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
6422 ; AVX1OR2-NEXT: retq
6424 ; AVX512F-LABEL: mload_constmask_v4f64:
6425 ; AVX512F: ## %bb.0:
6426 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6427 ; AVX512F-NEXT: movb $7, %al
6428 ; AVX512F-NEXT: kmovw %eax, %k1
6429 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6430 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6431 ; AVX512F-NEXT: retq
6433 ; AVX512VLDQ-LABEL: mload_constmask_v4f64:
6434 ; AVX512VLDQ: ## %bb.0:
6435 ; AVX512VLDQ-NEXT: movb $7, %al
6436 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6437 ; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1}
6438 ; AVX512VLDQ-NEXT: retq
6440 ; AVX512VLBW-LABEL: mload_constmask_v4f64:
6441 ; AVX512VLBW: ## %bb.0:
6442 ; AVX512VLBW-NEXT: movb $7, %al
6443 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6444 ; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1}
6445 ; AVX512VLBW-NEXT: retq
6446 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
6447 ret <4 x double> %res
6450 ; 256-bit integer vectors are supported with AVX2.
6452 define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
6453 ; SSE2-LABEL: mload_constmask_v8i32:
6455 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6456 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6457 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
6458 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6459 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
6460 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
6461 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6462 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
6463 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
6464 ; SSE2-NEXT: movaps %xmm2, %xmm0
6467 ; SSE42-LABEL: mload_constmask_v8i32:
6469 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0
6470 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
6471 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
6472 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1
6475 ; AVX1OR2-LABEL: mload_constmask_v8i32:
6476 ; AVX1OR2: ## %bb.0:
6477 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
6478 ; AVX1OR2-NEXT: retq
6480 ; AVX512F-LABEL: mload_constmask_v8i32:
6481 ; AVX512F: ## %bb.0:
6482 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6483 ; AVX512F-NEXT: movw $135, %ax
6484 ; AVX512F-NEXT: kmovw %eax, %k1
6485 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
6486 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6487 ; AVX512F-NEXT: retq
6489 ; AVX512VLDQ-LABEL: mload_constmask_v8i32:
6490 ; AVX512VLDQ: ## %bb.0:
6491 ; AVX512VLDQ-NEXT: movb $-121, %al
6492 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6493 ; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
6494 ; AVX512VLDQ-NEXT: retq
6496 ; AVX512VLBW-LABEL: mload_constmask_v8i32:
6497 ; AVX512VLBW: ## %bb.0:
6498 ; AVX512VLBW-NEXT: movb $-121, %al
6499 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6500 ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
6501 ; AVX512VLBW-NEXT: retq
6502 %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
6506 define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
6507 ; SSE2-LABEL: mload_constmask_v4i64:
6509 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
6510 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
6511 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6514 ; SSE42-LABEL: mload_constmask_v4i64:
6516 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm0
6517 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm1
6520 ; AVX1OR2-LABEL: mload_constmask_v4i64:
6521 ; AVX1OR2: ## %bb.0:
6522 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
6523 ; AVX1OR2-NEXT: retq
6525 ; AVX512F-LABEL: mload_constmask_v4i64:
6526 ; AVX512F: ## %bb.0:
6527 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6528 ; AVX512F-NEXT: movb $9, %al
6529 ; AVX512F-NEXT: kmovw %eax, %k1
6530 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1}
6531 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6532 ; AVX512F-NEXT: retq
6534 ; AVX512VLDQ-LABEL: mload_constmask_v4i64:
6535 ; AVX512VLDQ: ## %bb.0:
6536 ; AVX512VLDQ-NEXT: movb $9, %al
6537 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6538 ; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
6539 ; AVX512VLDQ-NEXT: retq
6541 ; AVX512VLBW-LABEL: mload_constmask_v4i64:
6542 ; AVX512VLBW: ## %bb.0:
6543 ; AVX512VLBW-NEXT: movb $9, %al
6544 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6545 ; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
6546 ; AVX512VLBW-NEXT: retq
6547 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
6551 ; 512-bit FP vectors are supported with AVX512.
6553 define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
6554 ; SSE-LABEL: mload_constmask_v8f64:
6556 ; SSE-NEXT: movups (%rdi), %xmm0
6557 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
6558 ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
6561 ; AVX1OR2-LABEL: mload_constmask_v8f64:
6562 ; AVX1OR2: ## %bb.0:
6563 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
6564 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
6565 ; AVX1OR2-NEXT: retq
6567 ; AVX512F-LABEL: mload_constmask_v8f64:
6568 ; AVX512F: ## %bb.0:
6569 ; AVX512F-NEXT: movb $-121, %al
6570 ; AVX512F-NEXT: kmovw %eax, %k1
6571 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6572 ; AVX512F-NEXT: retq
6574 ; AVX512VLDQ-LABEL: mload_constmask_v8f64:
6575 ; AVX512VLDQ: ## %bb.0:
6576 ; AVX512VLDQ-NEXT: movb $-121, %al
6577 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6578 ; AVX512VLDQ-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6579 ; AVX512VLDQ-NEXT: retq
6581 ; AVX512VLBW-LABEL: mload_constmask_v8f64:
6582 ; AVX512VLBW: ## %bb.0:
6583 ; AVX512VLBW-NEXT: movb $-121, %al
6584 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6585 ; AVX512VLBW-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6586 ; AVX512VLBW-NEXT: retq
6587 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
6588 ret <8 x double> %res
6591 ; If the pass-through operand is undef, no blend is needed.
6593 define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
6594 ; SSE-LABEL: mload_constmask_v4f64_undef_passthrough:
6596 ; SSE-NEXT: movups (%rdi), %xmm0
6597 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
6600 ; AVX1OR2-LABEL: mload_constmask_v4f64_undef_passthrough:
6601 ; AVX1OR2: ## %bb.0:
6602 ; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
6603 ; AVX1OR2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
6604 ; AVX1OR2-NEXT: retq
6606 ; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
6607 ; AVX512F: ## %bb.0:
6608 ; AVX512F-NEXT: movb $7, %al
6609 ; AVX512F-NEXT: kmovw %eax, %k1
6610 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
6611 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6612 ; AVX512F-NEXT: retq
6614 ; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough:
6615 ; AVX512VLDQ: ## %bb.0:
6616 ; AVX512VLDQ-NEXT: movb $7, %al
6617 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6618 ; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
6619 ; AVX512VLDQ-NEXT: retq
6621 ; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
6622 ; AVX512VLBW: ## %bb.0:
6623 ; AVX512VLBW-NEXT: movb $7, %al
6624 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6625 ; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
6626 ; AVX512VLBW-NEXT: retq
6627 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
6628 ret <4 x double> %res
6631 define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
6632 ; SSE-LABEL: mload_constmask_v4i64_undef_passthrough:
6634 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
6635 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
6636 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6639 ; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
6641 ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
6642 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
6645 ; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
6647 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
6648 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
6651 ; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
6652 ; AVX512F: ## %bb.0:
6653 ; AVX512F-NEXT: movb $6, %al
6654 ; AVX512F-NEXT: kmovw %eax, %k1
6655 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
6656 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6657 ; AVX512F-NEXT: retq
6659 ; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough:
6660 ; AVX512VLDQ: ## %bb.0:
6661 ; AVX512VLDQ-NEXT: movb $6, %al
6662 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6663 ; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
6664 ; AVX512VLDQ-NEXT: retq
6666 ; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
6667 ; AVX512VLBW: ## %bb.0:
6668 ; AVX512VLBW-NEXT: movb $6, %al
6669 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6670 ; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
6671 ; AVX512VLBW-NEXT: retq
6672 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
6676 ; When only one element of the mask is set, reduce to a scalar load.
6678 define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
6679 ; SSE2-LABEL: load_one_mask_bit_set1:
6681 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6682 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6685 ; SSE42-LABEL: load_one_mask_bit_set1:
6687 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0
6690 ; AVX-LABEL: load_one_mask_bit_set1:
6692 ; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
6694 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
6698 ; Choose a different element to show that the correct address offset is produced.
6700 define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
6701 ; SSE2-LABEL: load_one_mask_bit_set2:
6703 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6704 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
6705 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
6708 ; SSE42-LABEL: load_one_mask_bit_set2:
6710 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
6713 ; AVX-LABEL: load_one_mask_bit_set2:
6715 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
6717 %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
6718 ret <4 x float> %res
6721 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
6723 define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
6724 ; SSE2-LABEL: load_one_mask_bit_set3:
6726 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
6729 ; SSE42-LABEL: load_one_mask_bit_set3:
6731 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm1
6734 ; AVX1-LABEL: load_one_mask_bit_set3:
6736 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
6737 ; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
6738 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
6741 ; AVX2-LABEL: load_one_mask_bit_set3:
6743 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
6744 ; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
6745 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
6748 ; AVX512-LABEL: load_one_mask_bit_set3:
6750 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6751 ; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
6752 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
6754 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
6758 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
6760 define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
6761 ; SSE-LABEL: load_one_mask_bit_set4:
6763 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
6766 ; AVX-LABEL: load_one_mask_bit_set4:
6768 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
6769 ; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
6770 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
6772 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
6773 ret <4 x double> %res
6776 ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
6778 define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
6779 ; SSE-LABEL: load_one_mask_bit_set5:
6781 ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
6784 ; AVX1OR2-LABEL: load_one_mask_bit_set5:
6785 ; AVX1OR2: ## %bb.0:
6786 ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
6787 ; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
6788 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
6789 ; AVX1OR2-NEXT: retq
6791 ; AVX512-LABEL: load_one_mask_bit_set5:
6793 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
6794 ; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
6795 ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
6797 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
6798 ret <8 x double> %res
6801 define i32 @pr38986(i1 %c, i32* %p) {
6802 ; SSE-LABEL: pr38986:
6804 ; SSE-NEXT: testb $1, %dil
6805 ; SSE-NEXT: ## implicit-def: $eax
6806 ; SSE-NEXT: je LBB43_2
6807 ; SSE-NEXT: ## %bb.1: ## %cond.load
6808 ; SSE-NEXT: movl (%rsi), %eax
6809 ; SSE-NEXT: LBB43_2: ## %else
6812 ; AVX-LABEL: pr38986:
6814 ; AVX-NEXT: testb $1, %dil
6815 ; AVX-NEXT: ## implicit-def: $eax
6816 ; AVX-NEXT: je LBB43_2
6817 ; AVX-NEXT: ## %bb.1: ## %cond.load
6818 ; AVX-NEXT: movl (%rsi), %eax
6819 ; AVX-NEXT: LBB43_2: ## %else
6821 %vc = insertelement <1 x i1> undef, i1 %c, i32 0
6822 %vp = bitcast i32* %p to <1 x i32>*
6823 %L = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>* %vp, i32 4, <1 x i1> %vc, <1 x i32> undef)
6824 %ret = bitcast <1 x i32> %L to i32
6828 define <2 x double> @zero_mask(<2 x double>* %addr, <2 x double> %dst) {
6829 ; SSE-LABEL: zero_mask:
6833 ; AVX-LABEL: zero_mask:
6836 %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> zeroinitializer, <2 x double> %dst)
6837 ret <2 x double> %res
6840 declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
6841 declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
6842 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
6843 declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
6845 declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
6846 declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
6847 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
6848 declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
6850 declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)
6851 declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
6852 declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
6853 declare <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>*, i32, <1 x i1>, <1 x i64>)
6855 declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
6856 declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
6857 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
6858 declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
6859 declare <1 x i32> @llvm.masked.load.v1i32.p0v1i32(<1 x i32>*, i32, <1 x i1>, <1 x i32>)
6861 declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
6862 declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
6863 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
6864 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
6866 declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
6867 declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
6868 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
6869 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)