1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
4 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
5 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
6 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
7 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
8 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
9 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=X86-AVX512
15 define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %dst) {
16 ; SSE-LABEL: load_v1f64_v1i64:
18 ; SSE-NEXT: testq %rdi, %rdi
19 ; SSE-NEXT: jne LBB0_2
20 ; SSE-NEXT: ## %bb.1: ## %cond.load
21 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
22 ; SSE-NEXT: LBB0_2: ## %else
25 ; AVX-LABEL: load_v1f64_v1i64:
27 ; AVX-NEXT: testq %rdi, %rdi
28 ; AVX-NEXT: jne LBB0_2
29 ; AVX-NEXT: ## %bb.1: ## %cond.load
30 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
31 ; AVX-NEXT: LBB0_2: ## %else
34 ; X86-AVX512-LABEL: load_v1f64_v1i64:
35 ; X86-AVX512: ## %bb.0:
36 ; X86-AVX512-NEXT: subl $12, %esp
37 ; X86-AVX512-NEXT: .cfi_def_cfa_offset 16
38 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
39 ; X86-AVX512-NEXT: orl {{[0-9]+}}(%esp), %eax
40 ; X86-AVX512-NEXT: jne LBB0_1
41 ; X86-AVX512-NEXT: ## %bb.2: ## %cond.load
42 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
43 ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
44 ; X86-AVX512-NEXT: jmp LBB0_3
45 ; X86-AVX512-NEXT: LBB0_1:
46 ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
47 ; X86-AVX512-NEXT: LBB0_3: ## %else
48 ; X86-AVX512-NEXT: vmovsd %xmm0, (%esp)
49 ; X86-AVX512-NEXT: fldl (%esp)
50 ; X86-AVX512-NEXT: addl $12, %esp
51 ; X86-AVX512-NEXT: retl
52 %mask = icmp eq <1 x i64> %trigger, zeroinitializer
53 %res = call <1 x double> @llvm.masked.load.v1f64.p0(ptr %addr, i32 4, <1 x i1> %mask, <1 x double> %dst)
57 define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
58 ; SSE2-LABEL: load_v2f64_v2i64:
60 ; SSE2-NEXT: pxor %xmm2, %xmm2
61 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
62 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
63 ; SSE2-NEXT: pand %xmm2, %xmm0
64 ; SSE2-NEXT: movmskpd %xmm0, %eax
65 ; SSE2-NEXT: testb $1, %al
66 ; SSE2-NEXT: jne LBB1_1
67 ; SSE2-NEXT: ## %bb.2: ## %else
68 ; SSE2-NEXT: testb $2, %al
69 ; SSE2-NEXT: jne LBB1_3
70 ; SSE2-NEXT: LBB1_4: ## %else2
71 ; SSE2-NEXT: movaps %xmm1, %xmm0
73 ; SSE2-NEXT: LBB1_1: ## %cond.load
74 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
75 ; SSE2-NEXT: testb $2, %al
76 ; SSE2-NEXT: je LBB1_4
77 ; SSE2-NEXT: LBB1_3: ## %cond.load1
78 ; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
79 ; SSE2-NEXT: movaps %xmm1, %xmm0
82 ; SSE42-LABEL: load_v2f64_v2i64:
84 ; SSE42-NEXT: pxor %xmm2, %xmm2
85 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
86 ; SSE42-NEXT: movmskpd %xmm2, %eax
87 ; SSE42-NEXT: testb $1, %al
88 ; SSE42-NEXT: jne LBB1_1
89 ; SSE42-NEXT: ## %bb.2: ## %else
90 ; SSE42-NEXT: testb $2, %al
91 ; SSE42-NEXT: jne LBB1_3
92 ; SSE42-NEXT: LBB1_4: ## %else2
93 ; SSE42-NEXT: movaps %xmm1, %xmm0
95 ; SSE42-NEXT: LBB1_1: ## %cond.load
96 ; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
97 ; SSE42-NEXT: testb $2, %al
98 ; SSE42-NEXT: je LBB1_4
99 ; SSE42-NEXT: LBB1_3: ## %cond.load1
100 ; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
101 ; SSE42-NEXT: movaps %xmm1, %xmm0
104 ; AVX1OR2-LABEL: load_v2f64_v2i64:
106 ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
107 ; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
108 ; AVX1OR2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
109 ; AVX1OR2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
112 ; AVX512F-LABEL: load_v2f64_v2i64:
114 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
115 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
116 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
117 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
118 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
119 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
120 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
121 ; AVX512F-NEXT: vzeroupper
124 ; AVX512VL-LABEL: load_v2f64_v2i64:
125 ; AVX512VL: ## %bb.0:
126 ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
127 ; AVX512VL-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
128 ; AVX512VL-NEXT: retq
130 ; X86-AVX512-LABEL: load_v2f64_v2i64:
131 ; X86-AVX512: ## %bb.0:
132 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
133 ; X86-AVX512-NEXT: vptestnmq %xmm0, %xmm0, %k1
134 ; X86-AVX512-NEXT: vblendmpd (%eax), %xmm1, %xmm0 {%k1}
135 ; X86-AVX512-NEXT: retl
136 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
137 %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
138 ret <2 x double> %res
141 define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double> %dst) {
142 ; SSE-LABEL: load_v4f64_v4i32:
144 ; SSE-NEXT: pxor %xmm3, %xmm3
145 ; SSE-NEXT: pcmpeqd %xmm0, %xmm3
146 ; SSE-NEXT: movmskps %xmm3, %eax
147 ; SSE-NEXT: testb $1, %al
148 ; SSE-NEXT: jne LBB2_1
149 ; SSE-NEXT: ## %bb.2: ## %else
150 ; SSE-NEXT: testb $2, %al
151 ; SSE-NEXT: jne LBB2_3
152 ; SSE-NEXT: LBB2_4: ## %else2
153 ; SSE-NEXT: testb $4, %al
154 ; SSE-NEXT: jne LBB2_5
155 ; SSE-NEXT: LBB2_6: ## %else5
156 ; SSE-NEXT: testb $8, %al
157 ; SSE-NEXT: je LBB2_8
158 ; SSE-NEXT: LBB2_7: ## %cond.load7
159 ; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
160 ; SSE-NEXT: LBB2_8: ## %else8
161 ; SSE-NEXT: movaps %xmm1, %xmm0
162 ; SSE-NEXT: movaps %xmm2, %xmm1
164 ; SSE-NEXT: LBB2_1: ## %cond.load
165 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
166 ; SSE-NEXT: testb $2, %al
167 ; SSE-NEXT: je LBB2_4
168 ; SSE-NEXT: LBB2_3: ## %cond.load1
169 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
170 ; SSE-NEXT: testb $4, %al
171 ; SSE-NEXT: je LBB2_6
172 ; SSE-NEXT: LBB2_5: ## %cond.load4
173 ; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
174 ; SSE-NEXT: testb $8, %al
175 ; SSE-NEXT: jne LBB2_7
176 ; SSE-NEXT: jmp LBB2_8
178 ; AVX1-LABEL: load_v4f64_v4i32:
180 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
181 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
182 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
183 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
184 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
185 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
186 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
189 ; AVX2-LABEL: load_v4f64_v4i32:
191 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
192 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
193 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
194 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
195 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
198 ; AVX512F-LABEL: load_v4f64_v4i32:
200 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
201 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
202 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
203 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
204 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
205 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
206 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
209 ; AVX512VL-LABEL: load_v4f64_v4i32:
210 ; AVX512VL: ## %bb.0:
211 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
212 ; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
213 ; AVX512VL-NEXT: retq
215 ; X86-AVX512-LABEL: load_v4f64_v4i32:
216 ; X86-AVX512: ## %bb.0:
217 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
218 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
219 ; X86-AVX512-NEXT: vblendmpd (%eax), %ymm1, %ymm0 {%k1}
220 ; X86-AVX512-NEXT: retl
221 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
222 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
223 ret <4 x double> %res
226 define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
227 ; SSE-LABEL: load_v4f64_v4i32_zero:
229 ; SSE-NEXT: movdqa %xmm0, %xmm1
230 ; SSE-NEXT: pxor %xmm0, %xmm0
231 ; SSE-NEXT: pcmpeqd %xmm0, %xmm1
232 ; SSE-NEXT: movmskps %xmm1, %eax
233 ; SSE-NEXT: testb $1, %al
234 ; SSE-NEXT: xorps %xmm1, %xmm1
235 ; SSE-NEXT: jne LBB3_1
236 ; SSE-NEXT: ## %bb.2: ## %else
237 ; SSE-NEXT: testb $2, %al
238 ; SSE-NEXT: jne LBB3_3
239 ; SSE-NEXT: LBB3_4: ## %else2
240 ; SSE-NEXT: testb $4, %al
241 ; SSE-NEXT: jne LBB3_5
242 ; SSE-NEXT: LBB3_6: ## %else5
243 ; SSE-NEXT: testb $8, %al
244 ; SSE-NEXT: jne LBB3_7
245 ; SSE-NEXT: LBB3_8: ## %else8
247 ; SSE-NEXT: LBB3_1: ## %cond.load
248 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
249 ; SSE-NEXT: testb $2, %al
250 ; SSE-NEXT: je LBB3_4
251 ; SSE-NEXT: LBB3_3: ## %cond.load1
252 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
253 ; SSE-NEXT: testb $4, %al
254 ; SSE-NEXT: je LBB3_6
255 ; SSE-NEXT: LBB3_5: ## %cond.load4
256 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
257 ; SSE-NEXT: testb $8, %al
258 ; SSE-NEXT: je LBB3_8
259 ; SSE-NEXT: LBB3_7: ## %cond.load7
260 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
263 ; AVX1-LABEL: load_v4f64_v4i32_zero:
265 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
266 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
267 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
268 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
269 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
270 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
273 ; AVX2-LABEL: load_v4f64_v4i32_zero:
275 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
276 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
277 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
278 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
281 ; AVX512F-LABEL: load_v4f64_v4i32_zero:
283 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
284 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
285 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
286 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
287 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
288 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
291 ; AVX512VL-LABEL: load_v4f64_v4i32_zero:
292 ; AVX512VL: ## %bb.0:
293 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
294 ; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
295 ; AVX512VL-NEXT: retq
297 ; X86-AVX512-LABEL: load_v4f64_v4i32_zero:
298 ; X86-AVX512: ## %bb.0:
299 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
300 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
301 ; X86-AVX512-NEXT: vmovapd (%eax), %ymm0 {%k1} {z}
302 ; X86-AVX512-NEXT: retl
303 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
304 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer)
305 ret <4 x double> %res
308 define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %dst) {
309 ; SSE2-LABEL: load_v4f64_v4i64:
311 ; SSE2-NEXT: pxor %xmm4, %xmm4
312 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
313 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
314 ; SSE2-NEXT: movdqa %xmm0, %xmm4
315 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3]
316 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
317 ; SSE2-NEXT: andps %xmm4, %xmm0
318 ; SSE2-NEXT: movmskps %xmm0, %eax
319 ; SSE2-NEXT: testb $1, %al
320 ; SSE2-NEXT: jne LBB4_1
321 ; SSE2-NEXT: ## %bb.2: ## %else
322 ; SSE2-NEXT: testb $2, %al
323 ; SSE2-NEXT: jne LBB4_3
324 ; SSE2-NEXT: LBB4_4: ## %else2
325 ; SSE2-NEXT: testb $4, %al
326 ; SSE2-NEXT: jne LBB4_5
327 ; SSE2-NEXT: LBB4_6: ## %else5
328 ; SSE2-NEXT: testb $8, %al
329 ; SSE2-NEXT: je LBB4_8
330 ; SSE2-NEXT: LBB4_7: ## %cond.load7
331 ; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
332 ; SSE2-NEXT: LBB4_8: ## %else8
333 ; SSE2-NEXT: movaps %xmm2, %xmm0
334 ; SSE2-NEXT: movaps %xmm3, %xmm1
336 ; SSE2-NEXT: LBB4_1: ## %cond.load
337 ; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
338 ; SSE2-NEXT: testb $2, %al
339 ; SSE2-NEXT: je LBB4_4
340 ; SSE2-NEXT: LBB4_3: ## %cond.load1
341 ; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
342 ; SSE2-NEXT: testb $4, %al
343 ; SSE2-NEXT: je LBB4_6
344 ; SSE2-NEXT: LBB4_5: ## %cond.load4
345 ; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
346 ; SSE2-NEXT: testb $8, %al
347 ; SSE2-NEXT: jne LBB4_7
348 ; SSE2-NEXT: jmp LBB4_8
350 ; SSE42-LABEL: load_v4f64_v4i64:
352 ; SSE42-NEXT: pxor %xmm4, %xmm4
353 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
354 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
355 ; SSE42-NEXT: packssdw %xmm1, %xmm0
356 ; SSE42-NEXT: movmskps %xmm0, %eax
357 ; SSE42-NEXT: testb $1, %al
358 ; SSE42-NEXT: jne LBB4_1
359 ; SSE42-NEXT: ## %bb.2: ## %else
360 ; SSE42-NEXT: testb $2, %al
361 ; SSE42-NEXT: jne LBB4_3
362 ; SSE42-NEXT: LBB4_4: ## %else2
363 ; SSE42-NEXT: testb $4, %al
364 ; SSE42-NEXT: jne LBB4_5
365 ; SSE42-NEXT: LBB4_6: ## %else5
366 ; SSE42-NEXT: testb $8, %al
367 ; SSE42-NEXT: je LBB4_8
368 ; SSE42-NEXT: LBB4_7: ## %cond.load7
369 ; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
370 ; SSE42-NEXT: LBB4_8: ## %else8
371 ; SSE42-NEXT: movaps %xmm2, %xmm0
372 ; SSE42-NEXT: movaps %xmm3, %xmm1
374 ; SSE42-NEXT: LBB4_1: ## %cond.load
375 ; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
376 ; SSE42-NEXT: testb $2, %al
377 ; SSE42-NEXT: je LBB4_4
378 ; SSE42-NEXT: LBB4_3: ## %cond.load1
379 ; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
380 ; SSE42-NEXT: testb $4, %al
381 ; SSE42-NEXT: je LBB4_6
382 ; SSE42-NEXT: LBB4_5: ## %cond.load4
383 ; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
384 ; SSE42-NEXT: testb $8, %al
385 ; SSE42-NEXT: jne LBB4_7
386 ; SSE42-NEXT: jmp LBB4_8
388 ; AVX1-LABEL: load_v4f64_v4i64:
390 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
391 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
392 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
393 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
394 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
395 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
396 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
399 ; AVX2-LABEL: load_v4f64_v4i64:
401 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
402 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
403 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
404 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
407 ; AVX512F-LABEL: load_v4f64_v4i64:
409 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
410 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
411 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
412 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
413 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
414 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
415 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
418 ; AVX512VL-LABEL: load_v4f64_v4i64:
419 ; AVX512VL: ## %bb.0:
420 ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
421 ; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
422 ; AVX512VL-NEXT: retq
424 ; X86-AVX512-LABEL: load_v4f64_v4i64:
425 ; X86-AVX512: ## %bb.0:
426 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
427 ; X86-AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k1
428 ; X86-AVX512-NEXT: vblendmpd (%eax), %ymm1, %ymm0 {%k1}
429 ; X86-AVX512-NEXT: retl
430 %mask = icmp eq <4 x i64> %trigger, zeroinitializer
431 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
432 ret <4 x double> %res
435 define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x double> %dst) {
436 ; SSE-LABEL: load_v8f64_v8i16:
438 ; SSE-NEXT: pxor %xmm5, %xmm5
439 ; SSE-NEXT: pcmpeqw %xmm0, %xmm5
440 ; SSE-NEXT: packsswb %xmm5, %xmm5
441 ; SSE-NEXT: pmovmskb %xmm5, %eax
442 ; SSE-NEXT: testb $1, %al
443 ; SSE-NEXT: jne LBB5_1
444 ; SSE-NEXT: ## %bb.2: ## %else
445 ; SSE-NEXT: testb $2, %al
446 ; SSE-NEXT: jne LBB5_3
447 ; SSE-NEXT: LBB5_4: ## %else2
448 ; SSE-NEXT: testb $4, %al
449 ; SSE-NEXT: jne LBB5_5
450 ; SSE-NEXT: LBB5_6: ## %else5
451 ; SSE-NEXT: testb $8, %al
452 ; SSE-NEXT: jne LBB5_7
453 ; SSE-NEXT: LBB5_8: ## %else8
454 ; SSE-NEXT: testb $16, %al
455 ; SSE-NEXT: jne LBB5_9
456 ; SSE-NEXT: LBB5_10: ## %else11
457 ; SSE-NEXT: testb $32, %al
458 ; SSE-NEXT: jne LBB5_11
459 ; SSE-NEXT: LBB5_12: ## %else14
460 ; SSE-NEXT: testb $64, %al
461 ; SSE-NEXT: jne LBB5_13
462 ; SSE-NEXT: LBB5_14: ## %else17
463 ; SSE-NEXT: testb $-128, %al
464 ; SSE-NEXT: je LBB5_16
465 ; SSE-NEXT: LBB5_15: ## %cond.load19
466 ; SSE-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
467 ; SSE-NEXT: LBB5_16: ## %else20
468 ; SSE-NEXT: movaps %xmm1, %xmm0
469 ; SSE-NEXT: movaps %xmm2, %xmm1
470 ; SSE-NEXT: movaps %xmm3, %xmm2
471 ; SSE-NEXT: movaps %xmm4, %xmm3
473 ; SSE-NEXT: LBB5_1: ## %cond.load
474 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
475 ; SSE-NEXT: testb $2, %al
476 ; SSE-NEXT: je LBB5_4
477 ; SSE-NEXT: LBB5_3: ## %cond.load1
478 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
479 ; SSE-NEXT: testb $4, %al
480 ; SSE-NEXT: je LBB5_6
481 ; SSE-NEXT: LBB5_5: ## %cond.load4
482 ; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
483 ; SSE-NEXT: testb $8, %al
484 ; SSE-NEXT: je LBB5_8
485 ; SSE-NEXT: LBB5_7: ## %cond.load7
486 ; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
487 ; SSE-NEXT: testb $16, %al
488 ; SSE-NEXT: je LBB5_10
489 ; SSE-NEXT: LBB5_9: ## %cond.load10
490 ; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
491 ; SSE-NEXT: testb $32, %al
492 ; SSE-NEXT: je LBB5_12
493 ; SSE-NEXT: LBB5_11: ## %cond.load13
494 ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
495 ; SSE-NEXT: testb $64, %al
496 ; SSE-NEXT: je LBB5_14
497 ; SSE-NEXT: LBB5_13: ## %cond.load16
498 ; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
499 ; SSE-NEXT: testb $-128, %al
500 ; SSE-NEXT: jne LBB5_15
501 ; SSE-NEXT: jmp LBB5_16
503 ; AVX1-LABEL: load_v8f64_v8i16:
505 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
506 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
507 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
508 ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm5
509 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
510 ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3
511 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
512 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
513 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
514 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
515 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
516 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
517 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
518 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
519 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
520 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
523 ; AVX2-LABEL: load_v8f64_v8i16:
525 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
526 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
527 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
528 ; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3
529 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
530 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
531 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
532 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
533 ; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
534 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
537 ; AVX512F-LABEL: load_v8f64_v8i16:
539 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
540 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
541 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
542 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
543 ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
546 ; AVX512VLDQ-LABEL: load_v8f64_v8i16:
547 ; AVX512VLDQ: ## %bb.0:
548 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
549 ; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
550 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
551 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
552 ; AVX512VLDQ-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
553 ; AVX512VLDQ-NEXT: retq
555 ; AVX512VLBW-LABEL: load_v8f64_v8i16:
556 ; AVX512VLBW: ## %bb.0:
557 ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
558 ; AVX512VLBW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
559 ; AVX512VLBW-NEXT: retq
561 ; X86-AVX512-LABEL: load_v8f64_v8i16:
562 ; X86-AVX512: ## %bb.0:
563 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
564 ; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1
565 ; X86-AVX512-NEXT: vblendmpd (%eax), %zmm1, %zmm0 {%k1}
566 ; X86-AVX512-NEXT: retl
567 %mask = icmp eq <8 x i16> %trigger, zeroinitializer
568 %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
569 ret <8 x double> %res
572 define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double> %dst) {
573 ; SSE2-LABEL: load_v8f64_v8i64:
575 ; SSE2-NEXT: pxor %xmm8, %xmm8
576 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
577 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2]
578 ; SSE2-NEXT: pand %xmm3, %xmm9
579 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
580 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
581 ; SSE2-NEXT: pand %xmm2, %xmm3
582 ; SSE2-NEXT: packssdw %xmm9, %xmm3
583 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
584 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
585 ; SSE2-NEXT: pand %xmm1, %xmm2
586 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
587 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
588 ; SSE2-NEXT: pand %xmm0, %xmm1
589 ; SSE2-NEXT: packssdw %xmm2, %xmm1
590 ; SSE2-NEXT: packssdw %xmm3, %xmm1
591 ; SSE2-NEXT: packsswb %xmm1, %xmm1
592 ; SSE2-NEXT: pmovmskb %xmm1, %eax
593 ; SSE2-NEXT: testb $1, %al
594 ; SSE2-NEXT: jne LBB6_1
595 ; SSE2-NEXT: ## %bb.2: ## %else
596 ; SSE2-NEXT: testb $2, %al
597 ; SSE2-NEXT: jne LBB6_3
598 ; SSE2-NEXT: LBB6_4: ## %else2
599 ; SSE2-NEXT: testb $4, %al
600 ; SSE2-NEXT: jne LBB6_5
601 ; SSE2-NEXT: LBB6_6: ## %else5
602 ; SSE2-NEXT: testb $8, %al
603 ; SSE2-NEXT: jne LBB6_7
604 ; SSE2-NEXT: LBB6_8: ## %else8
605 ; SSE2-NEXT: testb $16, %al
606 ; SSE2-NEXT: jne LBB6_9
607 ; SSE2-NEXT: LBB6_10: ## %else11
608 ; SSE2-NEXT: testb $32, %al
609 ; SSE2-NEXT: jne LBB6_11
610 ; SSE2-NEXT: LBB6_12: ## %else14
611 ; SSE2-NEXT: testb $64, %al
612 ; SSE2-NEXT: jne LBB6_13
613 ; SSE2-NEXT: LBB6_14: ## %else17
614 ; SSE2-NEXT: testb $-128, %al
615 ; SSE2-NEXT: je LBB6_16
616 ; SSE2-NEXT: LBB6_15: ## %cond.load19
617 ; SSE2-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
618 ; SSE2-NEXT: LBB6_16: ## %else20
619 ; SSE2-NEXT: movaps %xmm4, %xmm0
620 ; SSE2-NEXT: movaps %xmm5, %xmm1
621 ; SSE2-NEXT: movaps %xmm6, %xmm2
622 ; SSE2-NEXT: movaps %xmm7, %xmm3
624 ; SSE2-NEXT: LBB6_1: ## %cond.load
625 ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
626 ; SSE2-NEXT: testb $2, %al
627 ; SSE2-NEXT: je LBB6_4
628 ; SSE2-NEXT: LBB6_3: ## %cond.load1
629 ; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
630 ; SSE2-NEXT: testb $4, %al
631 ; SSE2-NEXT: je LBB6_6
632 ; SSE2-NEXT: LBB6_5: ## %cond.load4
633 ; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
634 ; SSE2-NEXT: testb $8, %al
635 ; SSE2-NEXT: je LBB6_8
636 ; SSE2-NEXT: LBB6_7: ## %cond.load7
637 ; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
638 ; SSE2-NEXT: testb $16, %al
639 ; SSE2-NEXT: je LBB6_10
640 ; SSE2-NEXT: LBB6_9: ## %cond.load10
641 ; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
642 ; SSE2-NEXT: testb $32, %al
643 ; SSE2-NEXT: je LBB6_12
644 ; SSE2-NEXT: LBB6_11: ## %cond.load13
645 ; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
646 ; SSE2-NEXT: testb $64, %al
647 ; SSE2-NEXT: je LBB6_14
648 ; SSE2-NEXT: LBB6_13: ## %cond.load16
649 ; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
650 ; SSE2-NEXT: testb $-128, %al
651 ; SSE2-NEXT: jne LBB6_15
652 ; SSE2-NEXT: jmp LBB6_16
654 ; SSE42-LABEL: load_v8f64_v8i64:
656 ; SSE42-NEXT: pxor %xmm8, %xmm8
657 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm3
658 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm2
659 ; SSE42-NEXT: packssdw %xmm3, %xmm2
660 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm1
661 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm0
662 ; SSE42-NEXT: packssdw %xmm1, %xmm0
663 ; SSE42-NEXT: packssdw %xmm2, %xmm0
664 ; SSE42-NEXT: packsswb %xmm0, %xmm0
665 ; SSE42-NEXT: pmovmskb %xmm0, %eax
666 ; SSE42-NEXT: testb $1, %al
667 ; SSE42-NEXT: jne LBB6_1
668 ; SSE42-NEXT: ## %bb.2: ## %else
669 ; SSE42-NEXT: testb $2, %al
670 ; SSE42-NEXT: jne LBB6_3
671 ; SSE42-NEXT: LBB6_4: ## %else2
672 ; SSE42-NEXT: testb $4, %al
673 ; SSE42-NEXT: jne LBB6_5
674 ; SSE42-NEXT: LBB6_6: ## %else5
675 ; SSE42-NEXT: testb $8, %al
676 ; SSE42-NEXT: jne LBB6_7
677 ; SSE42-NEXT: LBB6_8: ## %else8
678 ; SSE42-NEXT: testb $16, %al
679 ; SSE42-NEXT: jne LBB6_9
680 ; SSE42-NEXT: LBB6_10: ## %else11
681 ; SSE42-NEXT: testb $32, %al
682 ; SSE42-NEXT: jne LBB6_11
683 ; SSE42-NEXT: LBB6_12: ## %else14
684 ; SSE42-NEXT: testb $64, %al
685 ; SSE42-NEXT: jne LBB6_13
686 ; SSE42-NEXT: LBB6_14: ## %else17
687 ; SSE42-NEXT: testb $-128, %al
688 ; SSE42-NEXT: je LBB6_16
689 ; SSE42-NEXT: LBB6_15: ## %cond.load19
690 ; SSE42-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
691 ; SSE42-NEXT: LBB6_16: ## %else20
692 ; SSE42-NEXT: movaps %xmm4, %xmm0
693 ; SSE42-NEXT: movaps %xmm5, %xmm1
694 ; SSE42-NEXT: movaps %xmm6, %xmm2
695 ; SSE42-NEXT: movaps %xmm7, %xmm3
697 ; SSE42-NEXT: LBB6_1: ## %cond.load
698 ; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
699 ; SSE42-NEXT: testb $2, %al
700 ; SSE42-NEXT: je LBB6_4
701 ; SSE42-NEXT: LBB6_3: ## %cond.load1
702 ; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
703 ; SSE42-NEXT: testb $4, %al
704 ; SSE42-NEXT: je LBB6_6
705 ; SSE42-NEXT: LBB6_5: ## %cond.load4
706 ; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
707 ; SSE42-NEXT: testb $8, %al
708 ; SSE42-NEXT: je LBB6_8
709 ; SSE42-NEXT: LBB6_7: ## %cond.load7
710 ; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
711 ; SSE42-NEXT: testb $16, %al
712 ; SSE42-NEXT: je LBB6_10
713 ; SSE42-NEXT: LBB6_9: ## %cond.load10
714 ; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
715 ; SSE42-NEXT: testb $32, %al
716 ; SSE42-NEXT: je LBB6_12
717 ; SSE42-NEXT: LBB6_11: ## %cond.load13
718 ; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
719 ; SSE42-NEXT: testb $64, %al
720 ; SSE42-NEXT: je LBB6_14
721 ; SSE42-NEXT: LBB6_13: ## %cond.load16
722 ; SSE42-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
723 ; SSE42-NEXT: testb $-128, %al
724 ; SSE42-NEXT: jne LBB6_15
725 ; SSE42-NEXT: jmp LBB6_16
727 ; AVX1-LABEL: load_v8f64_v8i64:
729 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
730 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
731 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
732 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
733 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
734 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
735 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
736 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
737 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
738 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
739 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
740 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
741 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
744 ; AVX2-LABEL: load_v8f64_v8i64:
746 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
747 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
748 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
749 ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
750 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
751 ; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
752 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
755 ; AVX512-LABEL: load_v8f64_v8i64:
757 ; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
758 ; AVX512-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
761 ; X86-AVX512-LABEL: load_v8f64_v8i64:
762 ; X86-AVX512: ## %bb.0:
763 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
764 ; X86-AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
765 ; X86-AVX512-NEXT: vblendmpd (%eax), %zmm1, %zmm0 {%k1}
766 ; X86-AVX512-NEXT: retl
767 %mask = icmp eq <8 x i64> %trigger, zeroinitializer
768 %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
769 ret <8 x double> %res
776 define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
777 ; SSE2-LABEL: load_v2f32_v2i32:
779 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
780 ; SSE2-NEXT: pxor %xmm2, %xmm2
781 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
782 ; SSE2-NEXT: movmskpd %xmm2, %eax
783 ; SSE2-NEXT: testb $1, %al
784 ; SSE2-NEXT: jne LBB7_1
785 ; SSE2-NEXT: ## %bb.2: ## %else
786 ; SSE2-NEXT: testb $2, %al
787 ; SSE2-NEXT: jne LBB7_3
788 ; SSE2-NEXT: LBB7_4: ## %else2
789 ; SSE2-NEXT: movaps %xmm1, %xmm0
791 ; SSE2-NEXT: LBB7_1: ## %cond.load
792 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
793 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
794 ; SSE2-NEXT: testb $2, %al
795 ; SSE2-NEXT: je LBB7_4
796 ; SSE2-NEXT: LBB7_3: ## %cond.load1
797 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
798 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
799 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
800 ; SSE2-NEXT: movaps %xmm0, %xmm1
801 ; SSE2-NEXT: movaps %xmm1, %xmm0
804 ; SSE42-LABEL: load_v2f32_v2i32:
806 ; SSE42-NEXT: pxor %xmm2, %xmm2
807 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
808 ; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
809 ; SSE42-NEXT: movmskpd %xmm0, %eax
810 ; SSE42-NEXT: testb $1, %al
811 ; SSE42-NEXT: jne LBB7_1
812 ; SSE42-NEXT: ## %bb.2: ## %else
813 ; SSE42-NEXT: testb $2, %al
814 ; SSE42-NEXT: jne LBB7_3
815 ; SSE42-NEXT: LBB7_4: ## %else2
816 ; SSE42-NEXT: movaps %xmm1, %xmm0
818 ; SSE42-NEXT: LBB7_1: ## %cond.load
819 ; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
820 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
821 ; SSE42-NEXT: testb $2, %al
822 ; SSE42-NEXT: je LBB7_4
823 ; SSE42-NEXT: LBB7_3: ## %cond.load1
824 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
825 ; SSE42-NEXT: movaps %xmm1, %xmm0
828 ; AVX1OR2-LABEL: load_v2f32_v2i32:
830 ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
831 ; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
832 ; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
833 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
834 ; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
837 ; AVX512F-LABEL: load_v2f32_v2i32:
839 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
840 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
841 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
842 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
843 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
844 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
845 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
846 ; AVX512F-NEXT: vzeroupper
849 ; AVX512VLDQ-LABEL: load_v2f32_v2i32:
850 ; AVX512VLDQ: ## %bb.0:
851 ; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
852 ; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
853 ; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
854 ; AVX512VLDQ-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
855 ; AVX512VLDQ-NEXT: retq
857 ; AVX512VLBW-LABEL: load_v2f32_v2i32:
858 ; AVX512VLBW: ## %bb.0:
859 ; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
860 ; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
861 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
862 ; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
863 ; AVX512VLBW-NEXT: retq
865 ; X86-AVX512-LABEL: load_v2f32_v2i32:
866 ; X86-AVX512: ## %bb.0:
867 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
868 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
869 ; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
870 ; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
871 ; X86-AVX512-NEXT: vblendmps (%eax), %xmm1, %xmm0 {%k1}
872 ; X86-AVX512-NEXT: retl
873 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
874 %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
878 define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, ptr %addr) {
879 ; SSE2-LABEL: load_v2f32_v2i32_undef:
881 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
882 ; SSE2-NEXT: pxor %xmm1, %xmm1
883 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
884 ; SSE2-NEXT: movmskpd %xmm1, %eax
885 ; SSE2-NEXT: testb $1, %al
886 ; SSE2-NEXT: ## implicit-def: $xmm0
887 ; SSE2-NEXT: jne LBB8_1
888 ; SSE2-NEXT: ## %bb.2: ## %else
889 ; SSE2-NEXT: testb $2, %al
890 ; SSE2-NEXT: jne LBB8_3
891 ; SSE2-NEXT: LBB8_4: ## %else2
893 ; SSE2-NEXT: LBB8_1: ## %cond.load
894 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
895 ; SSE2-NEXT: testb $2, %al
896 ; SSE2-NEXT: je LBB8_4
897 ; SSE2-NEXT: LBB8_3: ## %cond.load1
898 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
899 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
900 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
901 ; SSE2-NEXT: movaps %xmm1, %xmm0
904 ; SSE42-LABEL: load_v2f32_v2i32_undef:
906 ; SSE42-NEXT: pxor %xmm1, %xmm1
907 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm1
908 ; SSE42-NEXT: pmovsxdq %xmm1, %xmm0
909 ; SSE42-NEXT: movmskpd %xmm0, %eax
910 ; SSE42-NEXT: testb $1, %al
911 ; SSE42-NEXT: ## implicit-def: $xmm0
912 ; SSE42-NEXT: jne LBB8_1
913 ; SSE42-NEXT: ## %bb.2: ## %else
914 ; SSE42-NEXT: testb $2, %al
915 ; SSE42-NEXT: jne LBB8_3
916 ; SSE42-NEXT: LBB8_4: ## %else2
918 ; SSE42-NEXT: LBB8_1: ## %cond.load
919 ; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
920 ; SSE42-NEXT: testb $2, %al
921 ; SSE42-NEXT: je LBB8_4
922 ; SSE42-NEXT: LBB8_3: ## %cond.load1
923 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
926 ; AVX1OR2-LABEL: load_v2f32_v2i32_undef:
928 ; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
929 ; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
930 ; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
931 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
934 ; AVX512F-LABEL: load_v2f32_v2i32_undef:
936 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
937 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
938 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
939 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
940 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
941 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
942 ; AVX512F-NEXT: vzeroupper
945 ; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef:
946 ; AVX512VLDQ: ## %bb.0:
947 ; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
948 ; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
949 ; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
950 ; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
951 ; AVX512VLDQ-NEXT: retq
953 ; AVX512VLBW-LABEL: load_v2f32_v2i32_undef:
954 ; AVX512VLBW: ## %bb.0:
955 ; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
956 ; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
957 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
958 ; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
959 ; AVX512VLBW-NEXT: retq
961 ; X86-AVX512-LABEL: load_v2f32_v2i32_undef:
962 ; X86-AVX512: ## %bb.0:
963 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
964 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
965 ; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
966 ; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
967 ; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1} {z}
968 ; X86-AVX512-NEXT: retl
969 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
970 %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
974 define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x float> %dst) {
975 ; SSE2-LABEL: load_v4f32_v4i32:
977 ; SSE2-NEXT: pxor %xmm2, %xmm2
978 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
979 ; SSE2-NEXT: movmskps %xmm2, %eax
980 ; SSE2-NEXT: testb $1, %al
981 ; SSE2-NEXT: jne LBB9_1
982 ; SSE2-NEXT: ## %bb.2: ## %else
983 ; SSE2-NEXT: testb $2, %al
984 ; SSE2-NEXT: jne LBB9_3
985 ; SSE2-NEXT: LBB9_4: ## %else2
986 ; SSE2-NEXT: testb $4, %al
987 ; SSE2-NEXT: jne LBB9_5
988 ; SSE2-NEXT: LBB9_6: ## %else5
989 ; SSE2-NEXT: testb $8, %al
990 ; SSE2-NEXT: jne LBB9_7
991 ; SSE2-NEXT: LBB9_8: ## %else8
992 ; SSE2-NEXT: movaps %xmm1, %xmm0
994 ; SSE2-NEXT: LBB9_1: ## %cond.load
995 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
996 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
997 ; SSE2-NEXT: testb $2, %al
998 ; SSE2-NEXT: je LBB9_4
999 ; SSE2-NEXT: LBB9_3: ## %cond.load1
1000 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1001 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1002 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1003 ; SSE2-NEXT: movaps %xmm0, %xmm1
1004 ; SSE2-NEXT: testb $4, %al
1005 ; SSE2-NEXT: je LBB9_6
1006 ; SSE2-NEXT: LBB9_5: ## %cond.load4
1007 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1008 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1009 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
1010 ; SSE2-NEXT: testb $8, %al
1011 ; SSE2-NEXT: je LBB9_8
1012 ; SSE2-NEXT: LBB9_7: ## %cond.load7
1013 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1014 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1015 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1016 ; SSE2-NEXT: movaps %xmm1, %xmm0
1019 ; SSE42-LABEL: load_v4f32_v4i32:
1021 ; SSE42-NEXT: pxor %xmm2, %xmm2
1022 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
1023 ; SSE42-NEXT: movmskps %xmm2, %eax
1024 ; SSE42-NEXT: testb $1, %al
1025 ; SSE42-NEXT: jne LBB9_1
1026 ; SSE42-NEXT: ## %bb.2: ## %else
1027 ; SSE42-NEXT: testb $2, %al
1028 ; SSE42-NEXT: jne LBB9_3
1029 ; SSE42-NEXT: LBB9_4: ## %else2
1030 ; SSE42-NEXT: testb $4, %al
1031 ; SSE42-NEXT: jne LBB9_5
1032 ; SSE42-NEXT: LBB9_6: ## %else5
1033 ; SSE42-NEXT: testb $8, %al
1034 ; SSE42-NEXT: jne LBB9_7
1035 ; SSE42-NEXT: LBB9_8: ## %else8
1036 ; SSE42-NEXT: movaps %xmm1, %xmm0
1038 ; SSE42-NEXT: LBB9_1: ## %cond.load
1039 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1040 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1041 ; SSE42-NEXT: testb $2, %al
1042 ; SSE42-NEXT: je LBB9_4
1043 ; SSE42-NEXT: LBB9_3: ## %cond.load1
1044 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
1045 ; SSE42-NEXT: testb $4, %al
1046 ; SSE42-NEXT: je LBB9_6
1047 ; SSE42-NEXT: LBB9_5: ## %cond.load4
1048 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
1049 ; SSE42-NEXT: testb $8, %al
1050 ; SSE42-NEXT: je LBB9_8
1051 ; SSE42-NEXT: LBB9_7: ## %cond.load7
1052 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
1053 ; SSE42-NEXT: movaps %xmm1, %xmm0
1056 ; AVX1OR2-LABEL: load_v4f32_v4i32:
1057 ; AVX1OR2: ## %bb.0:
1058 ; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1059 ; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
1060 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
1061 ; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
1062 ; AVX1OR2-NEXT: retq
1064 ; AVX512F-LABEL: load_v4f32_v4i32:
1065 ; AVX512F: ## %bb.0:
1066 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1067 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1068 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
1069 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1070 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1071 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
1072 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
1073 ; AVX512F-NEXT: vzeroupper
1074 ; AVX512F-NEXT: retq
1076 ; AVX512VL-LABEL: load_v4f32_v4i32:
1077 ; AVX512VL: ## %bb.0:
1078 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
1079 ; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
1080 ; AVX512VL-NEXT: retq
1082 ; X86-AVX512-LABEL: load_v4f32_v4i32:
1083 ; X86-AVX512: ## %bb.0:
1084 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1085 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
1086 ; X86-AVX512-NEXT: vblendmps (%eax), %xmm1, %xmm0 {%k1}
1087 ; X86-AVX512-NEXT: retl
1088 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
1089 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
1090 ret <4 x float> %res
1093 define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
1094 ; SSE2-LABEL: load_v8f32_v8i1_zero:
1096 ; SSE2-NEXT: psllw $15, %xmm0
1097 ; SSE2-NEXT: packsswb %xmm0, %xmm0
1098 ; SSE2-NEXT: pmovmskb %xmm0, %eax
1099 ; SSE2-NEXT: pxor %xmm0, %xmm0
1100 ; SSE2-NEXT: testb $1, %al
1101 ; SSE2-NEXT: xorps %xmm1, %xmm1
1102 ; SSE2-NEXT: jne LBB10_1
1103 ; SSE2-NEXT: ## %bb.2: ## %else
1104 ; SSE2-NEXT: testb $2, %al
1105 ; SSE2-NEXT: jne LBB10_3
1106 ; SSE2-NEXT: LBB10_4: ## %else2
1107 ; SSE2-NEXT: testb $4, %al
1108 ; SSE2-NEXT: jne LBB10_5
1109 ; SSE2-NEXT: LBB10_6: ## %else5
1110 ; SSE2-NEXT: testb $8, %al
1111 ; SSE2-NEXT: jne LBB10_7
1112 ; SSE2-NEXT: LBB10_8: ## %else8
1113 ; SSE2-NEXT: testb $16, %al
1114 ; SSE2-NEXT: jne LBB10_9
1115 ; SSE2-NEXT: LBB10_10: ## %else11
1116 ; SSE2-NEXT: testb $32, %al
1117 ; SSE2-NEXT: jne LBB10_11
1118 ; SSE2-NEXT: LBB10_12: ## %else14
1119 ; SSE2-NEXT: testb $64, %al
1120 ; SSE2-NEXT: jne LBB10_13
1121 ; SSE2-NEXT: LBB10_14: ## %else17
1122 ; SSE2-NEXT: testb $-128, %al
1123 ; SSE2-NEXT: jne LBB10_15
1124 ; SSE2-NEXT: LBB10_16: ## %else20
1126 ; SSE2-NEXT: LBB10_1: ## %cond.load
1127 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1128 ; SSE2-NEXT: testb $2, %al
1129 ; SSE2-NEXT: je LBB10_4
1130 ; SSE2-NEXT: LBB10_3: ## %cond.load1
1131 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1132 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1133 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
1134 ; SSE2-NEXT: movaps %xmm2, %xmm0
1135 ; SSE2-NEXT: testb $4, %al
1136 ; SSE2-NEXT: je LBB10_6
1137 ; SSE2-NEXT: LBB10_5: ## %cond.load4
1138 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1139 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
1140 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
1141 ; SSE2-NEXT: testb $8, %al
1142 ; SSE2-NEXT: je LBB10_8
1143 ; SSE2-NEXT: LBB10_7: ## %cond.load7
1144 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1145 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1146 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1147 ; SSE2-NEXT: testb $16, %al
1148 ; SSE2-NEXT: je LBB10_10
1149 ; SSE2-NEXT: LBB10_9: ## %cond.load10
1150 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1151 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1152 ; SSE2-NEXT: testb $32, %al
1153 ; SSE2-NEXT: je LBB10_12
1154 ; SSE2-NEXT: LBB10_11: ## %cond.load13
1155 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1156 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1157 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
1158 ; SSE2-NEXT: movaps %xmm2, %xmm1
1159 ; SSE2-NEXT: testb $64, %al
1160 ; SSE2-NEXT: je LBB10_14
1161 ; SSE2-NEXT: LBB10_13: ## %cond.load16
1162 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1163 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
1164 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
1165 ; SSE2-NEXT: testb $-128, %al
1166 ; SSE2-NEXT: je LBB10_16
1167 ; SSE2-NEXT: LBB10_15: ## %cond.load19
1168 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1169 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
1170 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1173 ; SSE42-LABEL: load_v8f32_v8i1_zero:
1175 ; SSE42-NEXT: psllw $15, %xmm0
1176 ; SSE42-NEXT: packsswb %xmm0, %xmm0
1177 ; SSE42-NEXT: pmovmskb %xmm0, %eax
1178 ; SSE42-NEXT: pxor %xmm0, %xmm0
1179 ; SSE42-NEXT: testb $1, %al
1180 ; SSE42-NEXT: xorps %xmm1, %xmm1
1181 ; SSE42-NEXT: jne LBB10_1
1182 ; SSE42-NEXT: ## %bb.2: ## %else
1183 ; SSE42-NEXT: testb $2, %al
1184 ; SSE42-NEXT: jne LBB10_3
1185 ; SSE42-NEXT: LBB10_4: ## %else2
1186 ; SSE42-NEXT: testb $4, %al
1187 ; SSE42-NEXT: jne LBB10_5
1188 ; SSE42-NEXT: LBB10_6: ## %else5
1189 ; SSE42-NEXT: testb $8, %al
1190 ; SSE42-NEXT: jne LBB10_7
1191 ; SSE42-NEXT: LBB10_8: ## %else8
1192 ; SSE42-NEXT: testb $16, %al
1193 ; SSE42-NEXT: jne LBB10_9
1194 ; SSE42-NEXT: LBB10_10: ## %else11
1195 ; SSE42-NEXT: testb $32, %al
1196 ; SSE42-NEXT: jne LBB10_11
1197 ; SSE42-NEXT: LBB10_12: ## %else14
1198 ; SSE42-NEXT: testb $64, %al
1199 ; SSE42-NEXT: jne LBB10_13
1200 ; SSE42-NEXT: LBB10_14: ## %else17
1201 ; SSE42-NEXT: testb $-128, %al
1202 ; SSE42-NEXT: jne LBB10_15
1203 ; SSE42-NEXT: LBB10_16: ## %else20
1205 ; SSE42-NEXT: LBB10_1: ## %cond.load
1206 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1207 ; SSE42-NEXT: testb $2, %al
1208 ; SSE42-NEXT: je LBB10_4
1209 ; SSE42-NEXT: LBB10_3: ## %cond.load1
1210 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1211 ; SSE42-NEXT: testb $4, %al
1212 ; SSE42-NEXT: je LBB10_6
1213 ; SSE42-NEXT: LBB10_5: ## %cond.load4
1214 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1215 ; SSE42-NEXT: testb $8, %al
1216 ; SSE42-NEXT: je LBB10_8
1217 ; SSE42-NEXT: LBB10_7: ## %cond.load7
1218 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1219 ; SSE42-NEXT: testb $16, %al
1220 ; SSE42-NEXT: je LBB10_10
1221 ; SSE42-NEXT: LBB10_9: ## %cond.load10
1222 ; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1223 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
1224 ; SSE42-NEXT: testb $32, %al
1225 ; SSE42-NEXT: je LBB10_12
1226 ; SSE42-NEXT: LBB10_11: ## %cond.load13
1227 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
1228 ; SSE42-NEXT: testb $64, %al
1229 ; SSE42-NEXT: je LBB10_14
1230 ; SSE42-NEXT: LBB10_13: ## %cond.load16
1231 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
1232 ; SSE42-NEXT: testb $-128, %al
1233 ; SSE42-NEXT: je LBB10_16
1234 ; SSE42-NEXT: LBB10_15: ## %cond.load19
1235 ; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
1238 ; AVX1-LABEL: load_v8f32_v8i1_zero:
1240 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1241 ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
1242 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1243 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
1244 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1245 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
1248 ; AVX2-LABEL: load_v8f32_v8i1_zero:
1250 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1251 ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
1252 ; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
1255 ; AVX512F-LABEL: load_v8f32_v8i1_zero:
1256 ; AVX512F: ## %bb.0:
1257 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
1258 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
1259 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
1260 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
1261 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
1262 ; AVX512F-NEXT: retq
1264 ; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero:
1265 ; AVX512VLDQ: ## %bb.0:
1266 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
1267 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
1268 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
1269 ; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
1270 ; AVX512VLDQ-NEXT: retq
1272 ; AVX512VLBW-LABEL: load_v8f32_v8i1_zero:
1273 ; AVX512VLBW: ## %bb.0:
1274 ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
1275 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
1276 ; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
1277 ; AVX512VLBW-NEXT: retq
1279 ; X86-AVX512-LABEL: load_v8f32_v8i1_zero:
1280 ; X86-AVX512: ## %bb.0:
1281 ; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0
1282 ; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
1283 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1284 ; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
1285 ; X86-AVX512-NEXT: retl
1286 %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
1287 ret <8 x float> %res
1290 define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
1291 ; SSE2-LABEL: load_v8f32_v8i32:
1293 ; SSE2-NEXT: pxor %xmm4, %xmm4
1294 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
1295 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
1296 ; SSE2-NEXT: packssdw %xmm1, %xmm0
1297 ; SSE2-NEXT: packsswb %xmm0, %xmm0
1298 ; SSE2-NEXT: pmovmskb %xmm0, %eax
1299 ; SSE2-NEXT: testb $1, %al
1300 ; SSE2-NEXT: jne LBB11_1
1301 ; SSE2-NEXT: ## %bb.2: ## %else
1302 ; SSE2-NEXT: testb $2, %al
1303 ; SSE2-NEXT: jne LBB11_3
1304 ; SSE2-NEXT: LBB11_4: ## %else2
1305 ; SSE2-NEXT: testb $4, %al
1306 ; SSE2-NEXT: jne LBB11_5
1307 ; SSE2-NEXT: LBB11_6: ## %else5
1308 ; SSE2-NEXT: testb $8, %al
1309 ; SSE2-NEXT: jne LBB11_7
1310 ; SSE2-NEXT: LBB11_8: ## %else8
1311 ; SSE2-NEXT: testb $16, %al
1312 ; SSE2-NEXT: jne LBB11_9
1313 ; SSE2-NEXT: LBB11_10: ## %else11
1314 ; SSE2-NEXT: testb $32, %al
1315 ; SSE2-NEXT: jne LBB11_11
1316 ; SSE2-NEXT: LBB11_12: ## %else14
1317 ; SSE2-NEXT: testb $64, %al
1318 ; SSE2-NEXT: jne LBB11_13
1319 ; SSE2-NEXT: LBB11_14: ## %else17
1320 ; SSE2-NEXT: testb $-128, %al
1321 ; SSE2-NEXT: je LBB11_16
1322 ; SSE2-NEXT: LBB11_15: ## %cond.load19
1323 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1324 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
1325 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
1326 ; SSE2-NEXT: LBB11_16: ## %else20
1327 ; SSE2-NEXT: movaps %xmm2, %xmm0
1328 ; SSE2-NEXT: movaps %xmm3, %xmm1
1330 ; SSE2-NEXT: LBB11_1: ## %cond.load
1331 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1332 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
1333 ; SSE2-NEXT: testb $2, %al
1334 ; SSE2-NEXT: je LBB11_4
1335 ; SSE2-NEXT: LBB11_3: ## %cond.load1
1336 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1337 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1338 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
1339 ; SSE2-NEXT: movaps %xmm0, %xmm2
1340 ; SSE2-NEXT: testb $4, %al
1341 ; SSE2-NEXT: je LBB11_6
1342 ; SSE2-NEXT: LBB11_5: ## %cond.load4
1343 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1344 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
1345 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
1346 ; SSE2-NEXT: testb $8, %al
1347 ; SSE2-NEXT: je LBB11_8
1348 ; SSE2-NEXT: LBB11_7: ## %cond.load7
1349 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1350 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1351 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
1352 ; SSE2-NEXT: testb $16, %al
1353 ; SSE2-NEXT: je LBB11_10
1354 ; SSE2-NEXT: LBB11_9: ## %cond.load10
1355 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1356 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
1357 ; SSE2-NEXT: testb $32, %al
1358 ; SSE2-NEXT: je LBB11_12
1359 ; SSE2-NEXT: LBB11_11: ## %cond.load13
1360 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1361 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
1362 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
1363 ; SSE2-NEXT: movaps %xmm0, %xmm3
1364 ; SSE2-NEXT: testb $64, %al
1365 ; SSE2-NEXT: je LBB11_14
1366 ; SSE2-NEXT: LBB11_13: ## %cond.load16
1367 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1368 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0]
1369 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
1370 ; SSE2-NEXT: testb $-128, %al
1371 ; SSE2-NEXT: jne LBB11_15
1372 ; SSE2-NEXT: jmp LBB11_16
1374 ; SSE42-LABEL: load_v8f32_v8i32:
1376 ; SSE42-NEXT: pxor %xmm4, %xmm4
1377 ; SSE42-NEXT: pcmpeqd %xmm4, %xmm1
1378 ; SSE42-NEXT: pcmpeqd %xmm4, %xmm0
1379 ; SSE42-NEXT: packssdw %xmm1, %xmm0
1380 ; SSE42-NEXT: packsswb %xmm0, %xmm0
1381 ; SSE42-NEXT: pmovmskb %xmm0, %eax
1382 ; SSE42-NEXT: testb $1, %al
1383 ; SSE42-NEXT: jne LBB11_1
1384 ; SSE42-NEXT: ## %bb.2: ## %else
1385 ; SSE42-NEXT: testb $2, %al
1386 ; SSE42-NEXT: jne LBB11_3
1387 ; SSE42-NEXT: LBB11_4: ## %else2
1388 ; SSE42-NEXT: testb $4, %al
1389 ; SSE42-NEXT: jne LBB11_5
1390 ; SSE42-NEXT: LBB11_6: ## %else5
1391 ; SSE42-NEXT: testb $8, %al
1392 ; SSE42-NEXT: jne LBB11_7
1393 ; SSE42-NEXT: LBB11_8: ## %else8
1394 ; SSE42-NEXT: testb $16, %al
1395 ; SSE42-NEXT: jne LBB11_9
1396 ; SSE42-NEXT: LBB11_10: ## %else11
1397 ; SSE42-NEXT: testb $32, %al
1398 ; SSE42-NEXT: jne LBB11_11
1399 ; SSE42-NEXT: LBB11_12: ## %else14
1400 ; SSE42-NEXT: testb $64, %al
1401 ; SSE42-NEXT: jne LBB11_13
1402 ; SSE42-NEXT: LBB11_14: ## %else17
1403 ; SSE42-NEXT: testb $-128, %al
1404 ; SSE42-NEXT: je LBB11_16
1405 ; SSE42-NEXT: LBB11_15: ## %cond.load19
1406 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
1407 ; SSE42-NEXT: LBB11_16: ## %else20
1408 ; SSE42-NEXT: movaps %xmm2, %xmm0
1409 ; SSE42-NEXT: movaps %xmm3, %xmm1
1411 ; SSE42-NEXT: LBB11_1: ## %cond.load
1412 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1413 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
1414 ; SSE42-NEXT: testb $2, %al
1415 ; SSE42-NEXT: je LBB11_4
1416 ; SSE42-NEXT: LBB11_3: ## %cond.load1
1417 ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1418 ; SSE42-NEXT: testb $4, %al
1419 ; SSE42-NEXT: je LBB11_6
1420 ; SSE42-NEXT: LBB11_5: ## %cond.load4
1421 ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
1422 ; SSE42-NEXT: testb $8, %al
1423 ; SSE42-NEXT: je LBB11_8
1424 ; SSE42-NEXT: LBB11_7: ## %cond.load7
1425 ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
1426 ; SSE42-NEXT: testb $16, %al
1427 ; SSE42-NEXT: je LBB11_10
1428 ; SSE42-NEXT: LBB11_9: ## %cond.load10
1429 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1430 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
1431 ; SSE42-NEXT: testb $32, %al
1432 ; SSE42-NEXT: je LBB11_12
1433 ; SSE42-NEXT: LBB11_11: ## %cond.load13
1434 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
1435 ; SSE42-NEXT: testb $64, %al
1436 ; SSE42-NEXT: je LBB11_14
1437 ; SSE42-NEXT: LBB11_13: ## %cond.load16
1438 ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
1439 ; SSE42-NEXT: testb $-128, %al
1440 ; SSE42-NEXT: jne LBB11_15
1441 ; SSE42-NEXT: jmp LBB11_16
1443 ; AVX1-LABEL: load_v8f32_v8i32:
1445 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1446 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1447 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1448 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
1449 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1450 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
1451 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
1454 ; AVX2-LABEL: load_v8f32_v8i32:
1456 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1457 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
1458 ; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
1459 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
1462 ; AVX512F-LABEL: load_v8f32_v8i32:
1463 ; AVX512F: ## %bb.0:
1464 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
1465 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
1466 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
1467 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
1468 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
1469 ; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
1470 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
1471 ; AVX512F-NEXT: retq
1473 ; AVX512VL-LABEL: load_v8f32_v8i32:
1474 ; AVX512VL: ## %bb.0:
1475 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
1476 ; AVX512VL-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
1477 ; AVX512VL-NEXT: retq
1479 ; X86-AVX512-LABEL: load_v8f32_v8i32:
1480 ; X86-AVX512: ## %bb.0:
1481 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1482 ; X86-AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k1
1483 ; X86-AVX512-NEXT: vblendmps (%eax), %ymm1, %ymm0 {%k1}
1484 ; X86-AVX512-NEXT: retl
1485 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
1486 %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
1487 ret <8 x float> %res
1495 define <1 x i64> @load_v1i64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x i64> %dst) {
1496 ; SSE-LABEL: load_v1i64_v1i64:
1498 ; SSE-NEXT: testq %rdi, %rdi
1499 ; SSE-NEXT: jne LBB12_1
1500 ; SSE-NEXT: ## %bb.2: ## %cond.load
1501 ; SSE-NEXT: movq (%rsi), %rax
1503 ; SSE-NEXT: LBB12_1:
1504 ; SSE-NEXT: movq %rdx, %rax
1507 ; AVX-LABEL: load_v1i64_v1i64:
1509 ; AVX-NEXT: testq %rdi, %rdi
1510 ; AVX-NEXT: jne LBB12_1
1511 ; AVX-NEXT: ## %bb.2: ## %cond.load
1512 ; AVX-NEXT: movq (%rsi), %rax
1514 ; AVX-NEXT: LBB12_1:
1515 ; AVX-NEXT: movq %rdx, %rax
1518 ; X86-AVX512-LABEL: load_v1i64_v1i64:
1519 ; X86-AVX512: ## %bb.0:
1520 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1521 ; X86-AVX512-NEXT: orl {{[0-9]+}}(%esp), %eax
1522 ; X86-AVX512-NEXT: jne LBB12_1
1523 ; X86-AVX512-NEXT: ## %bb.2: ## %cond.load
1524 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1525 ; X86-AVX512-NEXT: movl (%ecx), %eax
1526 ; X86-AVX512-NEXT: movl 4(%ecx), %edx
1527 ; X86-AVX512-NEXT: retl
1528 ; X86-AVX512-NEXT: LBB12_1:
1529 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
1530 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1531 ; X86-AVX512-NEXT: retl
1532 %mask = icmp eq <1 x i64> %trigger, zeroinitializer
1533 %res = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %addr, i32 4, <1 x i1> %mask, <1 x i64> %dst)
1537 define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %dst) {
1538 ; SSE2-LABEL: load_v2i64_v2i64:
1540 ; SSE2-NEXT: pxor %xmm2, %xmm2
1541 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
1542 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
1543 ; SSE2-NEXT: pand %xmm2, %xmm0
1544 ; SSE2-NEXT: movmskpd %xmm0, %eax
1545 ; SSE2-NEXT: testb $1, %al
1546 ; SSE2-NEXT: jne LBB13_1
1547 ; SSE2-NEXT: ## %bb.2: ## %else
1548 ; SSE2-NEXT: testb $2, %al
1549 ; SSE2-NEXT: jne LBB13_3
1550 ; SSE2-NEXT: LBB13_4: ## %else2
1551 ; SSE2-NEXT: movaps %xmm1, %xmm0
1553 ; SSE2-NEXT: LBB13_1: ## %cond.load
1554 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
1555 ; SSE2-NEXT: testb $2, %al
1556 ; SSE2-NEXT: je LBB13_4
1557 ; SSE2-NEXT: LBB13_3: ## %cond.load1
1558 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1559 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1560 ; SSE2-NEXT: movaps %xmm1, %xmm0
1563 ; SSE42-LABEL: load_v2i64_v2i64:
1565 ; SSE42-NEXT: pxor %xmm2, %xmm2
1566 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
1567 ; SSE42-NEXT: movmskpd %xmm2, %eax
1568 ; SSE42-NEXT: testb $1, %al
1569 ; SSE42-NEXT: jne LBB13_1
1570 ; SSE42-NEXT: ## %bb.2: ## %else
1571 ; SSE42-NEXT: testb $2, %al
1572 ; SSE42-NEXT: jne LBB13_3
1573 ; SSE42-NEXT: LBB13_4: ## %else2
1574 ; SSE42-NEXT: movdqa %xmm1, %xmm0
1576 ; SSE42-NEXT: LBB13_1: ## %cond.load
1577 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1
1578 ; SSE42-NEXT: testb $2, %al
1579 ; SSE42-NEXT: je LBB13_4
1580 ; SSE42-NEXT: LBB13_3: ## %cond.load1
1581 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1
1582 ; SSE42-NEXT: movdqa %xmm1, %xmm0
1585 ; AVX1-LABEL: load_v2i64_v2i64:
1587 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1588 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
1589 ; AVX1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
1590 ; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
1593 ; AVX2-LABEL: load_v2i64_v2i64:
1595 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1596 ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
1597 ; AVX2-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2
1598 ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
1601 ; AVX512F-LABEL: load_v2i64_v2i64:
1602 ; AVX512F: ## %bb.0:
1603 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1604 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1605 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
1606 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
1607 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
1608 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1609 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
1610 ; AVX512F-NEXT: vzeroupper
1611 ; AVX512F-NEXT: retq
1613 ; AVX512VL-LABEL: load_v2i64_v2i64:
1614 ; AVX512VL: ## %bb.0:
1615 ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
1616 ; AVX512VL-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
1617 ; AVX512VL-NEXT: retq
1619 ; X86-AVX512-LABEL: load_v2i64_v2i64:
1620 ; X86-AVX512: ## %bb.0:
1621 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1622 ; X86-AVX512-NEXT: vptestnmq %xmm0, %xmm0, %k1
1623 ; X86-AVX512-NEXT: vpblendmq (%eax), %xmm1, %xmm0 {%k1}
1624 ; X86-AVX512-NEXT: retl
1625 %mask = icmp eq <2 x i64> %trigger, zeroinitializer
1626 %res = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst)
1630 define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %dst) {
1631 ; SSE2-LABEL: load_v4i64_v4i64:
1633 ; SSE2-NEXT: pxor %xmm4, %xmm4
1634 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
1635 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
1636 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1637 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3]
1638 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1639 ; SSE2-NEXT: andps %xmm4, %xmm0
1640 ; SSE2-NEXT: movmskps %xmm0, %eax
1641 ; SSE2-NEXT: testb $1, %al
1642 ; SSE2-NEXT: jne LBB14_1
1643 ; SSE2-NEXT: ## %bb.2: ## %else
1644 ; SSE2-NEXT: testb $2, %al
1645 ; SSE2-NEXT: jne LBB14_3
1646 ; SSE2-NEXT: LBB14_4: ## %else2
1647 ; SSE2-NEXT: testb $4, %al
1648 ; SSE2-NEXT: jne LBB14_5
1649 ; SSE2-NEXT: LBB14_6: ## %else5
1650 ; SSE2-NEXT: testb $8, %al
1651 ; SSE2-NEXT: je LBB14_8
1652 ; SSE2-NEXT: LBB14_7: ## %cond.load7
1653 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1654 ; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1655 ; SSE2-NEXT: LBB14_8: ## %else8
1656 ; SSE2-NEXT: movaps %xmm2, %xmm0
1657 ; SSE2-NEXT: movaps %xmm3, %xmm1
1659 ; SSE2-NEXT: LBB14_1: ## %cond.load
1660 ; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1661 ; SSE2-NEXT: testb $2, %al
1662 ; SSE2-NEXT: je LBB14_4
1663 ; SSE2-NEXT: LBB14_3: ## %cond.load1
1664 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1665 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1666 ; SSE2-NEXT: testb $4, %al
1667 ; SSE2-NEXT: je LBB14_6
1668 ; SSE2-NEXT: LBB14_5: ## %cond.load4
1669 ; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
1670 ; SSE2-NEXT: testb $8, %al
1671 ; SSE2-NEXT: jne LBB14_7
1672 ; SSE2-NEXT: jmp LBB14_8
1674 ; SSE42-LABEL: load_v4i64_v4i64:
1676 ; SSE42-NEXT: pxor %xmm4, %xmm4
1677 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
1678 ; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
1679 ; SSE42-NEXT: packssdw %xmm1, %xmm0
1680 ; SSE42-NEXT: movmskps %xmm0, %eax
1681 ; SSE42-NEXT: testb $1, %al
1682 ; SSE42-NEXT: jne LBB14_1
1683 ; SSE42-NEXT: ## %bb.2: ## %else
1684 ; SSE42-NEXT: testb $2, %al
1685 ; SSE42-NEXT: jne LBB14_3
1686 ; SSE42-NEXT: LBB14_4: ## %else2
1687 ; SSE42-NEXT: testb $4, %al
1688 ; SSE42-NEXT: jne LBB14_5
1689 ; SSE42-NEXT: LBB14_6: ## %else5
1690 ; SSE42-NEXT: testb $8, %al
1691 ; SSE42-NEXT: je LBB14_8
1692 ; SSE42-NEXT: LBB14_7: ## %cond.load7
1693 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm3
1694 ; SSE42-NEXT: LBB14_8: ## %else8
1695 ; SSE42-NEXT: movdqa %xmm2, %xmm0
1696 ; SSE42-NEXT: movdqa %xmm3, %xmm1
1698 ; SSE42-NEXT: LBB14_1: ## %cond.load
1699 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm2
1700 ; SSE42-NEXT: testb $2, %al
1701 ; SSE42-NEXT: je LBB14_4
1702 ; SSE42-NEXT: LBB14_3: ## %cond.load1
1703 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm2
1704 ; SSE42-NEXT: testb $4, %al
1705 ; SSE42-NEXT: je LBB14_6
1706 ; SSE42-NEXT: LBB14_5: ## %cond.load4
1707 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm3
1708 ; SSE42-NEXT: testb $8, %al
1709 ; SSE42-NEXT: jne LBB14_7
1710 ; SSE42-NEXT: jmp LBB14_8
1712 ; AVX1-LABEL: load_v4i64_v4i64:
1714 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1715 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1716 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1717 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
1718 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1719 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
1720 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
1723 ; AVX2-LABEL: load_v4i64_v4i64:
1725 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1726 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
1727 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2
1728 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
1731 ; AVX512F-LABEL: load_v4i64_v4i64:
1732 ; AVX512F: ## %bb.0:
1733 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
1734 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
1735 ; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
1736 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1737 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1738 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1739 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
1740 ; AVX512F-NEXT: retq
1742 ; AVX512VL-LABEL: load_v4i64_v4i64:
1743 ; AVX512VL: ## %bb.0:
1744 ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
1745 ; AVX512VL-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
1746 ; AVX512VL-NEXT: retq
1748 ; X86-AVX512-LABEL: load_v4i64_v4i64:
1749 ; X86-AVX512: ## %bb.0:
1750 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1751 ; X86-AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k1
1752 ; X86-AVX512-NEXT: vpblendmq (%eax), %ymm1, %ymm0 {%k1}
1753 ; X86-AVX512-NEXT: retl
1754 %mask = icmp eq <4 x i64> %trigger, zeroinitializer
1755 %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst)
1759 define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i64> %dst) {
1760 ; SSE2-LABEL: load_v8i64_v8i16:
1762 ; SSE2-NEXT: pxor %xmm5, %xmm5
1763 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
1764 ; SSE2-NEXT: packsswb %xmm5, %xmm5
1765 ; SSE2-NEXT: pmovmskb %xmm5, %eax
1766 ; SSE2-NEXT: testb $1, %al
1767 ; SSE2-NEXT: jne LBB15_1
1768 ; SSE2-NEXT: ## %bb.2: ## %else
1769 ; SSE2-NEXT: testb $2, %al
1770 ; SSE2-NEXT: jne LBB15_3
1771 ; SSE2-NEXT: LBB15_4: ## %else2
1772 ; SSE2-NEXT: testb $4, %al
1773 ; SSE2-NEXT: jne LBB15_5
1774 ; SSE2-NEXT: LBB15_6: ## %else5
1775 ; SSE2-NEXT: testb $8, %al
1776 ; SSE2-NEXT: jne LBB15_7
1777 ; SSE2-NEXT: LBB15_8: ## %else8
1778 ; SSE2-NEXT: testb $16, %al
1779 ; SSE2-NEXT: jne LBB15_9
1780 ; SSE2-NEXT: LBB15_10: ## %else11
1781 ; SSE2-NEXT: testb $32, %al
1782 ; SSE2-NEXT: jne LBB15_11
1783 ; SSE2-NEXT: LBB15_12: ## %else14
1784 ; SSE2-NEXT: testb $64, %al
1785 ; SSE2-NEXT: jne LBB15_13
1786 ; SSE2-NEXT: LBB15_14: ## %else17
1787 ; SSE2-NEXT: testb $-128, %al
1788 ; SSE2-NEXT: je LBB15_16
1789 ; SSE2-NEXT: LBB15_15: ## %cond.load19
1790 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1791 ; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
1792 ; SSE2-NEXT: LBB15_16: ## %else20
1793 ; SSE2-NEXT: movaps %xmm1, %xmm0
1794 ; SSE2-NEXT: movaps %xmm2, %xmm1
1795 ; SSE2-NEXT: movaps %xmm3, %xmm2
1796 ; SSE2-NEXT: movaps %xmm4, %xmm3
1798 ; SSE2-NEXT: LBB15_1: ## %cond.load
1799 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
1800 ; SSE2-NEXT: testb $2, %al
1801 ; SSE2-NEXT: je LBB15_4
1802 ; SSE2-NEXT: LBB15_3: ## %cond.load1
1803 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1804 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1805 ; SSE2-NEXT: testb $4, %al
1806 ; SSE2-NEXT: je LBB15_6
1807 ; SSE2-NEXT: LBB15_5: ## %cond.load4
1808 ; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1809 ; SSE2-NEXT: testb $8, %al
1810 ; SSE2-NEXT: je LBB15_8
1811 ; SSE2-NEXT: LBB15_7: ## %cond.load7
1812 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1813 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1814 ; SSE2-NEXT: testb $16, %al
1815 ; SSE2-NEXT: je LBB15_10
1816 ; SSE2-NEXT: LBB15_9: ## %cond.load10
1817 ; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
1818 ; SSE2-NEXT: testb $32, %al
1819 ; SSE2-NEXT: je LBB15_12
1820 ; SSE2-NEXT: LBB15_11: ## %cond.load13
1821 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1822 ; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1823 ; SSE2-NEXT: testb $64, %al
1824 ; SSE2-NEXT: je LBB15_14
1825 ; SSE2-NEXT: LBB15_13: ## %cond.load16
1826 ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
1827 ; SSE2-NEXT: testb $-128, %al
1828 ; SSE2-NEXT: jne LBB15_15
1829 ; SSE2-NEXT: jmp LBB15_16
1831 ; SSE42-LABEL: load_v8i64_v8i16:
1833 ; SSE42-NEXT: pxor %xmm5, %xmm5
1834 ; SSE42-NEXT: pcmpeqw %xmm0, %xmm5
1835 ; SSE42-NEXT: packsswb %xmm5, %xmm5
1836 ; SSE42-NEXT: pmovmskb %xmm5, %eax
1837 ; SSE42-NEXT: testb $1, %al
1838 ; SSE42-NEXT: jne LBB15_1
1839 ; SSE42-NEXT: ## %bb.2: ## %else
1840 ; SSE42-NEXT: testb $2, %al
1841 ; SSE42-NEXT: jne LBB15_3
1842 ; SSE42-NEXT: LBB15_4: ## %else2
1843 ; SSE42-NEXT: testb $4, %al
1844 ; SSE42-NEXT: jne LBB15_5
1845 ; SSE42-NEXT: LBB15_6: ## %else5
1846 ; SSE42-NEXT: testb $8, %al
1847 ; SSE42-NEXT: jne LBB15_7
1848 ; SSE42-NEXT: LBB15_8: ## %else8
1849 ; SSE42-NEXT: testb $16, %al
1850 ; SSE42-NEXT: jne LBB15_9
1851 ; SSE42-NEXT: LBB15_10: ## %else11
1852 ; SSE42-NEXT: testb $32, %al
1853 ; SSE42-NEXT: jne LBB15_11
1854 ; SSE42-NEXT: LBB15_12: ## %else14
1855 ; SSE42-NEXT: testb $64, %al
1856 ; SSE42-NEXT: jne LBB15_13
1857 ; SSE42-NEXT: LBB15_14: ## %else17
1858 ; SSE42-NEXT: testb $-128, %al
1859 ; SSE42-NEXT: je LBB15_16
1860 ; SSE42-NEXT: LBB15_15: ## %cond.load19
1861 ; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm4
1862 ; SSE42-NEXT: LBB15_16: ## %else20
1863 ; SSE42-NEXT: movdqa %xmm1, %xmm0
1864 ; SSE42-NEXT: movdqa %xmm2, %xmm1
1865 ; SSE42-NEXT: movdqa %xmm3, %xmm2
1866 ; SSE42-NEXT: movdqa %xmm4, %xmm3
1868 ; SSE42-NEXT: LBB15_1: ## %cond.load
1869 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm1
1870 ; SSE42-NEXT: testb $2, %al
1871 ; SSE42-NEXT: je LBB15_4
1872 ; SSE42-NEXT: LBB15_3: ## %cond.load1
1873 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm1
1874 ; SSE42-NEXT: testb $4, %al
1875 ; SSE42-NEXT: je LBB15_6
1876 ; SSE42-NEXT: LBB15_5: ## %cond.load4
1877 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm2
1878 ; SSE42-NEXT: testb $8, %al
1879 ; SSE42-NEXT: je LBB15_8
1880 ; SSE42-NEXT: LBB15_7: ## %cond.load7
1881 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm2
1882 ; SSE42-NEXT: testb $16, %al
1883 ; SSE42-NEXT: je LBB15_10
1884 ; SSE42-NEXT: LBB15_9: ## %cond.load10
1885 ; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm3
1886 ; SSE42-NEXT: testb $32, %al
1887 ; SSE42-NEXT: je LBB15_12
1888 ; SSE42-NEXT: LBB15_11: ## %cond.load13
1889 ; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm3
1890 ; SSE42-NEXT: testb $64, %al
1891 ; SSE42-NEXT: je LBB15_14
1892 ; SSE42-NEXT: LBB15_13: ## %cond.load16
1893 ; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm4
1894 ; SSE42-NEXT: testb $-128, %al
1895 ; SSE42-NEXT: jne LBB15_15
1896 ; SSE42-NEXT: jmp LBB15_16
1898 ; AVX1-LABEL: load_v8i64_v8i16:
1900 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
1901 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
1902 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
1903 ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm5
1904 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
1905 ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3
1906 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
1907 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
1908 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
1909 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1910 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
1911 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
1912 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
1913 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
1914 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
1915 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1918 ; AVX2-LABEL: load_v8i64_v8i16:
1920 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
1921 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1922 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
1923 ; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3
1924 ; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
1925 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
1926 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
1927 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
1928 ; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1
1929 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
1932 ; AVX512F-LABEL: load_v8i64_v8i16:
1933 ; AVX512F: ## %bb.0:
1934 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1935 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
1936 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
1937 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
1938 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1939 ; AVX512F-NEXT: retq
1941 ; AVX512VLDQ-LABEL: load_v8i64_v8i16:
1942 ; AVX512VLDQ: ## %bb.0:
1943 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
1944 ; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
1945 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
1946 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
1947 ; AVX512VLDQ-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1948 ; AVX512VLDQ-NEXT: retq
1950 ; AVX512VLBW-LABEL: load_v8i64_v8i16:
1951 ; AVX512VLBW: ## %bb.0:
1952 ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
1953 ; AVX512VLBW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
1954 ; AVX512VLBW-NEXT: retq
1956 ; X86-AVX512-LABEL: load_v8i64_v8i16:
1957 ; X86-AVX512: ## %bb.0:
1958 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1959 ; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1
1960 ; X86-AVX512-NEXT: vpblendmq (%eax), %zmm1, %zmm0 {%k1}
1961 ; X86-AVX512-NEXT: retl
1962 %mask = icmp eq <8 x i16> %trigger, zeroinitializer
1963 %res = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
1967 define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst) {
1968 ; SSE2-LABEL: load_v8i64_v8i64:
1970 ; SSE2-NEXT: pxor %xmm8, %xmm8
1971 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
1972 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2]
1973 ; SSE2-NEXT: pand %xmm3, %xmm9
1974 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
1975 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
1976 ; SSE2-NEXT: pand %xmm2, %xmm3
1977 ; SSE2-NEXT: packssdw %xmm9, %xmm3
1978 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
1979 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
1980 ; SSE2-NEXT: pand %xmm1, %xmm2
1981 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
1982 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
1983 ; SSE2-NEXT: pand %xmm0, %xmm1
1984 ; SSE2-NEXT: packssdw %xmm2, %xmm1
1985 ; SSE2-NEXT: packssdw %xmm3, %xmm1
1986 ; SSE2-NEXT: packsswb %xmm1, %xmm1
1987 ; SSE2-NEXT: pmovmskb %xmm1, %eax
1988 ; SSE2-NEXT: testb $1, %al
1989 ; SSE2-NEXT: jne LBB16_1
1990 ; SSE2-NEXT: ## %bb.2: ## %else
1991 ; SSE2-NEXT: testb $2, %al
1992 ; SSE2-NEXT: jne LBB16_3
1993 ; SSE2-NEXT: LBB16_4: ## %else2
1994 ; SSE2-NEXT: testb $4, %al
1995 ; SSE2-NEXT: jne LBB16_5
1996 ; SSE2-NEXT: LBB16_6: ## %else5
1997 ; SSE2-NEXT: testb $8, %al
1998 ; SSE2-NEXT: jne LBB16_7
1999 ; SSE2-NEXT: LBB16_8: ## %else8
2000 ; SSE2-NEXT: testb $16, %al
2001 ; SSE2-NEXT: jne LBB16_9
2002 ; SSE2-NEXT: LBB16_10: ## %else11
2003 ; SSE2-NEXT: testb $32, %al
2004 ; SSE2-NEXT: jne LBB16_11
2005 ; SSE2-NEXT: LBB16_12: ## %else14
2006 ; SSE2-NEXT: testb $64, %al
2007 ; SSE2-NEXT: jne LBB16_13
2008 ; SSE2-NEXT: LBB16_14: ## %else17
2009 ; SSE2-NEXT: testb $-128, %al
2010 ; SSE2-NEXT: je LBB16_16
2011 ; SSE2-NEXT: LBB16_15: ## %cond.load19
2012 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2013 ; SSE2-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
2014 ; SSE2-NEXT: LBB16_16: ## %else20
2015 ; SSE2-NEXT: movaps %xmm4, %xmm0
2016 ; SSE2-NEXT: movaps %xmm5, %xmm1
2017 ; SSE2-NEXT: movaps %xmm6, %xmm2
2018 ; SSE2-NEXT: movaps %xmm7, %xmm3
2020 ; SSE2-NEXT: LBB16_1: ## %cond.load
2021 ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
2022 ; SSE2-NEXT: testb $2, %al
2023 ; SSE2-NEXT: je LBB16_4
2024 ; SSE2-NEXT: LBB16_3: ## %cond.load1
2025 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2026 ; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
2027 ; SSE2-NEXT: testb $4, %al
2028 ; SSE2-NEXT: je LBB16_6
2029 ; SSE2-NEXT: LBB16_5: ## %cond.load4
2030 ; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
2031 ; SSE2-NEXT: testb $8, %al
2032 ; SSE2-NEXT: je LBB16_8
2033 ; SSE2-NEXT: LBB16_7: ## %cond.load7
2034 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2035 ; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
2036 ; SSE2-NEXT: testb $16, %al
2037 ; SSE2-NEXT: je LBB16_10
2038 ; SSE2-NEXT: LBB16_9: ## %cond.load10
2039 ; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
2040 ; SSE2-NEXT: testb $32, %al
2041 ; SSE2-NEXT: je LBB16_12
2042 ; SSE2-NEXT: LBB16_11: ## %cond.load13
2043 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
2044 ; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
2045 ; SSE2-NEXT: testb $64, %al
2046 ; SSE2-NEXT: je LBB16_14
2047 ; SSE2-NEXT: LBB16_13: ## %cond.load16
2048 ; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
2049 ; SSE2-NEXT: testb $-128, %al
2050 ; SSE2-NEXT: jne LBB16_15
2051 ; SSE2-NEXT: jmp LBB16_16
2053 ; SSE42-LABEL: load_v8i64_v8i64:
2055 ; SSE42-NEXT: pxor %xmm8, %xmm8
2056 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm3
2057 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm2
2058 ; SSE42-NEXT: packssdw %xmm3, %xmm2
2059 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm1
2060 ; SSE42-NEXT: pcmpeqq %xmm8, %xmm0
2061 ; SSE42-NEXT: packssdw %xmm1, %xmm0
2062 ; SSE42-NEXT: packssdw %xmm2, %xmm0
2063 ; SSE42-NEXT: packsswb %xmm0, %xmm0
2064 ; SSE42-NEXT: pmovmskb %xmm0, %eax
2065 ; SSE42-NEXT: testb $1, %al
2066 ; SSE42-NEXT: jne LBB16_1
2067 ; SSE42-NEXT: ## %bb.2: ## %else
2068 ; SSE42-NEXT: testb $2, %al
2069 ; SSE42-NEXT: jne LBB16_3
2070 ; SSE42-NEXT: LBB16_4: ## %else2
2071 ; SSE42-NEXT: testb $4, %al
2072 ; SSE42-NEXT: jne LBB16_5
2073 ; SSE42-NEXT: LBB16_6: ## %else5
2074 ; SSE42-NEXT: testb $8, %al
2075 ; SSE42-NEXT: jne LBB16_7
2076 ; SSE42-NEXT: LBB16_8: ## %else8
2077 ; SSE42-NEXT: testb $16, %al
2078 ; SSE42-NEXT: jne LBB16_9
2079 ; SSE42-NEXT: LBB16_10: ## %else11
2080 ; SSE42-NEXT: testb $32, %al
2081 ; SSE42-NEXT: jne LBB16_11
2082 ; SSE42-NEXT: LBB16_12: ## %else14
2083 ; SSE42-NEXT: testb $64, %al
2084 ; SSE42-NEXT: jne LBB16_13
2085 ; SSE42-NEXT: LBB16_14: ## %else17
2086 ; SSE42-NEXT: testb $-128, %al
2087 ; SSE42-NEXT: je LBB16_16
2088 ; SSE42-NEXT: LBB16_15: ## %cond.load19
2089 ; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm7
2090 ; SSE42-NEXT: LBB16_16: ## %else20
2091 ; SSE42-NEXT: movdqa %xmm4, %xmm0
2092 ; SSE42-NEXT: movdqa %xmm5, %xmm1
2093 ; SSE42-NEXT: movdqa %xmm6, %xmm2
2094 ; SSE42-NEXT: movdqa %xmm7, %xmm3
2096 ; SSE42-NEXT: LBB16_1: ## %cond.load
2097 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4
2098 ; SSE42-NEXT: testb $2, %al
2099 ; SSE42-NEXT: je LBB16_4
2100 ; SSE42-NEXT: LBB16_3: ## %cond.load1
2101 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm4
2102 ; SSE42-NEXT: testb $4, %al
2103 ; SSE42-NEXT: je LBB16_6
2104 ; SSE42-NEXT: LBB16_5: ## %cond.load4
2105 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm5
2106 ; SSE42-NEXT: testb $8, %al
2107 ; SSE42-NEXT: je LBB16_8
2108 ; SSE42-NEXT: LBB16_7: ## %cond.load7
2109 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm5
2110 ; SSE42-NEXT: testb $16, %al
2111 ; SSE42-NEXT: je LBB16_10
2112 ; SSE42-NEXT: LBB16_9: ## %cond.load10
2113 ; SSE42-NEXT: pinsrq $0, 32(%rdi), %xmm6
2114 ; SSE42-NEXT: testb $32, %al
2115 ; SSE42-NEXT: je LBB16_12
2116 ; SSE42-NEXT: LBB16_11: ## %cond.load13
2117 ; SSE42-NEXT: pinsrq $1, 40(%rdi), %xmm6
2118 ; SSE42-NEXT: testb $64, %al
2119 ; SSE42-NEXT: je LBB16_14
2120 ; SSE42-NEXT: LBB16_13: ## %cond.load16
2121 ; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm7
2122 ; SSE42-NEXT: testb $-128, %al
2123 ; SSE42-NEXT: jne LBB16_15
2124 ; SSE42-NEXT: jmp LBB16_16
2126 ; AVX1-LABEL: load_v8i64_v8i64:
2128 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2129 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
2130 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
2131 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
2132 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2133 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
2134 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
2135 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
2136 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
2137 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
2138 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
2139 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2
2140 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
2143 ; AVX2-LABEL: load_v8i64_v8i64:
2145 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
2146 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
2147 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
2148 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
2149 ; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0
2150 ; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm2
2151 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
2154 ; AVX512-LABEL: load_v8i64_v8i64:
2156 ; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
2157 ; AVX512-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
2160 ; X86-AVX512-LABEL: load_v8i64_v8i64:
2161 ; X86-AVX512: ## %bb.0:
2162 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
2163 ; X86-AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
2164 ; X86-AVX512-NEXT: vpblendmq (%eax), %zmm1, %zmm0 {%k1}
2165 ; X86-AVX512-NEXT: retl
2166 %mask = icmp eq <8 x i64> %trigger, zeroinitializer
2167 %res = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
2175 define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
2176 ; SSE2-LABEL: load_v2i32_v2i32:
2178 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2179 ; SSE2-NEXT: pxor %xmm2, %xmm2
2180 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
2181 ; SSE2-NEXT: movmskpd %xmm2, %eax
2182 ; SSE2-NEXT: testb $1, %al
2183 ; SSE2-NEXT: jne LBB17_1
2184 ; SSE2-NEXT: ## %bb.2: ## %else
2185 ; SSE2-NEXT: testb $2, %al
2186 ; SSE2-NEXT: jne LBB17_3
2187 ; SSE2-NEXT: LBB17_4: ## %else2
2188 ; SSE2-NEXT: movaps %xmm1, %xmm0
2190 ; SSE2-NEXT: LBB17_1: ## %cond.load
2191 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2192 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2193 ; SSE2-NEXT: testb $2, %al
2194 ; SSE2-NEXT: je LBB17_4
2195 ; SSE2-NEXT: LBB17_3: ## %cond.load1
2196 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2197 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2198 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2199 ; SSE2-NEXT: movaps %xmm0, %xmm1
2200 ; SSE2-NEXT: movaps %xmm1, %xmm0
2203 ; SSE42-LABEL: load_v2i32_v2i32:
2205 ; SSE42-NEXT: pxor %xmm2, %xmm2
2206 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
2207 ; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
2208 ; SSE42-NEXT: movmskpd %xmm0, %eax
2209 ; SSE42-NEXT: testb $1, %al
2210 ; SSE42-NEXT: jne LBB17_1
2211 ; SSE42-NEXT: ## %bb.2: ## %else
2212 ; SSE42-NEXT: testb $2, %al
2213 ; SSE42-NEXT: jne LBB17_3
2214 ; SSE42-NEXT: LBB17_4: ## %else2
2215 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2217 ; SSE42-NEXT: LBB17_1: ## %cond.load
2218 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
2219 ; SSE42-NEXT: testb $2, %al
2220 ; SSE42-NEXT: je LBB17_4
2221 ; SSE42-NEXT: LBB17_3: ## %cond.load1
2222 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
2223 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2226 ; AVX1-LABEL: load_v2i32_v2i32:
2228 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2229 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2230 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2231 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
2232 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2235 ; AVX2-LABEL: load_v2i32_v2i32:
2237 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2238 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2239 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2240 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
2241 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2244 ; AVX512F-LABEL: load_v2i32_v2i32:
2245 ; AVX512F: ## %bb.0:
2246 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2247 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2248 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
2249 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
2250 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
2251 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2252 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
2253 ; AVX512F-NEXT: vzeroupper
2254 ; AVX512F-NEXT: retq
2256 ; AVX512VLDQ-LABEL: load_v2i32_v2i32:
2257 ; AVX512VLDQ: ## %bb.0:
2258 ; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
2259 ; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
2260 ; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
2261 ; AVX512VLDQ-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2262 ; AVX512VLDQ-NEXT: retq
2264 ; AVX512VLBW-LABEL: load_v2i32_v2i32:
2265 ; AVX512VLBW: ## %bb.0:
2266 ; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
2267 ; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
2268 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
2269 ; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2270 ; AVX512VLBW-NEXT: retq
2272 ; X86-AVX512-LABEL: load_v2i32_v2i32:
2273 ; X86-AVX512: ## %bb.0:
2274 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
2275 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
2276 ; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
2277 ; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
2278 ; X86-AVX512-NEXT: vpblendmd (%eax), %xmm1, %xmm0 {%k1}
2279 ; X86-AVX512-NEXT: retl
2280 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
2281 %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
2285 define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
2286 ; SSE2-LABEL: load_v4i32_v4i32:
2288 ; SSE2-NEXT: pxor %xmm2, %xmm2
2289 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
2290 ; SSE2-NEXT: movmskps %xmm2, %eax
2291 ; SSE2-NEXT: testb $1, %al
2292 ; SSE2-NEXT: jne LBB18_1
2293 ; SSE2-NEXT: ## %bb.2: ## %else
2294 ; SSE2-NEXT: testb $2, %al
2295 ; SSE2-NEXT: jne LBB18_3
2296 ; SSE2-NEXT: LBB18_4: ## %else2
2297 ; SSE2-NEXT: testb $4, %al
2298 ; SSE2-NEXT: jne LBB18_5
2299 ; SSE2-NEXT: LBB18_6: ## %else5
2300 ; SSE2-NEXT: testb $8, %al
2301 ; SSE2-NEXT: jne LBB18_7
2302 ; SSE2-NEXT: LBB18_8: ## %else8
2303 ; SSE2-NEXT: movaps %xmm1, %xmm0
2305 ; SSE2-NEXT: LBB18_1: ## %cond.load
2306 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2307 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2308 ; SSE2-NEXT: testb $2, %al
2309 ; SSE2-NEXT: je LBB18_4
2310 ; SSE2-NEXT: LBB18_3: ## %cond.load1
2311 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2312 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2313 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2314 ; SSE2-NEXT: movaps %xmm0, %xmm1
2315 ; SSE2-NEXT: testb $4, %al
2316 ; SSE2-NEXT: je LBB18_6
2317 ; SSE2-NEXT: LBB18_5: ## %cond.load4
2318 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2319 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
2320 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
2321 ; SSE2-NEXT: testb $8, %al
2322 ; SSE2-NEXT: je LBB18_8
2323 ; SSE2-NEXT: LBB18_7: ## %cond.load7
2324 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2325 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2326 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2327 ; SSE2-NEXT: movaps %xmm1, %xmm0
2330 ; SSE42-LABEL: load_v4i32_v4i32:
2332 ; SSE42-NEXT: pxor %xmm2, %xmm2
2333 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
2334 ; SSE42-NEXT: movmskps %xmm2, %eax
2335 ; SSE42-NEXT: testb $1, %al
2336 ; SSE42-NEXT: jne LBB18_1
2337 ; SSE42-NEXT: ## %bb.2: ## %else
2338 ; SSE42-NEXT: testb $2, %al
2339 ; SSE42-NEXT: jne LBB18_3
2340 ; SSE42-NEXT: LBB18_4: ## %else2
2341 ; SSE42-NEXT: testb $4, %al
2342 ; SSE42-NEXT: jne LBB18_5
2343 ; SSE42-NEXT: LBB18_6: ## %else5
2344 ; SSE42-NEXT: testb $8, %al
2345 ; SSE42-NEXT: jne LBB18_7
2346 ; SSE42-NEXT: LBB18_8: ## %else8
2347 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2349 ; SSE42-NEXT: LBB18_1: ## %cond.load
2350 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
2351 ; SSE42-NEXT: testb $2, %al
2352 ; SSE42-NEXT: je LBB18_4
2353 ; SSE42-NEXT: LBB18_3: ## %cond.load1
2354 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
2355 ; SSE42-NEXT: testb $4, %al
2356 ; SSE42-NEXT: je LBB18_6
2357 ; SSE42-NEXT: LBB18_5: ## %cond.load4
2358 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
2359 ; SSE42-NEXT: testb $8, %al
2360 ; SSE42-NEXT: je LBB18_8
2361 ; SSE42-NEXT: LBB18_7: ## %cond.load7
2362 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
2363 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2366 ; AVX1-LABEL: load_v4i32_v4i32:
2368 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2369 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2370 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
2371 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2374 ; AVX2-LABEL: load_v4i32_v4i32:
2376 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2377 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
2378 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
2379 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2382 ; AVX512F-LABEL: load_v4i32_v4i32:
2383 ; AVX512F: ## %bb.0:
2384 ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
2385 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
2386 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
2387 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
2388 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
2389 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2390 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
2391 ; AVX512F-NEXT: vzeroupper
2392 ; AVX512F-NEXT: retq
2394 ; AVX512VL-LABEL: load_v4i32_v4i32:
2395 ; AVX512VL: ## %bb.0:
2396 ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
2397 ; AVX512VL-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
2398 ; AVX512VL-NEXT: retq
2400 ; X86-AVX512-LABEL: load_v4i32_v4i32:
2401 ; X86-AVX512: ## %bb.0:
2402 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
2403 ; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
2404 ; X86-AVX512-NEXT: vpblendmd (%eax), %xmm1, %xmm0 {%k1}
2405 ; X86-AVX512-NEXT: retl
2406 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
2407 %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
2411 define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, ptr %addr, <8 x i32> %dst) {
2412 ; SSE2-LABEL: load_v8i32_v8i1:
2414 ; SSE2-NEXT: psllw $15, %xmm0
2415 ; SSE2-NEXT: packsswb %xmm0, %xmm0
2416 ; SSE2-NEXT: pmovmskb %xmm0, %eax
2417 ; SSE2-NEXT: testb $1, %al
2418 ; SSE2-NEXT: jne LBB19_1
2419 ; SSE2-NEXT: ## %bb.2: ## %else
2420 ; SSE2-NEXT: testb $2, %al
2421 ; SSE2-NEXT: jne LBB19_3
2422 ; SSE2-NEXT: LBB19_4: ## %else2
2423 ; SSE2-NEXT: testb $4, %al
2424 ; SSE2-NEXT: jne LBB19_5
2425 ; SSE2-NEXT: LBB19_6: ## %else5
2426 ; SSE2-NEXT: testb $8, %al
2427 ; SSE2-NEXT: jne LBB19_7
2428 ; SSE2-NEXT: LBB19_8: ## %else8
2429 ; SSE2-NEXT: testb $16, %al
2430 ; SSE2-NEXT: jne LBB19_9
2431 ; SSE2-NEXT: LBB19_10: ## %else11
2432 ; SSE2-NEXT: testb $32, %al
2433 ; SSE2-NEXT: jne LBB19_11
2434 ; SSE2-NEXT: LBB19_12: ## %else14
2435 ; SSE2-NEXT: testb $64, %al
2436 ; SSE2-NEXT: jne LBB19_13
2437 ; SSE2-NEXT: LBB19_14: ## %else17
2438 ; SSE2-NEXT: testb $-128, %al
2439 ; SSE2-NEXT: je LBB19_16
2440 ; SSE2-NEXT: LBB19_15: ## %cond.load19
2441 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2442 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2443 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
2444 ; SSE2-NEXT: LBB19_16: ## %else20
2445 ; SSE2-NEXT: movaps %xmm1, %xmm0
2446 ; SSE2-NEXT: movaps %xmm2, %xmm1
2448 ; SSE2-NEXT: LBB19_1: ## %cond.load
2449 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2450 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2451 ; SSE2-NEXT: testb $2, %al
2452 ; SSE2-NEXT: je LBB19_4
2453 ; SSE2-NEXT: LBB19_3: ## %cond.load1
2454 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2455 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2456 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2457 ; SSE2-NEXT: movaps %xmm0, %xmm1
2458 ; SSE2-NEXT: testb $4, %al
2459 ; SSE2-NEXT: je LBB19_6
2460 ; SSE2-NEXT: LBB19_5: ## %cond.load4
2461 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2462 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
2463 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
2464 ; SSE2-NEXT: testb $8, %al
2465 ; SSE2-NEXT: je LBB19_8
2466 ; SSE2-NEXT: LBB19_7: ## %cond.load7
2467 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2468 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2469 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2470 ; SSE2-NEXT: testb $16, %al
2471 ; SSE2-NEXT: je LBB19_10
2472 ; SSE2-NEXT: LBB19_9: ## %cond.load10
2473 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2474 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2475 ; SSE2-NEXT: testb $32, %al
2476 ; SSE2-NEXT: je LBB19_12
2477 ; SSE2-NEXT: LBB19_11: ## %cond.load13
2478 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2479 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2480 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
2481 ; SSE2-NEXT: movaps %xmm0, %xmm2
2482 ; SSE2-NEXT: testb $64, %al
2483 ; SSE2-NEXT: je LBB19_14
2484 ; SSE2-NEXT: LBB19_13: ## %cond.load16
2485 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2486 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[3,0]
2487 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
2488 ; SSE2-NEXT: testb $-128, %al
2489 ; SSE2-NEXT: jne LBB19_15
2490 ; SSE2-NEXT: jmp LBB19_16
2492 ; SSE42-LABEL: load_v8i32_v8i1:
2494 ; SSE42-NEXT: psllw $15, %xmm0
2495 ; SSE42-NEXT: packsswb %xmm0, %xmm0
2496 ; SSE42-NEXT: pmovmskb %xmm0, %eax
2497 ; SSE42-NEXT: testb $1, %al
2498 ; SSE42-NEXT: jne LBB19_1
2499 ; SSE42-NEXT: ## %bb.2: ## %else
2500 ; SSE42-NEXT: testb $2, %al
2501 ; SSE42-NEXT: jne LBB19_3
2502 ; SSE42-NEXT: LBB19_4: ## %else2
2503 ; SSE42-NEXT: testb $4, %al
2504 ; SSE42-NEXT: jne LBB19_5
2505 ; SSE42-NEXT: LBB19_6: ## %else5
2506 ; SSE42-NEXT: testb $8, %al
2507 ; SSE42-NEXT: jne LBB19_7
2508 ; SSE42-NEXT: LBB19_8: ## %else8
2509 ; SSE42-NEXT: testb $16, %al
2510 ; SSE42-NEXT: jne LBB19_9
2511 ; SSE42-NEXT: LBB19_10: ## %else11
2512 ; SSE42-NEXT: testb $32, %al
2513 ; SSE42-NEXT: jne LBB19_11
2514 ; SSE42-NEXT: LBB19_12: ## %else14
2515 ; SSE42-NEXT: testb $64, %al
2516 ; SSE42-NEXT: jne LBB19_13
2517 ; SSE42-NEXT: LBB19_14: ## %else17
2518 ; SSE42-NEXT: testb $-128, %al
2519 ; SSE42-NEXT: je LBB19_16
2520 ; SSE42-NEXT: LBB19_15: ## %cond.load19
2521 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm2
2522 ; SSE42-NEXT: LBB19_16: ## %else20
2523 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2524 ; SSE42-NEXT: movdqa %xmm2, %xmm1
2526 ; SSE42-NEXT: LBB19_1: ## %cond.load
2527 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
2528 ; SSE42-NEXT: testb $2, %al
2529 ; SSE42-NEXT: je LBB19_4
2530 ; SSE42-NEXT: LBB19_3: ## %cond.load1
2531 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
2532 ; SSE42-NEXT: testb $4, %al
2533 ; SSE42-NEXT: je LBB19_6
2534 ; SSE42-NEXT: LBB19_5: ## %cond.load4
2535 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
2536 ; SSE42-NEXT: testb $8, %al
2537 ; SSE42-NEXT: je LBB19_8
2538 ; SSE42-NEXT: LBB19_7: ## %cond.load7
2539 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
2540 ; SSE42-NEXT: testb $16, %al
2541 ; SSE42-NEXT: je LBB19_10
2542 ; SSE42-NEXT: LBB19_9: ## %cond.load10
2543 ; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm2
2544 ; SSE42-NEXT: testb $32, %al
2545 ; SSE42-NEXT: je LBB19_12
2546 ; SSE42-NEXT: LBB19_11: ## %cond.load13
2547 ; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm2
2548 ; SSE42-NEXT: testb $64, %al
2549 ; SSE42-NEXT: je LBB19_14
2550 ; SSE42-NEXT: LBB19_13: ## %cond.load16
2551 ; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm2
2552 ; SSE42-NEXT: testb $-128, %al
2553 ; SSE42-NEXT: jne LBB19_15
2554 ; SSE42-NEXT: jmp LBB19_16
2556 ; AVX1-LABEL: load_v8i32_v8i1:
2558 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2559 ; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
2560 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2561 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
2562 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2563 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
2564 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
2567 ; AVX2-LABEL: load_v8i32_v8i1:
2569 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2570 ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
2571 ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
2572 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
2575 ; AVX512F-LABEL: load_v8i32_v8i1:
2576 ; AVX512F: ## %bb.0:
2577 ; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
2578 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
2579 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
2580 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
2581 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
2582 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2583 ; AVX512F-NEXT: retq
2585 ; AVX512VLDQ-LABEL: load_v8i32_v8i1:
2586 ; AVX512VLDQ: ## %bb.0:
2587 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
2588 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
2589 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
2590 ; AVX512VLDQ-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
2591 ; AVX512VLDQ-NEXT: retq
2593 ; AVX512VLBW-LABEL: load_v8i32_v8i1:
2594 ; AVX512VLBW: ## %bb.0:
2595 ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
2596 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
2597 ; AVX512VLBW-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
2598 ; AVX512VLBW-NEXT: retq
2600 ; X86-AVX512-LABEL: load_v8i32_v8i1:
2601 ; X86-AVX512: ## %bb.0:
2602 ; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0
2603 ; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
2604 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
2605 ; X86-AVX512-NEXT: vpblendmd (%eax), %ymm1, %ymm0 {%k1}
2606 ; X86-AVX512-NEXT: retl
2607 %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
2611 define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
2612 ; SSE2-LABEL: load_v8i32_v8i1_zero:
2614 ; SSE2-NEXT: psllw $15, %xmm0
2615 ; SSE2-NEXT: packsswb %xmm0, %xmm0
2616 ; SSE2-NEXT: pmovmskb %xmm0, %eax
2617 ; SSE2-NEXT: pxor %xmm0, %xmm0
2618 ; SSE2-NEXT: testb $1, %al
2619 ; SSE2-NEXT: xorps %xmm1, %xmm1
2620 ; SSE2-NEXT: jne LBB20_1
2621 ; SSE2-NEXT: ## %bb.2: ## %else
2622 ; SSE2-NEXT: testb $2, %al
2623 ; SSE2-NEXT: jne LBB20_3
2624 ; SSE2-NEXT: LBB20_4: ## %else2
2625 ; SSE2-NEXT: testb $4, %al
2626 ; SSE2-NEXT: jne LBB20_5
2627 ; SSE2-NEXT: LBB20_6: ## %else5
2628 ; SSE2-NEXT: testb $8, %al
2629 ; SSE2-NEXT: jne LBB20_7
2630 ; SSE2-NEXT: LBB20_8: ## %else8
2631 ; SSE2-NEXT: testb $16, %al
2632 ; SSE2-NEXT: jne LBB20_9
2633 ; SSE2-NEXT: LBB20_10: ## %else11
2634 ; SSE2-NEXT: testb $32, %al
2635 ; SSE2-NEXT: jne LBB20_11
2636 ; SSE2-NEXT: LBB20_12: ## %else14
2637 ; SSE2-NEXT: testb $64, %al
2638 ; SSE2-NEXT: jne LBB20_13
2639 ; SSE2-NEXT: LBB20_14: ## %else17
2640 ; SSE2-NEXT: testb $-128, %al
2641 ; SSE2-NEXT: jne LBB20_15
2642 ; SSE2-NEXT: LBB20_16: ## %else20
2644 ; SSE2-NEXT: LBB20_1: ## %cond.load
2645 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2646 ; SSE2-NEXT: testb $2, %al
2647 ; SSE2-NEXT: je LBB20_4
2648 ; SSE2-NEXT: LBB20_3: ## %cond.load1
2649 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2650 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2651 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
2652 ; SSE2-NEXT: movaps %xmm2, %xmm0
2653 ; SSE2-NEXT: testb $4, %al
2654 ; SSE2-NEXT: je LBB20_6
2655 ; SSE2-NEXT: LBB20_5: ## %cond.load4
2656 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2657 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
2658 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
2659 ; SSE2-NEXT: testb $8, %al
2660 ; SSE2-NEXT: je LBB20_8
2661 ; SSE2-NEXT: LBB20_7: ## %cond.load7
2662 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2663 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
2664 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
2665 ; SSE2-NEXT: testb $16, %al
2666 ; SSE2-NEXT: je LBB20_10
2667 ; SSE2-NEXT: LBB20_9: ## %cond.load10
2668 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2669 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
2670 ; SSE2-NEXT: testb $32, %al
2671 ; SSE2-NEXT: je LBB20_12
2672 ; SSE2-NEXT: LBB20_11: ## %cond.load13
2673 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2674 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2675 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
2676 ; SSE2-NEXT: movaps %xmm2, %xmm1
2677 ; SSE2-NEXT: testb $64, %al
2678 ; SSE2-NEXT: je LBB20_14
2679 ; SSE2-NEXT: LBB20_13: ## %cond.load16
2680 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2681 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
2682 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
2683 ; SSE2-NEXT: testb $-128, %al
2684 ; SSE2-NEXT: je LBB20_16
2685 ; SSE2-NEXT: LBB20_15: ## %cond.load19
2686 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2687 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
2688 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
2691 ; SSE42-LABEL: load_v8i32_v8i1_zero:
2693 ; SSE42-NEXT: psllw $15, %xmm0
2694 ; SSE42-NEXT: packsswb %xmm0, %xmm0
2695 ; SSE42-NEXT: pmovmskb %xmm0, %eax
2696 ; SSE42-NEXT: pxor %xmm0, %xmm0
2697 ; SSE42-NEXT: testb $1, %al
2698 ; SSE42-NEXT: pxor %xmm1, %xmm1
2699 ; SSE42-NEXT: jne LBB20_1
2700 ; SSE42-NEXT: ## %bb.2: ## %else
2701 ; SSE42-NEXT: testb $2, %al
2702 ; SSE42-NEXT: jne LBB20_3
2703 ; SSE42-NEXT: LBB20_4: ## %else2
2704 ; SSE42-NEXT: testb $4, %al
2705 ; SSE42-NEXT: jne LBB20_5
2706 ; SSE42-NEXT: LBB20_6: ## %else5
2707 ; SSE42-NEXT: testb $8, %al
2708 ; SSE42-NEXT: jne LBB20_7
2709 ; SSE42-NEXT: LBB20_8: ## %else8
2710 ; SSE42-NEXT: testb $16, %al
2711 ; SSE42-NEXT: jne LBB20_9
2712 ; SSE42-NEXT: LBB20_10: ## %else11
2713 ; SSE42-NEXT: testb $32, %al
2714 ; SSE42-NEXT: jne LBB20_11
2715 ; SSE42-NEXT: LBB20_12: ## %else14
2716 ; SSE42-NEXT: testb $64, %al
2717 ; SSE42-NEXT: jne LBB20_13
2718 ; SSE42-NEXT: LBB20_14: ## %else17
2719 ; SSE42-NEXT: testb $-128, %al
2720 ; SSE42-NEXT: jne LBB20_15
2721 ; SSE42-NEXT: LBB20_16: ## %else20
2723 ; SSE42-NEXT: LBB20_1: ## %cond.load
2724 ; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2725 ; SSE42-NEXT: testb $2, %al
2726 ; SSE42-NEXT: je LBB20_4
2727 ; SSE42-NEXT: LBB20_3: ## %cond.load1
2728 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
2729 ; SSE42-NEXT: testb $4, %al
2730 ; SSE42-NEXT: je LBB20_6
2731 ; SSE42-NEXT: LBB20_5: ## %cond.load4
2732 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
2733 ; SSE42-NEXT: testb $8, %al
2734 ; SSE42-NEXT: je LBB20_8
2735 ; SSE42-NEXT: LBB20_7: ## %cond.load7
2736 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
2737 ; SSE42-NEXT: testb $16, %al
2738 ; SSE42-NEXT: je LBB20_10
2739 ; SSE42-NEXT: LBB20_9: ## %cond.load10
2740 ; SSE42-NEXT: pinsrd $0, 16(%rdi), %xmm1
2741 ; SSE42-NEXT: testb $32, %al
2742 ; SSE42-NEXT: je LBB20_12
2743 ; SSE42-NEXT: LBB20_11: ## %cond.load13
2744 ; SSE42-NEXT: pinsrd $1, 20(%rdi), %xmm1
2745 ; SSE42-NEXT: testb $64, %al
2746 ; SSE42-NEXT: je LBB20_14
2747 ; SSE42-NEXT: LBB20_13: ## %cond.load16
2748 ; SSE42-NEXT: pinsrd $2, 24(%rdi), %xmm1
2749 ; SSE42-NEXT: testb $-128, %al
2750 ; SSE42-NEXT: je LBB20_16
2751 ; SSE42-NEXT: LBB20_15: ## %cond.load19
2752 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1
2755 ; AVX1-LABEL: load_v8i32_v8i1_zero:
2757 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2758 ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
2759 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
2760 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
2761 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2762 ; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
2765 ; AVX2-LABEL: load_v8i32_v8i1_zero:
2767 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2768 ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
2769 ; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
2772 ; AVX512F-LABEL: load_v8i32_v8i1_zero:
2773 ; AVX512F: ## %bb.0:
2774 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
2775 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
2776 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
2777 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
2778 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
2779 ; AVX512F-NEXT: retq
2781 ; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero:
2782 ; AVX512VLDQ: ## %bb.0:
2783 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
2784 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
2785 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1
2786 ; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
2787 ; AVX512VLDQ-NEXT: retq
2789 ; AVX512VLBW-LABEL: load_v8i32_v8i1_zero:
2790 ; AVX512VLBW: ## %bb.0:
2791 ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0
2792 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
2793 ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
2794 ; AVX512VLBW-NEXT: retq
2796 ; X86-AVX512-LABEL: load_v8i32_v8i1_zero:
2797 ; X86-AVX512: ## %bb.0:
2798 ; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0
2799 ; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
2800 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
2801 ; X86-AVX512-NEXT: vmovdqu32 (%eax), %ymm0 {%k1} {z}
2802 ; X86-AVX512-NEXT: retl
2803 %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
2811 define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %dst) {
2812 ; SSE-LABEL: load_v8i16_v8i16:
2814 ; SSE-NEXT: packsswb %xmm0, %xmm0
2815 ; SSE-NEXT: pmovmskb %xmm0, %eax
2816 ; SSE-NEXT: testb $1, %al
2817 ; SSE-NEXT: jne LBB21_1
2818 ; SSE-NEXT: ## %bb.2: ## %else
2819 ; SSE-NEXT: testb $2, %al
2820 ; SSE-NEXT: jne LBB21_3
2821 ; SSE-NEXT: LBB21_4: ## %else2
2822 ; SSE-NEXT: testb $4, %al
2823 ; SSE-NEXT: jne LBB21_5
2824 ; SSE-NEXT: LBB21_6: ## %else5
2825 ; SSE-NEXT: testb $8, %al
2826 ; SSE-NEXT: jne LBB21_7
2827 ; SSE-NEXT: LBB21_8: ## %else8
2828 ; SSE-NEXT: testb $16, %al
2829 ; SSE-NEXT: jne LBB21_9
2830 ; SSE-NEXT: LBB21_10: ## %else11
2831 ; SSE-NEXT: testb $32, %al
2832 ; SSE-NEXT: jne LBB21_11
2833 ; SSE-NEXT: LBB21_12: ## %else14
2834 ; SSE-NEXT: testb $64, %al
2835 ; SSE-NEXT: jne LBB21_13
2836 ; SSE-NEXT: LBB21_14: ## %else17
2837 ; SSE-NEXT: testb $-128, %al
2838 ; SSE-NEXT: jne LBB21_15
2839 ; SSE-NEXT: LBB21_16: ## %else20
2840 ; SSE-NEXT: movdqa %xmm1, %xmm0
2842 ; SSE-NEXT: LBB21_1: ## %cond.load
2843 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm1
2844 ; SSE-NEXT: testb $2, %al
2845 ; SSE-NEXT: je LBB21_4
2846 ; SSE-NEXT: LBB21_3: ## %cond.load1
2847 ; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm1
2848 ; SSE-NEXT: testb $4, %al
2849 ; SSE-NEXT: je LBB21_6
2850 ; SSE-NEXT: LBB21_5: ## %cond.load4
2851 ; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm1
2852 ; SSE-NEXT: testb $8, %al
2853 ; SSE-NEXT: je LBB21_8
2854 ; SSE-NEXT: LBB21_7: ## %cond.load7
2855 ; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm1
2856 ; SSE-NEXT: testb $16, %al
2857 ; SSE-NEXT: je LBB21_10
2858 ; SSE-NEXT: LBB21_9: ## %cond.load10
2859 ; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm1
2860 ; SSE-NEXT: testb $32, %al
2861 ; SSE-NEXT: je LBB21_12
2862 ; SSE-NEXT: LBB21_11: ## %cond.load13
2863 ; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm1
2864 ; SSE-NEXT: testb $64, %al
2865 ; SSE-NEXT: je LBB21_14
2866 ; SSE-NEXT: LBB21_13: ## %cond.load16
2867 ; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm1
2868 ; SSE-NEXT: testb $-128, %al
2869 ; SSE-NEXT: je LBB21_16
2870 ; SSE-NEXT: LBB21_15: ## %cond.load19
2871 ; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm1
2872 ; SSE-NEXT: movdqa %xmm1, %xmm0
2875 ; AVX1OR2-LABEL: load_v8i16_v8i16:
2876 ; AVX1OR2: ## %bb.0:
2877 ; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
2878 ; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
2879 ; AVX1OR2-NEXT: testb $1, %al
2880 ; AVX1OR2-NEXT: jne LBB21_1
2881 ; AVX1OR2-NEXT: ## %bb.2: ## %else
2882 ; AVX1OR2-NEXT: testb $2, %al
2883 ; AVX1OR2-NEXT: jne LBB21_3
2884 ; AVX1OR2-NEXT: LBB21_4: ## %else2
2885 ; AVX1OR2-NEXT: testb $4, %al
2886 ; AVX1OR2-NEXT: jne LBB21_5
2887 ; AVX1OR2-NEXT: LBB21_6: ## %else5
2888 ; AVX1OR2-NEXT: testb $8, %al
2889 ; AVX1OR2-NEXT: jne LBB21_7
2890 ; AVX1OR2-NEXT: LBB21_8: ## %else8
2891 ; AVX1OR2-NEXT: testb $16, %al
2892 ; AVX1OR2-NEXT: jne LBB21_9
2893 ; AVX1OR2-NEXT: LBB21_10: ## %else11
2894 ; AVX1OR2-NEXT: testb $32, %al
2895 ; AVX1OR2-NEXT: jne LBB21_11
2896 ; AVX1OR2-NEXT: LBB21_12: ## %else14
2897 ; AVX1OR2-NEXT: testb $64, %al
2898 ; AVX1OR2-NEXT: jne LBB21_13
2899 ; AVX1OR2-NEXT: LBB21_14: ## %else17
2900 ; AVX1OR2-NEXT: testb $-128, %al
2901 ; AVX1OR2-NEXT: jne LBB21_15
2902 ; AVX1OR2-NEXT: LBB21_16: ## %else20
2903 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
2904 ; AVX1OR2-NEXT: retq
2905 ; AVX1OR2-NEXT: LBB21_1: ## %cond.load
2906 ; AVX1OR2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
2907 ; AVX1OR2-NEXT: testb $2, %al
2908 ; AVX1OR2-NEXT: je LBB21_4
2909 ; AVX1OR2-NEXT: LBB21_3: ## %cond.load1
2910 ; AVX1OR2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2911 ; AVX1OR2-NEXT: testb $4, %al
2912 ; AVX1OR2-NEXT: je LBB21_6
2913 ; AVX1OR2-NEXT: LBB21_5: ## %cond.load4
2914 ; AVX1OR2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2915 ; AVX1OR2-NEXT: testb $8, %al
2916 ; AVX1OR2-NEXT: je LBB21_8
2917 ; AVX1OR2-NEXT: LBB21_7: ## %cond.load7
2918 ; AVX1OR2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2919 ; AVX1OR2-NEXT: testb $16, %al
2920 ; AVX1OR2-NEXT: je LBB21_10
2921 ; AVX1OR2-NEXT: LBB21_9: ## %cond.load10
2922 ; AVX1OR2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2923 ; AVX1OR2-NEXT: testb $32, %al
2924 ; AVX1OR2-NEXT: je LBB21_12
2925 ; AVX1OR2-NEXT: LBB21_11: ## %cond.load13
2926 ; AVX1OR2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2927 ; AVX1OR2-NEXT: testb $64, %al
2928 ; AVX1OR2-NEXT: je LBB21_14
2929 ; AVX1OR2-NEXT: LBB21_13: ## %cond.load16
2930 ; AVX1OR2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2931 ; AVX1OR2-NEXT: testb $-128, %al
2932 ; AVX1OR2-NEXT: je LBB21_16
2933 ; AVX1OR2-NEXT: LBB21_15: ## %cond.load19
2934 ; AVX1OR2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
2935 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
2936 ; AVX1OR2-NEXT: retq
2938 ; AVX512F-LABEL: load_v8i16_v8i16:
2939 ; AVX512F: ## %bb.0:
2940 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
2941 ; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
2942 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
2943 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
2944 ; AVX512F-NEXT: kmovw %k0, %eax
2945 ; AVX512F-NEXT: testb $1, %al
2946 ; AVX512F-NEXT: jne LBB21_1
2947 ; AVX512F-NEXT: ## %bb.2: ## %else
2948 ; AVX512F-NEXT: testb $2, %al
2949 ; AVX512F-NEXT: jne LBB21_3
2950 ; AVX512F-NEXT: LBB21_4: ## %else2
2951 ; AVX512F-NEXT: testb $4, %al
2952 ; AVX512F-NEXT: jne LBB21_5
2953 ; AVX512F-NEXT: LBB21_6: ## %else5
2954 ; AVX512F-NEXT: testb $8, %al
2955 ; AVX512F-NEXT: jne LBB21_7
2956 ; AVX512F-NEXT: LBB21_8: ## %else8
2957 ; AVX512F-NEXT: testb $16, %al
2958 ; AVX512F-NEXT: jne LBB21_9
2959 ; AVX512F-NEXT: LBB21_10: ## %else11
2960 ; AVX512F-NEXT: testb $32, %al
2961 ; AVX512F-NEXT: jne LBB21_11
2962 ; AVX512F-NEXT: LBB21_12: ## %else14
2963 ; AVX512F-NEXT: testb $64, %al
2964 ; AVX512F-NEXT: jne LBB21_13
2965 ; AVX512F-NEXT: LBB21_14: ## %else17
2966 ; AVX512F-NEXT: testb $-128, %al
2967 ; AVX512F-NEXT: jne LBB21_15
2968 ; AVX512F-NEXT: LBB21_16: ## %else20
2969 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
2970 ; AVX512F-NEXT: vzeroupper
2971 ; AVX512F-NEXT: retq
2972 ; AVX512F-NEXT: LBB21_1: ## %cond.load
2973 ; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
2974 ; AVX512F-NEXT: testb $2, %al
2975 ; AVX512F-NEXT: je LBB21_4
2976 ; AVX512F-NEXT: LBB21_3: ## %cond.load1
2977 ; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
2978 ; AVX512F-NEXT: testb $4, %al
2979 ; AVX512F-NEXT: je LBB21_6
2980 ; AVX512F-NEXT: LBB21_5: ## %cond.load4
2981 ; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
2982 ; AVX512F-NEXT: testb $8, %al
2983 ; AVX512F-NEXT: je LBB21_8
2984 ; AVX512F-NEXT: LBB21_7: ## %cond.load7
2985 ; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
2986 ; AVX512F-NEXT: testb $16, %al
2987 ; AVX512F-NEXT: je LBB21_10
2988 ; AVX512F-NEXT: LBB21_9: ## %cond.load10
2989 ; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
2990 ; AVX512F-NEXT: testb $32, %al
2991 ; AVX512F-NEXT: je LBB21_12
2992 ; AVX512F-NEXT: LBB21_11: ## %cond.load13
2993 ; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
2994 ; AVX512F-NEXT: testb $64, %al
2995 ; AVX512F-NEXT: je LBB21_14
2996 ; AVX512F-NEXT: LBB21_13: ## %cond.load16
2997 ; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
2998 ; AVX512F-NEXT: testb $-128, %al
2999 ; AVX512F-NEXT: je LBB21_16
3000 ; AVX512F-NEXT: LBB21_15: ## %cond.load19
3001 ; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
3002 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
3003 ; AVX512F-NEXT: vzeroupper
3004 ; AVX512F-NEXT: retq
3006 ; AVX512VLDQ-LABEL: load_v8i16_v8i16:
3007 ; AVX512VLDQ: ## %bb.0:
3008 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
3009 ; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
3010 ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
3011 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0
3012 ; AVX512VLDQ-NEXT: kmovw %k0, %eax
3013 ; AVX512VLDQ-NEXT: testb $1, %al
3014 ; AVX512VLDQ-NEXT: jne LBB21_1
3015 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
3016 ; AVX512VLDQ-NEXT: testb $2, %al
3017 ; AVX512VLDQ-NEXT: jne LBB21_3
3018 ; AVX512VLDQ-NEXT: LBB21_4: ## %else2
3019 ; AVX512VLDQ-NEXT: testb $4, %al
3020 ; AVX512VLDQ-NEXT: jne LBB21_5
3021 ; AVX512VLDQ-NEXT: LBB21_6: ## %else5
3022 ; AVX512VLDQ-NEXT: testb $8, %al
3023 ; AVX512VLDQ-NEXT: jne LBB21_7
3024 ; AVX512VLDQ-NEXT: LBB21_8: ## %else8
3025 ; AVX512VLDQ-NEXT: testb $16, %al
3026 ; AVX512VLDQ-NEXT: jne LBB21_9
3027 ; AVX512VLDQ-NEXT: LBB21_10: ## %else11
3028 ; AVX512VLDQ-NEXT: testb $32, %al
3029 ; AVX512VLDQ-NEXT: jne LBB21_11
3030 ; AVX512VLDQ-NEXT: LBB21_12: ## %else14
3031 ; AVX512VLDQ-NEXT: testb $64, %al
3032 ; AVX512VLDQ-NEXT: jne LBB21_13
3033 ; AVX512VLDQ-NEXT: LBB21_14: ## %else17
3034 ; AVX512VLDQ-NEXT: testb $-128, %al
3035 ; AVX512VLDQ-NEXT: jne LBB21_15
3036 ; AVX512VLDQ-NEXT: LBB21_16: ## %else20
3037 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
3038 ; AVX512VLDQ-NEXT: vzeroupper
3039 ; AVX512VLDQ-NEXT: retq
3040 ; AVX512VLDQ-NEXT: LBB21_1: ## %cond.load
3041 ; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1
3042 ; AVX512VLDQ-NEXT: testb $2, %al
3043 ; AVX512VLDQ-NEXT: je LBB21_4
3044 ; AVX512VLDQ-NEXT: LBB21_3: ## %cond.load1
3045 ; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
3046 ; AVX512VLDQ-NEXT: testb $4, %al
3047 ; AVX512VLDQ-NEXT: je LBB21_6
3048 ; AVX512VLDQ-NEXT: LBB21_5: ## %cond.load4
3049 ; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
3050 ; AVX512VLDQ-NEXT: testb $8, %al
3051 ; AVX512VLDQ-NEXT: je LBB21_8
3052 ; AVX512VLDQ-NEXT: LBB21_7: ## %cond.load7
3053 ; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
3054 ; AVX512VLDQ-NEXT: testb $16, %al
3055 ; AVX512VLDQ-NEXT: je LBB21_10
3056 ; AVX512VLDQ-NEXT: LBB21_9: ## %cond.load10
3057 ; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
3058 ; AVX512VLDQ-NEXT: testb $32, %al
3059 ; AVX512VLDQ-NEXT: je LBB21_12
3060 ; AVX512VLDQ-NEXT: LBB21_11: ## %cond.load13
3061 ; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
3062 ; AVX512VLDQ-NEXT: testb $64, %al
3063 ; AVX512VLDQ-NEXT: je LBB21_14
3064 ; AVX512VLDQ-NEXT: LBB21_13: ## %cond.load16
3065 ; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
3066 ; AVX512VLDQ-NEXT: testb $-128, %al
3067 ; AVX512VLDQ-NEXT: je LBB21_16
3068 ; AVX512VLDQ-NEXT: LBB21_15: ## %cond.load19
3069 ; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
3070 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
3071 ; AVX512VLDQ-NEXT: vzeroupper
3072 ; AVX512VLDQ-NEXT: retq
3074 ; AVX512VLBW-LABEL: load_v8i16_v8i16:
3075 ; AVX512VLBW: ## %bb.0:
3076 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
3077 ; AVX512VLBW-NEXT: vpblendmw (%rdi), %xmm1, %xmm0 {%k1}
3078 ; AVX512VLBW-NEXT: retq
3080 ; X86-AVX512-LABEL: load_v8i16_v8i16:
3081 ; X86-AVX512: ## %bb.0:
3082 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
3083 ; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
3084 ; X86-AVX512-NEXT: vpblendmw (%eax), %xmm1, %xmm0 {%k1}
3085 ; X86-AVX512-NEXT: retl
3086 %mask = icmp slt <8 x i16> %trigger, zeroinitializer
3087 %res = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x i16> %dst)
3091 define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %dst) {
3092 ; SSE-LABEL: load_v16i16_v16i16:
3094 ; SSE-NEXT: packsswb %xmm1, %xmm0
3095 ; SSE-NEXT: pmovmskb %xmm0, %eax
3096 ; SSE-NEXT: testb $1, %al
3097 ; SSE-NEXT: jne LBB22_1
3098 ; SSE-NEXT: ## %bb.2: ## %else
3099 ; SSE-NEXT: testb $2, %al
3100 ; SSE-NEXT: jne LBB22_3
3101 ; SSE-NEXT: LBB22_4: ## %else2
3102 ; SSE-NEXT: testb $4, %al
3103 ; SSE-NEXT: jne LBB22_5
3104 ; SSE-NEXT: LBB22_6: ## %else5
3105 ; SSE-NEXT: testb $8, %al
3106 ; SSE-NEXT: jne LBB22_7
3107 ; SSE-NEXT: LBB22_8: ## %else8
3108 ; SSE-NEXT: testb $16, %al
3109 ; SSE-NEXT: jne LBB22_9
3110 ; SSE-NEXT: LBB22_10: ## %else11
3111 ; SSE-NEXT: testb $32, %al
3112 ; SSE-NEXT: jne LBB22_11
3113 ; SSE-NEXT: LBB22_12: ## %else14
3114 ; SSE-NEXT: testb $64, %al
3115 ; SSE-NEXT: jne LBB22_13
3116 ; SSE-NEXT: LBB22_14: ## %else17
3117 ; SSE-NEXT: testb %al, %al
3118 ; SSE-NEXT: js LBB22_15
3119 ; SSE-NEXT: LBB22_16: ## %else20
3120 ; SSE-NEXT: testl $256, %eax ## imm = 0x100
3121 ; SSE-NEXT: jne LBB22_17
3122 ; SSE-NEXT: LBB22_18: ## %else23
3123 ; SSE-NEXT: testl $512, %eax ## imm = 0x200
3124 ; SSE-NEXT: jne LBB22_19
3125 ; SSE-NEXT: LBB22_20: ## %else26
3126 ; SSE-NEXT: testl $1024, %eax ## imm = 0x400
3127 ; SSE-NEXT: jne LBB22_21
3128 ; SSE-NEXT: LBB22_22: ## %else29
3129 ; SSE-NEXT: testl $2048, %eax ## imm = 0x800
3130 ; SSE-NEXT: jne LBB22_23
3131 ; SSE-NEXT: LBB22_24: ## %else32
3132 ; SSE-NEXT: testl $4096, %eax ## imm = 0x1000
3133 ; SSE-NEXT: jne LBB22_25
3134 ; SSE-NEXT: LBB22_26: ## %else35
3135 ; SSE-NEXT: testl $8192, %eax ## imm = 0x2000
3136 ; SSE-NEXT: jne LBB22_27
3137 ; SSE-NEXT: LBB22_28: ## %else38
3138 ; SSE-NEXT: testl $16384, %eax ## imm = 0x4000
3139 ; SSE-NEXT: jne LBB22_29
3140 ; SSE-NEXT: LBB22_30: ## %else41
3141 ; SSE-NEXT: testl $32768, %eax ## imm = 0x8000
3142 ; SSE-NEXT: je LBB22_32
3143 ; SSE-NEXT: LBB22_31: ## %cond.load43
3144 ; SSE-NEXT: pinsrw $7, 30(%rdi), %xmm3
3145 ; SSE-NEXT: LBB22_32: ## %else44
3146 ; SSE-NEXT: movdqa %xmm2, %xmm0
3147 ; SSE-NEXT: movdqa %xmm3, %xmm1
3149 ; SSE-NEXT: LBB22_1: ## %cond.load
3150 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm2
3151 ; SSE-NEXT: testb $2, %al
3152 ; SSE-NEXT: je LBB22_4
3153 ; SSE-NEXT: LBB22_3: ## %cond.load1
3154 ; SSE-NEXT: pinsrw $1, 2(%rdi), %xmm2
3155 ; SSE-NEXT: testb $4, %al
3156 ; SSE-NEXT: je LBB22_6
3157 ; SSE-NEXT: LBB22_5: ## %cond.load4
3158 ; SSE-NEXT: pinsrw $2, 4(%rdi), %xmm2
3159 ; SSE-NEXT: testb $8, %al
3160 ; SSE-NEXT: je LBB22_8
3161 ; SSE-NEXT: LBB22_7: ## %cond.load7
3162 ; SSE-NEXT: pinsrw $3, 6(%rdi), %xmm2
3163 ; SSE-NEXT: testb $16, %al
3164 ; SSE-NEXT: je LBB22_10
3165 ; SSE-NEXT: LBB22_9: ## %cond.load10
3166 ; SSE-NEXT: pinsrw $4, 8(%rdi), %xmm2
3167 ; SSE-NEXT: testb $32, %al
3168 ; SSE-NEXT: je LBB22_12
3169 ; SSE-NEXT: LBB22_11: ## %cond.load13
3170 ; SSE-NEXT: pinsrw $5, 10(%rdi), %xmm2
3171 ; SSE-NEXT: testb $64, %al
3172 ; SSE-NEXT: je LBB22_14
3173 ; SSE-NEXT: LBB22_13: ## %cond.load16
3174 ; SSE-NEXT: pinsrw $6, 12(%rdi), %xmm2
3175 ; SSE-NEXT: testb %al, %al
3176 ; SSE-NEXT: jns LBB22_16
3177 ; SSE-NEXT: LBB22_15: ## %cond.load19
3178 ; SSE-NEXT: pinsrw $7, 14(%rdi), %xmm2
3179 ; SSE-NEXT: testl $256, %eax ## imm = 0x100
3180 ; SSE-NEXT: je LBB22_18
3181 ; SSE-NEXT: LBB22_17: ## %cond.load22
3182 ; SSE-NEXT: pinsrw $0, 16(%rdi), %xmm3
3183 ; SSE-NEXT: testl $512, %eax ## imm = 0x200
3184 ; SSE-NEXT: je LBB22_20
3185 ; SSE-NEXT: LBB22_19: ## %cond.load25
3186 ; SSE-NEXT: pinsrw $1, 18(%rdi), %xmm3
3187 ; SSE-NEXT: testl $1024, %eax ## imm = 0x400
3188 ; SSE-NEXT: je LBB22_22
3189 ; SSE-NEXT: LBB22_21: ## %cond.load28
3190 ; SSE-NEXT: pinsrw $2, 20(%rdi), %xmm3
3191 ; SSE-NEXT: testl $2048, %eax ## imm = 0x800
3192 ; SSE-NEXT: je LBB22_24
3193 ; SSE-NEXT: LBB22_23: ## %cond.load31
3194 ; SSE-NEXT: pinsrw $3, 22(%rdi), %xmm3
3195 ; SSE-NEXT: testl $4096, %eax ## imm = 0x1000
3196 ; SSE-NEXT: je LBB22_26
3197 ; SSE-NEXT: LBB22_25: ## %cond.load34
3198 ; SSE-NEXT: pinsrw $4, 24(%rdi), %xmm3
3199 ; SSE-NEXT: testl $8192, %eax ## imm = 0x2000
3200 ; SSE-NEXT: je LBB22_28
3201 ; SSE-NEXT: LBB22_27: ## %cond.load37
3202 ; SSE-NEXT: pinsrw $5, 26(%rdi), %xmm3
3203 ; SSE-NEXT: testl $16384, %eax ## imm = 0x4000
3204 ; SSE-NEXT: je LBB22_30
3205 ; SSE-NEXT: LBB22_29: ## %cond.load40
3206 ; SSE-NEXT: pinsrw $6, 28(%rdi), %xmm3
3207 ; SSE-NEXT: testl $32768, %eax ## imm = 0x8000
3208 ; SSE-NEXT: jne LBB22_31
3209 ; SSE-NEXT: jmp LBB22_32
3211 ; AVX1-LABEL: load_v16i16_v16i16:
3213 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
3214 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
3215 ; AVX1-NEXT: vpmovmskb %xmm0, %eax
3216 ; AVX1-NEXT: testb $1, %al
3217 ; AVX1-NEXT: jne LBB22_1
3218 ; AVX1-NEXT: ## %bb.2: ## %else
3219 ; AVX1-NEXT: testb $2, %al
3220 ; AVX1-NEXT: jne LBB22_3
3221 ; AVX1-NEXT: LBB22_4: ## %else2
3222 ; AVX1-NEXT: testb $4, %al
3223 ; AVX1-NEXT: jne LBB22_5
3224 ; AVX1-NEXT: LBB22_6: ## %else5
3225 ; AVX1-NEXT: testb $8, %al
3226 ; AVX1-NEXT: jne LBB22_7
3227 ; AVX1-NEXT: LBB22_8: ## %else8
3228 ; AVX1-NEXT: testb $16, %al
3229 ; AVX1-NEXT: jne LBB22_9
3230 ; AVX1-NEXT: LBB22_10: ## %else11
3231 ; AVX1-NEXT: testb $32, %al
3232 ; AVX1-NEXT: jne LBB22_11
3233 ; AVX1-NEXT: LBB22_12: ## %else14
3234 ; AVX1-NEXT: testb $64, %al
3235 ; AVX1-NEXT: jne LBB22_13
3236 ; AVX1-NEXT: LBB22_14: ## %else17
3237 ; AVX1-NEXT: testb %al, %al
3238 ; AVX1-NEXT: js LBB22_15
3239 ; AVX1-NEXT: LBB22_16: ## %else20
3240 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
3241 ; AVX1-NEXT: jne LBB22_17
3242 ; AVX1-NEXT: LBB22_18: ## %else23
3243 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
3244 ; AVX1-NEXT: jne LBB22_19
3245 ; AVX1-NEXT: LBB22_20: ## %else26
3246 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
3247 ; AVX1-NEXT: jne LBB22_21
3248 ; AVX1-NEXT: LBB22_22: ## %else29
3249 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
3250 ; AVX1-NEXT: jne LBB22_23
3251 ; AVX1-NEXT: LBB22_24: ## %else32
3252 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
3253 ; AVX1-NEXT: jne LBB22_25
3254 ; AVX1-NEXT: LBB22_26: ## %else35
3255 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
3256 ; AVX1-NEXT: jne LBB22_27
3257 ; AVX1-NEXT: LBB22_28: ## %else38
3258 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
3259 ; AVX1-NEXT: jne LBB22_29
3260 ; AVX1-NEXT: LBB22_30: ## %else41
3261 ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
3262 ; AVX1-NEXT: jne LBB22_31
3263 ; AVX1-NEXT: LBB22_32: ## %else44
3264 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
3266 ; AVX1-NEXT: LBB22_1: ## %cond.load
3267 ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3268 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3269 ; AVX1-NEXT: testb $2, %al
3270 ; AVX1-NEXT: je LBB22_4
3271 ; AVX1-NEXT: LBB22_3: ## %cond.load1
3272 ; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3273 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3274 ; AVX1-NEXT: testb $4, %al
3275 ; AVX1-NEXT: je LBB22_6
3276 ; AVX1-NEXT: LBB22_5: ## %cond.load4
3277 ; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3278 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3279 ; AVX1-NEXT: testb $8, %al
3280 ; AVX1-NEXT: je LBB22_8
3281 ; AVX1-NEXT: LBB22_7: ## %cond.load7
3282 ; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3283 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3284 ; AVX1-NEXT: testb $16, %al
3285 ; AVX1-NEXT: je LBB22_10
3286 ; AVX1-NEXT: LBB22_9: ## %cond.load10
3287 ; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3288 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3289 ; AVX1-NEXT: testb $32, %al
3290 ; AVX1-NEXT: je LBB22_12
3291 ; AVX1-NEXT: LBB22_11: ## %cond.load13
3292 ; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3293 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3294 ; AVX1-NEXT: testb $64, %al
3295 ; AVX1-NEXT: je LBB22_14
3296 ; AVX1-NEXT: LBB22_13: ## %cond.load16
3297 ; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3298 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3299 ; AVX1-NEXT: testb %al, %al
3300 ; AVX1-NEXT: jns LBB22_16
3301 ; AVX1-NEXT: LBB22_15: ## %cond.load19
3302 ; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3303 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3304 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
3305 ; AVX1-NEXT: je LBB22_18
3306 ; AVX1-NEXT: LBB22_17: ## %cond.load22
3307 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3308 ; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0
3309 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3310 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
3311 ; AVX1-NEXT: je LBB22_20
3312 ; AVX1-NEXT: LBB22_19: ## %cond.load25
3313 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3314 ; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0
3315 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3316 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
3317 ; AVX1-NEXT: je LBB22_22
3318 ; AVX1-NEXT: LBB22_21: ## %cond.load28
3319 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3320 ; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0
3321 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3322 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
3323 ; AVX1-NEXT: je LBB22_24
3324 ; AVX1-NEXT: LBB22_23: ## %cond.load31
3325 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3326 ; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0
3327 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3328 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
3329 ; AVX1-NEXT: je LBB22_26
3330 ; AVX1-NEXT: LBB22_25: ## %cond.load34
3331 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3332 ; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
3333 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3334 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
3335 ; AVX1-NEXT: je LBB22_28
3336 ; AVX1-NEXT: LBB22_27: ## %cond.load37
3337 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3338 ; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0
3339 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3340 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
3341 ; AVX1-NEXT: je LBB22_30
3342 ; AVX1-NEXT: LBB22_29: ## %cond.load40
3343 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3344 ; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
3345 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3346 ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
3347 ; AVX1-NEXT: je LBB22_32
3348 ; AVX1-NEXT: LBB22_31: ## %cond.load43
3349 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
3350 ; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
3351 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
3352 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
3355 ; AVX2-LABEL: load_v16i16_v16i16:
3357 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
3358 ; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
3359 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
3360 ; AVX2-NEXT: testb $1, %al
3361 ; AVX2-NEXT: jne LBB22_1
3362 ; AVX2-NEXT: ## %bb.2: ## %else
3363 ; AVX2-NEXT: testb $2, %al
3364 ; AVX2-NEXT: jne LBB22_3
3365 ; AVX2-NEXT: LBB22_4: ## %else2
3366 ; AVX2-NEXT: testb $4, %al
3367 ; AVX2-NEXT: jne LBB22_5
3368 ; AVX2-NEXT: LBB22_6: ## %else5
3369 ; AVX2-NEXT: testb $8, %al
3370 ; AVX2-NEXT: jne LBB22_7
3371 ; AVX2-NEXT: LBB22_8: ## %else8
3372 ; AVX2-NEXT: testb $16, %al
3373 ; AVX2-NEXT: jne LBB22_9
3374 ; AVX2-NEXT: LBB22_10: ## %else11
3375 ; AVX2-NEXT: testb $32, %al
3376 ; AVX2-NEXT: jne LBB22_11
3377 ; AVX2-NEXT: LBB22_12: ## %else14
3378 ; AVX2-NEXT: testb $64, %al
3379 ; AVX2-NEXT: jne LBB22_13
3380 ; AVX2-NEXT: LBB22_14: ## %else17
3381 ; AVX2-NEXT: testb %al, %al
3382 ; AVX2-NEXT: js LBB22_15
3383 ; AVX2-NEXT: LBB22_16: ## %else20
3384 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
3385 ; AVX2-NEXT: jne LBB22_17
3386 ; AVX2-NEXT: LBB22_18: ## %else23
3387 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
3388 ; AVX2-NEXT: jne LBB22_19
3389 ; AVX2-NEXT: LBB22_20: ## %else26
3390 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
3391 ; AVX2-NEXT: jne LBB22_21
3392 ; AVX2-NEXT: LBB22_22: ## %else29
3393 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
3394 ; AVX2-NEXT: jne LBB22_23
3395 ; AVX2-NEXT: LBB22_24: ## %else32
3396 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
3397 ; AVX2-NEXT: jne LBB22_25
3398 ; AVX2-NEXT: LBB22_26: ## %else35
3399 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
3400 ; AVX2-NEXT: jne LBB22_27
3401 ; AVX2-NEXT: LBB22_28: ## %else38
3402 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
3403 ; AVX2-NEXT: jne LBB22_29
3404 ; AVX2-NEXT: LBB22_30: ## %else41
3405 ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
3406 ; AVX2-NEXT: jne LBB22_31
3407 ; AVX2-NEXT: LBB22_32: ## %else44
3408 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
3410 ; AVX2-NEXT: LBB22_1: ## %cond.load
3411 ; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3412 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3413 ; AVX2-NEXT: testb $2, %al
3414 ; AVX2-NEXT: je LBB22_4
3415 ; AVX2-NEXT: LBB22_3: ## %cond.load1
3416 ; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3417 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3418 ; AVX2-NEXT: testb $4, %al
3419 ; AVX2-NEXT: je LBB22_6
3420 ; AVX2-NEXT: LBB22_5: ## %cond.load4
3421 ; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3422 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3423 ; AVX2-NEXT: testb $8, %al
3424 ; AVX2-NEXT: je LBB22_8
3425 ; AVX2-NEXT: LBB22_7: ## %cond.load7
3426 ; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3427 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3428 ; AVX2-NEXT: testb $16, %al
3429 ; AVX2-NEXT: je LBB22_10
3430 ; AVX2-NEXT: LBB22_9: ## %cond.load10
3431 ; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3432 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3433 ; AVX2-NEXT: testb $32, %al
3434 ; AVX2-NEXT: je LBB22_12
3435 ; AVX2-NEXT: LBB22_11: ## %cond.load13
3436 ; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3437 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3438 ; AVX2-NEXT: testb $64, %al
3439 ; AVX2-NEXT: je LBB22_14
3440 ; AVX2-NEXT: LBB22_13: ## %cond.load16
3441 ; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3442 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3443 ; AVX2-NEXT: testb %al, %al
3444 ; AVX2-NEXT: jns LBB22_16
3445 ; AVX2-NEXT: LBB22_15: ## %cond.load19
3446 ; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3447 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3448 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
3449 ; AVX2-NEXT: je LBB22_18
3450 ; AVX2-NEXT: LBB22_17: ## %cond.load22
3451 ; AVX2-NEXT: vpbroadcastw 16(%rdi), %ymm0
3452 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3453 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3454 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
3455 ; AVX2-NEXT: je LBB22_20
3456 ; AVX2-NEXT: LBB22_19: ## %cond.load25
3457 ; AVX2-NEXT: vpbroadcastw 18(%rdi), %ymm0
3458 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
3459 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3460 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
3461 ; AVX2-NEXT: je LBB22_22
3462 ; AVX2-NEXT: LBB22_21: ## %cond.load28
3463 ; AVX2-NEXT: vpbroadcastw 20(%rdi), %ymm0
3464 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
3465 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3466 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
3467 ; AVX2-NEXT: je LBB22_24
3468 ; AVX2-NEXT: LBB22_23: ## %cond.load31
3469 ; AVX2-NEXT: vpbroadcastw 22(%rdi), %ymm0
3470 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3471 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3472 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
3473 ; AVX2-NEXT: je LBB22_26
3474 ; AVX2-NEXT: LBB22_25: ## %cond.load34
3475 ; AVX2-NEXT: vpbroadcastw 24(%rdi), %ymm0
3476 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
3477 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3478 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
3479 ; AVX2-NEXT: je LBB22_28
3480 ; AVX2-NEXT: LBB22_27: ## %cond.load37
3481 ; AVX2-NEXT: vpbroadcastw 26(%rdi), %ymm0
3482 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
3483 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3484 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
3485 ; AVX2-NEXT: je LBB22_30
3486 ; AVX2-NEXT: LBB22_29: ## %cond.load40
3487 ; AVX2-NEXT: vpbroadcastw 28(%rdi), %ymm0
3488 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
3489 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3490 ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
3491 ; AVX2-NEXT: je LBB22_32
3492 ; AVX2-NEXT: LBB22_31: ## %cond.load43
3493 ; AVX2-NEXT: vpbroadcastw 30(%rdi), %ymm0
3494 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3495 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3496 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
3499 ; AVX512F-LABEL: load_v16i16_v16i16:
3500 ; AVX512F: ## %bb.0:
3501 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
3502 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
3503 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
3504 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
3505 ; AVX512F-NEXT: kmovw %k0, %eax
3506 ; AVX512F-NEXT: testb $1, %al
3507 ; AVX512F-NEXT: jne LBB22_1
3508 ; AVX512F-NEXT: ## %bb.2: ## %else
3509 ; AVX512F-NEXT: testb $2, %al
3510 ; AVX512F-NEXT: jne LBB22_3
3511 ; AVX512F-NEXT: LBB22_4: ## %else2
3512 ; AVX512F-NEXT: testb $4, %al
3513 ; AVX512F-NEXT: jne LBB22_5
3514 ; AVX512F-NEXT: LBB22_6: ## %else5
3515 ; AVX512F-NEXT: testb $8, %al
3516 ; AVX512F-NEXT: jne LBB22_7
3517 ; AVX512F-NEXT: LBB22_8: ## %else8
3518 ; AVX512F-NEXT: testb $16, %al
3519 ; AVX512F-NEXT: jne LBB22_9
3520 ; AVX512F-NEXT: LBB22_10: ## %else11
3521 ; AVX512F-NEXT: testb $32, %al
3522 ; AVX512F-NEXT: jne LBB22_11
3523 ; AVX512F-NEXT: LBB22_12: ## %else14
3524 ; AVX512F-NEXT: testb $64, %al
3525 ; AVX512F-NEXT: jne LBB22_13
3526 ; AVX512F-NEXT: LBB22_14: ## %else17
3527 ; AVX512F-NEXT: testb %al, %al
3528 ; AVX512F-NEXT: js LBB22_15
3529 ; AVX512F-NEXT: LBB22_16: ## %else20
3530 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
3531 ; AVX512F-NEXT: jne LBB22_17
3532 ; AVX512F-NEXT: LBB22_18: ## %else23
3533 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
3534 ; AVX512F-NEXT: jne LBB22_19
3535 ; AVX512F-NEXT: LBB22_20: ## %else26
3536 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
3537 ; AVX512F-NEXT: jne LBB22_21
3538 ; AVX512F-NEXT: LBB22_22: ## %else29
3539 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
3540 ; AVX512F-NEXT: jne LBB22_23
3541 ; AVX512F-NEXT: LBB22_24: ## %else32
3542 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
3543 ; AVX512F-NEXT: jne LBB22_25
3544 ; AVX512F-NEXT: LBB22_26: ## %else35
3545 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
3546 ; AVX512F-NEXT: jne LBB22_27
3547 ; AVX512F-NEXT: LBB22_28: ## %else38
3548 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
3549 ; AVX512F-NEXT: jne LBB22_29
3550 ; AVX512F-NEXT: LBB22_30: ## %else41
3551 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
3552 ; AVX512F-NEXT: jne LBB22_31
3553 ; AVX512F-NEXT: LBB22_32: ## %else44
3554 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
3555 ; AVX512F-NEXT: retq
3556 ; AVX512F-NEXT: LBB22_1: ## %cond.load
3557 ; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3558 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3559 ; AVX512F-NEXT: testb $2, %al
3560 ; AVX512F-NEXT: je LBB22_4
3561 ; AVX512F-NEXT: LBB22_3: ## %cond.load1
3562 ; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3563 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3564 ; AVX512F-NEXT: testb $4, %al
3565 ; AVX512F-NEXT: je LBB22_6
3566 ; AVX512F-NEXT: LBB22_5: ## %cond.load4
3567 ; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3568 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3569 ; AVX512F-NEXT: testb $8, %al
3570 ; AVX512F-NEXT: je LBB22_8
3571 ; AVX512F-NEXT: LBB22_7: ## %cond.load7
3572 ; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3573 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3574 ; AVX512F-NEXT: testb $16, %al
3575 ; AVX512F-NEXT: je LBB22_10
3576 ; AVX512F-NEXT: LBB22_9: ## %cond.load10
3577 ; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3578 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3579 ; AVX512F-NEXT: testb $32, %al
3580 ; AVX512F-NEXT: je LBB22_12
3581 ; AVX512F-NEXT: LBB22_11: ## %cond.load13
3582 ; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3583 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3584 ; AVX512F-NEXT: testb $64, %al
3585 ; AVX512F-NEXT: je LBB22_14
3586 ; AVX512F-NEXT: LBB22_13: ## %cond.load16
3587 ; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3588 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3589 ; AVX512F-NEXT: testb %al, %al
3590 ; AVX512F-NEXT: jns LBB22_16
3591 ; AVX512F-NEXT: LBB22_15: ## %cond.load19
3592 ; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3593 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3594 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
3595 ; AVX512F-NEXT: je LBB22_18
3596 ; AVX512F-NEXT: LBB22_17: ## %cond.load22
3597 ; AVX512F-NEXT: vpbroadcastw 16(%rdi), %ymm0
3598 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3599 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3600 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
3601 ; AVX512F-NEXT: je LBB22_20
3602 ; AVX512F-NEXT: LBB22_19: ## %cond.load25
3603 ; AVX512F-NEXT: vpbroadcastw 18(%rdi), %ymm0
3604 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
3605 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3606 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
3607 ; AVX512F-NEXT: je LBB22_22
3608 ; AVX512F-NEXT: LBB22_21: ## %cond.load28
3609 ; AVX512F-NEXT: vpbroadcastw 20(%rdi), %ymm0
3610 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
3611 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3612 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
3613 ; AVX512F-NEXT: je LBB22_24
3614 ; AVX512F-NEXT: LBB22_23: ## %cond.load31
3615 ; AVX512F-NEXT: vpbroadcastw 22(%rdi), %ymm0
3616 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3617 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3618 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
3619 ; AVX512F-NEXT: je LBB22_26
3620 ; AVX512F-NEXT: LBB22_25: ## %cond.load34
3621 ; AVX512F-NEXT: vpbroadcastw 24(%rdi), %ymm0
3622 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
3623 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3624 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
3625 ; AVX512F-NEXT: je LBB22_28
3626 ; AVX512F-NEXT: LBB22_27: ## %cond.load37
3627 ; AVX512F-NEXT: vpbroadcastw 26(%rdi), %ymm0
3628 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
3629 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3630 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
3631 ; AVX512F-NEXT: je LBB22_30
3632 ; AVX512F-NEXT: LBB22_29: ## %cond.load40
3633 ; AVX512F-NEXT: vpbroadcastw 28(%rdi), %ymm0
3634 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
3635 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3636 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
3637 ; AVX512F-NEXT: je LBB22_32
3638 ; AVX512F-NEXT: LBB22_31: ## %cond.load43
3639 ; AVX512F-NEXT: vpbroadcastw 30(%rdi), %ymm0
3640 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3641 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3642 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
3643 ; AVX512F-NEXT: retq
3645 ; AVX512VLDQ-LABEL: load_v16i16_v16i16:
3646 ; AVX512VLDQ: ## %bb.0:
3647 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
3648 ; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
3649 ; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
3650 ; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
3651 ; AVX512VLDQ-NEXT: kmovw %k0, %eax
3652 ; AVX512VLDQ-NEXT: testb $1, %al
3653 ; AVX512VLDQ-NEXT: jne LBB22_1
3654 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
3655 ; AVX512VLDQ-NEXT: testb $2, %al
3656 ; AVX512VLDQ-NEXT: jne LBB22_3
3657 ; AVX512VLDQ-NEXT: LBB22_4: ## %else2
3658 ; AVX512VLDQ-NEXT: testb $4, %al
3659 ; AVX512VLDQ-NEXT: jne LBB22_5
3660 ; AVX512VLDQ-NEXT: LBB22_6: ## %else5
3661 ; AVX512VLDQ-NEXT: testb $8, %al
3662 ; AVX512VLDQ-NEXT: jne LBB22_7
3663 ; AVX512VLDQ-NEXT: LBB22_8: ## %else8
3664 ; AVX512VLDQ-NEXT: testb $16, %al
3665 ; AVX512VLDQ-NEXT: jne LBB22_9
3666 ; AVX512VLDQ-NEXT: LBB22_10: ## %else11
3667 ; AVX512VLDQ-NEXT: testb $32, %al
3668 ; AVX512VLDQ-NEXT: jne LBB22_11
3669 ; AVX512VLDQ-NEXT: LBB22_12: ## %else14
3670 ; AVX512VLDQ-NEXT: testb $64, %al
3671 ; AVX512VLDQ-NEXT: jne LBB22_13
3672 ; AVX512VLDQ-NEXT: LBB22_14: ## %else17
3673 ; AVX512VLDQ-NEXT: testb %al, %al
3674 ; AVX512VLDQ-NEXT: js LBB22_15
3675 ; AVX512VLDQ-NEXT: LBB22_16: ## %else20
3676 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
3677 ; AVX512VLDQ-NEXT: jne LBB22_17
3678 ; AVX512VLDQ-NEXT: LBB22_18: ## %else23
3679 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
3680 ; AVX512VLDQ-NEXT: jne LBB22_19
3681 ; AVX512VLDQ-NEXT: LBB22_20: ## %else26
3682 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
3683 ; AVX512VLDQ-NEXT: jne LBB22_21
3684 ; AVX512VLDQ-NEXT: LBB22_22: ## %else29
3685 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
3686 ; AVX512VLDQ-NEXT: jne LBB22_23
3687 ; AVX512VLDQ-NEXT: LBB22_24: ## %else32
3688 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
3689 ; AVX512VLDQ-NEXT: jne LBB22_25
3690 ; AVX512VLDQ-NEXT: LBB22_26: ## %else35
3691 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
3692 ; AVX512VLDQ-NEXT: jne LBB22_27
3693 ; AVX512VLDQ-NEXT: LBB22_28: ## %else38
3694 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
3695 ; AVX512VLDQ-NEXT: jne LBB22_29
3696 ; AVX512VLDQ-NEXT: LBB22_30: ## %else41
3697 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
3698 ; AVX512VLDQ-NEXT: jne LBB22_31
3699 ; AVX512VLDQ-NEXT: LBB22_32: ## %else44
3700 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
3701 ; AVX512VLDQ-NEXT: retq
3702 ; AVX512VLDQ-NEXT: LBB22_1: ## %cond.load
3703 ; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm0
3704 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3705 ; AVX512VLDQ-NEXT: testb $2, %al
3706 ; AVX512VLDQ-NEXT: je LBB22_4
3707 ; AVX512VLDQ-NEXT: LBB22_3: ## %cond.load1
3708 ; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm0
3709 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3710 ; AVX512VLDQ-NEXT: testb $4, %al
3711 ; AVX512VLDQ-NEXT: je LBB22_6
3712 ; AVX512VLDQ-NEXT: LBB22_5: ## %cond.load4
3713 ; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm0
3714 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3715 ; AVX512VLDQ-NEXT: testb $8, %al
3716 ; AVX512VLDQ-NEXT: je LBB22_8
3717 ; AVX512VLDQ-NEXT: LBB22_7: ## %cond.load7
3718 ; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm0
3719 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3720 ; AVX512VLDQ-NEXT: testb $16, %al
3721 ; AVX512VLDQ-NEXT: je LBB22_10
3722 ; AVX512VLDQ-NEXT: LBB22_9: ## %cond.load10
3723 ; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm0
3724 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3725 ; AVX512VLDQ-NEXT: testb $32, %al
3726 ; AVX512VLDQ-NEXT: je LBB22_12
3727 ; AVX512VLDQ-NEXT: LBB22_11: ## %cond.load13
3728 ; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm0
3729 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3730 ; AVX512VLDQ-NEXT: testb $64, %al
3731 ; AVX512VLDQ-NEXT: je LBB22_14
3732 ; AVX512VLDQ-NEXT: LBB22_13: ## %cond.load16
3733 ; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm0
3734 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3735 ; AVX512VLDQ-NEXT: testb %al, %al
3736 ; AVX512VLDQ-NEXT: jns LBB22_16
3737 ; AVX512VLDQ-NEXT: LBB22_15: ## %cond.load19
3738 ; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm0
3739 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3740 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
3741 ; AVX512VLDQ-NEXT: je LBB22_18
3742 ; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22
3743 ; AVX512VLDQ-NEXT: vpbroadcastw 16(%rdi), %ymm0
3744 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3745 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3746 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
3747 ; AVX512VLDQ-NEXT: je LBB22_20
3748 ; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25
3749 ; AVX512VLDQ-NEXT: vpbroadcastw 18(%rdi), %ymm0
3750 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
3751 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3752 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
3753 ; AVX512VLDQ-NEXT: je LBB22_22
3754 ; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28
3755 ; AVX512VLDQ-NEXT: vpbroadcastw 20(%rdi), %ymm0
3756 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
3757 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3758 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
3759 ; AVX512VLDQ-NEXT: je LBB22_24
3760 ; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31
3761 ; AVX512VLDQ-NEXT: vpbroadcastw 22(%rdi), %ymm0
3762 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
3763 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3764 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
3765 ; AVX512VLDQ-NEXT: je LBB22_26
3766 ; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34
3767 ; AVX512VLDQ-NEXT: vpbroadcastw 24(%rdi), %ymm0
3768 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
3769 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3770 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
3771 ; AVX512VLDQ-NEXT: je LBB22_28
3772 ; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37
3773 ; AVX512VLDQ-NEXT: vpbroadcastw 26(%rdi), %ymm0
3774 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
3775 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3776 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
3777 ; AVX512VLDQ-NEXT: je LBB22_30
3778 ; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40
3779 ; AVX512VLDQ-NEXT: vpbroadcastw 28(%rdi), %ymm0
3780 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
3781 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3782 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
3783 ; AVX512VLDQ-NEXT: je LBB22_32
3784 ; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43
3785 ; AVX512VLDQ-NEXT: vpbroadcastw 30(%rdi), %ymm0
3786 ; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
3787 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3788 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
3789 ; AVX512VLDQ-NEXT: retq
3791 ; AVX512VLBW-LABEL: load_v16i16_v16i16:
3792 ; AVX512VLBW: ## %bb.0:
3793 ; AVX512VLBW-NEXT: vpmovw2m %ymm0, %k1
3794 ; AVX512VLBW-NEXT: vpblendmw (%rdi), %ymm1, %ymm0 {%k1}
3795 ; AVX512VLBW-NEXT: retq
3797 ; X86-AVX512-LABEL: load_v16i16_v16i16:
3798 ; X86-AVX512: ## %bb.0:
3799 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
3800 ; X86-AVX512-NEXT: vpmovw2m %ymm0, %k1
3801 ; X86-AVX512-NEXT: vpblendmw (%eax), %ymm1, %ymm0 {%k1}
3802 ; X86-AVX512-NEXT: retl
3803 %mask = icmp slt <16 x i16> %trigger, zeroinitializer
3804 %res = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x i16> %dst)
3812 define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %dst) {
3813 ; SSE2-LABEL: load_v16i8_v16i8:
3815 ; SSE2-NEXT: pmovmskb %xmm0, %eax
3816 ; SSE2-NEXT: testb $1, %al
3817 ; SSE2-NEXT: jne LBB23_1
3818 ; SSE2-NEXT: ## %bb.2: ## %else
3819 ; SSE2-NEXT: testb $2, %al
3820 ; SSE2-NEXT: jne LBB23_3
3821 ; SSE2-NEXT: LBB23_4: ## %else2
3822 ; SSE2-NEXT: testb $4, %al
3823 ; SSE2-NEXT: jne LBB23_5
3824 ; SSE2-NEXT: LBB23_6: ## %else5
3825 ; SSE2-NEXT: testb $8, %al
3826 ; SSE2-NEXT: jne LBB23_7
3827 ; SSE2-NEXT: LBB23_8: ## %else8
3828 ; SSE2-NEXT: testb $16, %al
3829 ; SSE2-NEXT: jne LBB23_9
3830 ; SSE2-NEXT: LBB23_10: ## %else11
3831 ; SSE2-NEXT: testb $32, %al
3832 ; SSE2-NEXT: jne LBB23_11
3833 ; SSE2-NEXT: LBB23_12: ## %else14
3834 ; SSE2-NEXT: testb $64, %al
3835 ; SSE2-NEXT: jne LBB23_13
3836 ; SSE2-NEXT: LBB23_14: ## %else17
3837 ; SSE2-NEXT: testb %al, %al
3838 ; SSE2-NEXT: js LBB23_15
3839 ; SSE2-NEXT: LBB23_16: ## %else20
3840 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
3841 ; SSE2-NEXT: jne LBB23_17
3842 ; SSE2-NEXT: LBB23_18: ## %else23
3843 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
3844 ; SSE2-NEXT: jne LBB23_19
3845 ; SSE2-NEXT: LBB23_20: ## %else26
3846 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
3847 ; SSE2-NEXT: jne LBB23_21
3848 ; SSE2-NEXT: LBB23_22: ## %else29
3849 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
3850 ; SSE2-NEXT: jne LBB23_23
3851 ; SSE2-NEXT: LBB23_24: ## %else32
3852 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
3853 ; SSE2-NEXT: jne LBB23_25
3854 ; SSE2-NEXT: LBB23_26: ## %else35
3855 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
3856 ; SSE2-NEXT: jne LBB23_27
3857 ; SSE2-NEXT: LBB23_28: ## %else38
3858 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
3859 ; SSE2-NEXT: jne LBB23_29
3860 ; SSE2-NEXT: LBB23_30: ## %else41
3861 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
3862 ; SSE2-NEXT: jne LBB23_31
3863 ; SSE2-NEXT: LBB23_32: ## %else44
3864 ; SSE2-NEXT: movdqa %xmm1, %xmm0
3866 ; SSE2-NEXT: LBB23_1: ## %cond.load
3867 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3868 ; SSE2-NEXT: pand %xmm0, %xmm1
3869 ; SSE2-NEXT: movzbl (%rdi), %ecx
3870 ; SSE2-NEXT: movd %ecx, %xmm2
3871 ; SSE2-NEXT: pandn %xmm2, %xmm0
3872 ; SSE2-NEXT: por %xmm0, %xmm1
3873 ; SSE2-NEXT: testb $2, %al
3874 ; SSE2-NEXT: je LBB23_4
3875 ; SSE2-NEXT: LBB23_3: ## %cond.load1
3876 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3877 ; SSE2-NEXT: pand %xmm0, %xmm1
3878 ; SSE2-NEXT: movzbl 1(%rdi), %ecx
3879 ; SSE2-NEXT: movd %ecx, %xmm2
3880 ; SSE2-NEXT: psllw $8, %xmm2
3881 ; SSE2-NEXT: pandn %xmm2, %xmm0
3882 ; SSE2-NEXT: por %xmm0, %xmm1
3883 ; SSE2-NEXT: testb $4, %al
3884 ; SSE2-NEXT: je LBB23_6
3885 ; SSE2-NEXT: LBB23_5: ## %cond.load4
3886 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
3887 ; SSE2-NEXT: pand %xmm0, %xmm1
3888 ; SSE2-NEXT: movzbl 2(%rdi), %ecx
3889 ; SSE2-NEXT: movd %ecx, %xmm2
3890 ; SSE2-NEXT: pslld $16, %xmm2
3891 ; SSE2-NEXT: pandn %xmm2, %xmm0
3892 ; SSE2-NEXT: por %xmm0, %xmm1
3893 ; SSE2-NEXT: testb $8, %al
3894 ; SSE2-NEXT: je LBB23_8
3895 ; SSE2-NEXT: LBB23_7: ## %cond.load7
3896 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
3897 ; SSE2-NEXT: pand %xmm0, %xmm1
3898 ; SSE2-NEXT: movzbl 3(%rdi), %ecx
3899 ; SSE2-NEXT: movd %ecx, %xmm2
3900 ; SSE2-NEXT: pslld $24, %xmm2
3901 ; SSE2-NEXT: pandn %xmm2, %xmm0
3902 ; SSE2-NEXT: por %xmm0, %xmm1
3903 ; SSE2-NEXT: testb $16, %al
3904 ; SSE2-NEXT: je LBB23_10
3905 ; SSE2-NEXT: LBB23_9: ## %cond.load10
3906 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
3907 ; SSE2-NEXT: pand %xmm0, %xmm1
3908 ; SSE2-NEXT: movzbl 4(%rdi), %ecx
3909 ; SSE2-NEXT: movd %ecx, %xmm2
3910 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
3911 ; SSE2-NEXT: pandn %xmm2, %xmm0
3912 ; SSE2-NEXT: por %xmm0, %xmm1
3913 ; SSE2-NEXT: testb $32, %al
3914 ; SSE2-NEXT: je LBB23_12
3915 ; SSE2-NEXT: LBB23_11: ## %cond.load13
3916 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
3917 ; SSE2-NEXT: pand %xmm0, %xmm1
3918 ; SSE2-NEXT: movzbl 5(%rdi), %ecx
3919 ; SSE2-NEXT: movd %ecx, %xmm2
3920 ; SSE2-NEXT: psllq $40, %xmm2
3921 ; SSE2-NEXT: pandn %xmm2, %xmm0
3922 ; SSE2-NEXT: por %xmm0, %xmm1
3923 ; SSE2-NEXT: testb $64, %al
3924 ; SSE2-NEXT: je LBB23_14
3925 ; SSE2-NEXT: LBB23_13: ## %cond.load16
3926 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
3927 ; SSE2-NEXT: pand %xmm0, %xmm1
3928 ; SSE2-NEXT: movzbl 6(%rdi), %ecx
3929 ; SSE2-NEXT: movd %ecx, %xmm2
3930 ; SSE2-NEXT: psllq $48, %xmm2
3931 ; SSE2-NEXT: pandn %xmm2, %xmm0
3932 ; SSE2-NEXT: por %xmm0, %xmm1
3933 ; SSE2-NEXT: testb %al, %al
3934 ; SSE2-NEXT: jns LBB23_16
3935 ; SSE2-NEXT: LBB23_15: ## %cond.load19
3936 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
3937 ; SSE2-NEXT: pand %xmm0, %xmm1
3938 ; SSE2-NEXT: movzbl 7(%rdi), %ecx
3939 ; SSE2-NEXT: movd %ecx, %xmm2
3940 ; SSE2-NEXT: psllq $56, %xmm2
3941 ; SSE2-NEXT: pandn %xmm2, %xmm0
3942 ; SSE2-NEXT: por %xmm0, %xmm1
3943 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
3944 ; SSE2-NEXT: je LBB23_18
3945 ; SSE2-NEXT: LBB23_17: ## %cond.load22
3946 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3947 ; SSE2-NEXT: pand %xmm0, %xmm1
3948 ; SSE2-NEXT: movzbl 8(%rdi), %ecx
3949 ; SSE2-NEXT: movd %ecx, %xmm2
3950 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
3951 ; SSE2-NEXT: pandn %xmm2, %xmm0
3952 ; SSE2-NEXT: por %xmm0, %xmm1
3953 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
3954 ; SSE2-NEXT: je LBB23_20
3955 ; SSE2-NEXT: LBB23_19: ## %cond.load25
3956 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
3957 ; SSE2-NEXT: pand %xmm0, %xmm1
3958 ; SSE2-NEXT: movzbl 9(%rdi), %ecx
3959 ; SSE2-NEXT: movd %ecx, %xmm2
3960 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
3961 ; SSE2-NEXT: pandn %xmm2, %xmm0
3962 ; SSE2-NEXT: por %xmm0, %xmm1
3963 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
3964 ; SSE2-NEXT: je LBB23_22
3965 ; SSE2-NEXT: LBB23_21: ## %cond.load28
3966 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
3967 ; SSE2-NEXT: pand %xmm0, %xmm1
3968 ; SSE2-NEXT: movzbl 10(%rdi), %ecx
3969 ; SSE2-NEXT: movd %ecx, %xmm2
3970 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
3971 ; SSE2-NEXT: pandn %xmm2, %xmm0
3972 ; SSE2-NEXT: por %xmm0, %xmm1
3973 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
3974 ; SSE2-NEXT: je LBB23_24
3975 ; SSE2-NEXT: LBB23_23: ## %cond.load31
3976 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
3977 ; SSE2-NEXT: pand %xmm0, %xmm1
3978 ; SSE2-NEXT: movzbl 11(%rdi), %ecx
3979 ; SSE2-NEXT: movd %ecx, %xmm2
3980 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
3981 ; SSE2-NEXT: pandn %xmm2, %xmm0
3982 ; SSE2-NEXT: por %xmm0, %xmm1
3983 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
3984 ; SSE2-NEXT: je LBB23_26
3985 ; SSE2-NEXT: LBB23_25: ## %cond.load34
3986 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3987 ; SSE2-NEXT: pand %xmm0, %xmm1
3988 ; SSE2-NEXT: movzbl 12(%rdi), %ecx
3989 ; SSE2-NEXT: movd %ecx, %xmm2
3990 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
3991 ; SSE2-NEXT: pandn %xmm2, %xmm0
3992 ; SSE2-NEXT: por %xmm0, %xmm1
3993 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
3994 ; SSE2-NEXT: je LBB23_28
3995 ; SSE2-NEXT: LBB23_27: ## %cond.load37
3996 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
3997 ; SSE2-NEXT: pand %xmm0, %xmm1
3998 ; SSE2-NEXT: movzbl 13(%rdi), %ecx
3999 ; SSE2-NEXT: movd %ecx, %xmm2
4000 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
4001 ; SSE2-NEXT: pandn %xmm2, %xmm0
4002 ; SSE2-NEXT: por %xmm0, %xmm1
4003 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
4004 ; SSE2-NEXT: je LBB23_30
4005 ; SSE2-NEXT: LBB23_29: ## %cond.load40
4006 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4007 ; SSE2-NEXT: pand %xmm0, %xmm1
4008 ; SSE2-NEXT: movzbl 14(%rdi), %ecx
4009 ; SSE2-NEXT: movd %ecx, %xmm2
4010 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
4011 ; SSE2-NEXT: pandn %xmm2, %xmm0
4012 ; SSE2-NEXT: por %xmm0, %xmm1
4013 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
4014 ; SSE2-NEXT: je LBB23_32
4015 ; SSE2-NEXT: LBB23_31: ## %cond.load43
4016 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
4017 ; SSE2-NEXT: movzbl 15(%rdi), %eax
4018 ; SSE2-NEXT: movd %eax, %xmm0
4019 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4020 ; SSE2-NEXT: por %xmm0, %xmm1
4021 ; SSE2-NEXT: movdqa %xmm1, %xmm0
4024 ; SSE42-LABEL: load_v16i8_v16i8:
4026 ; SSE42-NEXT: pmovmskb %xmm0, %eax
4027 ; SSE42-NEXT: testb $1, %al
4028 ; SSE42-NEXT: jne LBB23_1
4029 ; SSE42-NEXT: ## %bb.2: ## %else
4030 ; SSE42-NEXT: testb $2, %al
4031 ; SSE42-NEXT: jne LBB23_3
4032 ; SSE42-NEXT: LBB23_4: ## %else2
4033 ; SSE42-NEXT: testb $4, %al
4034 ; SSE42-NEXT: jne LBB23_5
4035 ; SSE42-NEXT: LBB23_6: ## %else5
4036 ; SSE42-NEXT: testb $8, %al
4037 ; SSE42-NEXT: jne LBB23_7
4038 ; SSE42-NEXT: LBB23_8: ## %else8
4039 ; SSE42-NEXT: testb $16, %al
4040 ; SSE42-NEXT: jne LBB23_9
4041 ; SSE42-NEXT: LBB23_10: ## %else11
4042 ; SSE42-NEXT: testb $32, %al
4043 ; SSE42-NEXT: jne LBB23_11
4044 ; SSE42-NEXT: LBB23_12: ## %else14
4045 ; SSE42-NEXT: testb $64, %al
4046 ; SSE42-NEXT: jne LBB23_13
4047 ; SSE42-NEXT: LBB23_14: ## %else17
4048 ; SSE42-NEXT: testb %al, %al
4049 ; SSE42-NEXT: js LBB23_15
4050 ; SSE42-NEXT: LBB23_16: ## %else20
4051 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
4052 ; SSE42-NEXT: jne LBB23_17
4053 ; SSE42-NEXT: LBB23_18: ## %else23
4054 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
4055 ; SSE42-NEXT: jne LBB23_19
4056 ; SSE42-NEXT: LBB23_20: ## %else26
4057 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
4058 ; SSE42-NEXT: jne LBB23_21
4059 ; SSE42-NEXT: LBB23_22: ## %else29
4060 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
4061 ; SSE42-NEXT: jne LBB23_23
4062 ; SSE42-NEXT: LBB23_24: ## %else32
4063 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
4064 ; SSE42-NEXT: jne LBB23_25
4065 ; SSE42-NEXT: LBB23_26: ## %else35
4066 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
4067 ; SSE42-NEXT: jne LBB23_27
4068 ; SSE42-NEXT: LBB23_28: ## %else38
4069 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
4070 ; SSE42-NEXT: jne LBB23_29
4071 ; SSE42-NEXT: LBB23_30: ## %else41
4072 ; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
4073 ; SSE42-NEXT: jne LBB23_31
4074 ; SSE42-NEXT: LBB23_32: ## %else44
4075 ; SSE42-NEXT: movdqa %xmm1, %xmm0
4077 ; SSE42-NEXT: LBB23_1: ## %cond.load
4078 ; SSE42-NEXT: pinsrb $0, (%rdi), %xmm1
4079 ; SSE42-NEXT: testb $2, %al
4080 ; SSE42-NEXT: je LBB23_4
4081 ; SSE42-NEXT: LBB23_3: ## %cond.load1
4082 ; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm1
4083 ; SSE42-NEXT: testb $4, %al
4084 ; SSE42-NEXT: je LBB23_6
4085 ; SSE42-NEXT: LBB23_5: ## %cond.load4
4086 ; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm1
4087 ; SSE42-NEXT: testb $8, %al
4088 ; SSE42-NEXT: je LBB23_8
4089 ; SSE42-NEXT: LBB23_7: ## %cond.load7
4090 ; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm1
4091 ; SSE42-NEXT: testb $16, %al
4092 ; SSE42-NEXT: je LBB23_10
4093 ; SSE42-NEXT: LBB23_9: ## %cond.load10
4094 ; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm1
4095 ; SSE42-NEXT: testb $32, %al
4096 ; SSE42-NEXT: je LBB23_12
4097 ; SSE42-NEXT: LBB23_11: ## %cond.load13
4098 ; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm1
4099 ; SSE42-NEXT: testb $64, %al
4100 ; SSE42-NEXT: je LBB23_14
4101 ; SSE42-NEXT: LBB23_13: ## %cond.load16
4102 ; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm1
4103 ; SSE42-NEXT: testb %al, %al
4104 ; SSE42-NEXT: jns LBB23_16
4105 ; SSE42-NEXT: LBB23_15: ## %cond.load19
4106 ; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm1
4107 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
4108 ; SSE42-NEXT: je LBB23_18
4109 ; SSE42-NEXT: LBB23_17: ## %cond.load22
4110 ; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm1
4111 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
4112 ; SSE42-NEXT: je LBB23_20
4113 ; SSE42-NEXT: LBB23_19: ## %cond.load25
4114 ; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm1
4115 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
4116 ; SSE42-NEXT: je LBB23_22
4117 ; SSE42-NEXT: LBB23_21: ## %cond.load28
4118 ; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm1
4119 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
4120 ; SSE42-NEXT: je LBB23_24
4121 ; SSE42-NEXT: LBB23_23: ## %cond.load31
4122 ; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm1
4123 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
4124 ; SSE42-NEXT: je LBB23_26
4125 ; SSE42-NEXT: LBB23_25: ## %cond.load34
4126 ; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm1
4127 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
4128 ; SSE42-NEXT: je LBB23_28
4129 ; SSE42-NEXT: LBB23_27: ## %cond.load37
4130 ; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm1
4131 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
4132 ; SSE42-NEXT: je LBB23_30
4133 ; SSE42-NEXT: LBB23_29: ## %cond.load40
4134 ; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm1
4135 ; SSE42-NEXT: testl $32768, %eax ## imm = 0x8000
4136 ; SSE42-NEXT: je LBB23_32
4137 ; SSE42-NEXT: LBB23_31: ## %cond.load43
4138 ; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm1
4139 ; SSE42-NEXT: movdqa %xmm1, %xmm0
4142 ; AVX1OR2-LABEL: load_v16i8_v16i8:
4143 ; AVX1OR2: ## %bb.0:
4144 ; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
4145 ; AVX1OR2-NEXT: testb $1, %al
4146 ; AVX1OR2-NEXT: jne LBB23_1
4147 ; AVX1OR2-NEXT: ## %bb.2: ## %else
4148 ; AVX1OR2-NEXT: testb $2, %al
4149 ; AVX1OR2-NEXT: jne LBB23_3
4150 ; AVX1OR2-NEXT: LBB23_4: ## %else2
4151 ; AVX1OR2-NEXT: testb $4, %al
4152 ; AVX1OR2-NEXT: jne LBB23_5
4153 ; AVX1OR2-NEXT: LBB23_6: ## %else5
4154 ; AVX1OR2-NEXT: testb $8, %al
4155 ; AVX1OR2-NEXT: jne LBB23_7
4156 ; AVX1OR2-NEXT: LBB23_8: ## %else8
4157 ; AVX1OR2-NEXT: testb $16, %al
4158 ; AVX1OR2-NEXT: jne LBB23_9
4159 ; AVX1OR2-NEXT: LBB23_10: ## %else11
4160 ; AVX1OR2-NEXT: testb $32, %al
4161 ; AVX1OR2-NEXT: jne LBB23_11
4162 ; AVX1OR2-NEXT: LBB23_12: ## %else14
4163 ; AVX1OR2-NEXT: testb $64, %al
4164 ; AVX1OR2-NEXT: jne LBB23_13
4165 ; AVX1OR2-NEXT: LBB23_14: ## %else17
4166 ; AVX1OR2-NEXT: testb %al, %al
4167 ; AVX1OR2-NEXT: js LBB23_15
4168 ; AVX1OR2-NEXT: LBB23_16: ## %else20
4169 ; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
4170 ; AVX1OR2-NEXT: jne LBB23_17
4171 ; AVX1OR2-NEXT: LBB23_18: ## %else23
4172 ; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
4173 ; AVX1OR2-NEXT: jne LBB23_19
4174 ; AVX1OR2-NEXT: LBB23_20: ## %else26
4175 ; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
4176 ; AVX1OR2-NEXT: jne LBB23_21
4177 ; AVX1OR2-NEXT: LBB23_22: ## %else29
4178 ; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
4179 ; AVX1OR2-NEXT: jne LBB23_23
4180 ; AVX1OR2-NEXT: LBB23_24: ## %else32
4181 ; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
4182 ; AVX1OR2-NEXT: jne LBB23_25
4183 ; AVX1OR2-NEXT: LBB23_26: ## %else35
4184 ; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
4185 ; AVX1OR2-NEXT: jne LBB23_27
4186 ; AVX1OR2-NEXT: LBB23_28: ## %else38
4187 ; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
4188 ; AVX1OR2-NEXT: jne LBB23_29
4189 ; AVX1OR2-NEXT: LBB23_30: ## %else41
4190 ; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
4191 ; AVX1OR2-NEXT: jne LBB23_31
4192 ; AVX1OR2-NEXT: LBB23_32: ## %else44
4193 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
4194 ; AVX1OR2-NEXT: retq
4195 ; AVX1OR2-NEXT: LBB23_1: ## %cond.load
4196 ; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
4197 ; AVX1OR2-NEXT: testb $2, %al
4198 ; AVX1OR2-NEXT: je LBB23_4
4199 ; AVX1OR2-NEXT: LBB23_3: ## %cond.load1
4200 ; AVX1OR2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
4201 ; AVX1OR2-NEXT: testb $4, %al
4202 ; AVX1OR2-NEXT: je LBB23_6
4203 ; AVX1OR2-NEXT: LBB23_5: ## %cond.load4
4204 ; AVX1OR2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
4205 ; AVX1OR2-NEXT: testb $8, %al
4206 ; AVX1OR2-NEXT: je LBB23_8
4207 ; AVX1OR2-NEXT: LBB23_7: ## %cond.load7
4208 ; AVX1OR2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
4209 ; AVX1OR2-NEXT: testb $16, %al
4210 ; AVX1OR2-NEXT: je LBB23_10
4211 ; AVX1OR2-NEXT: LBB23_9: ## %cond.load10
4212 ; AVX1OR2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
4213 ; AVX1OR2-NEXT: testb $32, %al
4214 ; AVX1OR2-NEXT: je LBB23_12
4215 ; AVX1OR2-NEXT: LBB23_11: ## %cond.load13
4216 ; AVX1OR2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
4217 ; AVX1OR2-NEXT: testb $64, %al
4218 ; AVX1OR2-NEXT: je LBB23_14
4219 ; AVX1OR2-NEXT: LBB23_13: ## %cond.load16
4220 ; AVX1OR2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
4221 ; AVX1OR2-NEXT: testb %al, %al
4222 ; AVX1OR2-NEXT: jns LBB23_16
4223 ; AVX1OR2-NEXT: LBB23_15: ## %cond.load19
4224 ; AVX1OR2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
4225 ; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
4226 ; AVX1OR2-NEXT: je LBB23_18
4227 ; AVX1OR2-NEXT: LBB23_17: ## %cond.load22
4228 ; AVX1OR2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
4229 ; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
4230 ; AVX1OR2-NEXT: je LBB23_20
4231 ; AVX1OR2-NEXT: LBB23_19: ## %cond.load25
4232 ; AVX1OR2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
4233 ; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
4234 ; AVX1OR2-NEXT: je LBB23_22
4235 ; AVX1OR2-NEXT: LBB23_21: ## %cond.load28
4236 ; AVX1OR2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
4237 ; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
4238 ; AVX1OR2-NEXT: je LBB23_24
4239 ; AVX1OR2-NEXT: LBB23_23: ## %cond.load31
4240 ; AVX1OR2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
4241 ; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
4242 ; AVX1OR2-NEXT: je LBB23_26
4243 ; AVX1OR2-NEXT: LBB23_25: ## %cond.load34
4244 ; AVX1OR2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
4245 ; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
4246 ; AVX1OR2-NEXT: je LBB23_28
4247 ; AVX1OR2-NEXT: LBB23_27: ## %cond.load37
4248 ; AVX1OR2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
4249 ; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
4250 ; AVX1OR2-NEXT: je LBB23_30
4251 ; AVX1OR2-NEXT: LBB23_29: ## %cond.load40
4252 ; AVX1OR2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
4253 ; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
4254 ; AVX1OR2-NEXT: je LBB23_32
4255 ; AVX1OR2-NEXT: LBB23_31: ## %cond.load43
4256 ; AVX1OR2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
4257 ; AVX1OR2-NEXT: vmovdqa %xmm1, %xmm0
4258 ; AVX1OR2-NEXT: retq
4260 ; AVX512F-LABEL: load_v16i8_v16i8:
4261 ; AVX512F: ## %bb.0:
4262 ; AVX512F-NEXT: vpmovmskb %xmm0, %eax
4263 ; AVX512F-NEXT: testb $1, %al
4264 ; AVX512F-NEXT: jne LBB23_1
4265 ; AVX512F-NEXT: ## %bb.2: ## %else
4266 ; AVX512F-NEXT: testb $2, %al
4267 ; AVX512F-NEXT: jne LBB23_3
4268 ; AVX512F-NEXT: LBB23_4: ## %else2
4269 ; AVX512F-NEXT: testb $4, %al
4270 ; AVX512F-NEXT: jne LBB23_5
4271 ; AVX512F-NEXT: LBB23_6: ## %else5
4272 ; AVX512F-NEXT: testb $8, %al
4273 ; AVX512F-NEXT: jne LBB23_7
4274 ; AVX512F-NEXT: LBB23_8: ## %else8
4275 ; AVX512F-NEXT: testb $16, %al
4276 ; AVX512F-NEXT: jne LBB23_9
4277 ; AVX512F-NEXT: LBB23_10: ## %else11
4278 ; AVX512F-NEXT: testb $32, %al
4279 ; AVX512F-NEXT: jne LBB23_11
4280 ; AVX512F-NEXT: LBB23_12: ## %else14
4281 ; AVX512F-NEXT: testb $64, %al
4282 ; AVX512F-NEXT: jne LBB23_13
4283 ; AVX512F-NEXT: LBB23_14: ## %else17
4284 ; AVX512F-NEXT: testb %al, %al
4285 ; AVX512F-NEXT: js LBB23_15
4286 ; AVX512F-NEXT: LBB23_16: ## %else20
4287 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
4288 ; AVX512F-NEXT: jne LBB23_17
4289 ; AVX512F-NEXT: LBB23_18: ## %else23
4290 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
4291 ; AVX512F-NEXT: jne LBB23_19
4292 ; AVX512F-NEXT: LBB23_20: ## %else26
4293 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
4294 ; AVX512F-NEXT: jne LBB23_21
4295 ; AVX512F-NEXT: LBB23_22: ## %else29
4296 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
4297 ; AVX512F-NEXT: jne LBB23_23
4298 ; AVX512F-NEXT: LBB23_24: ## %else32
4299 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
4300 ; AVX512F-NEXT: jne LBB23_25
4301 ; AVX512F-NEXT: LBB23_26: ## %else35
4302 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
4303 ; AVX512F-NEXT: jne LBB23_27
4304 ; AVX512F-NEXT: LBB23_28: ## %else38
4305 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
4306 ; AVX512F-NEXT: jne LBB23_29
4307 ; AVX512F-NEXT: LBB23_30: ## %else41
4308 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
4309 ; AVX512F-NEXT: jne LBB23_31
4310 ; AVX512F-NEXT: LBB23_32: ## %else44
4311 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
4312 ; AVX512F-NEXT: retq
4313 ; AVX512F-NEXT: LBB23_1: ## %cond.load
4314 ; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
4315 ; AVX512F-NEXT: testb $2, %al
4316 ; AVX512F-NEXT: je LBB23_4
4317 ; AVX512F-NEXT: LBB23_3: ## %cond.load1
4318 ; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
4319 ; AVX512F-NEXT: testb $4, %al
4320 ; AVX512F-NEXT: je LBB23_6
4321 ; AVX512F-NEXT: LBB23_5: ## %cond.load4
4322 ; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
4323 ; AVX512F-NEXT: testb $8, %al
4324 ; AVX512F-NEXT: je LBB23_8
4325 ; AVX512F-NEXT: LBB23_7: ## %cond.load7
4326 ; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
4327 ; AVX512F-NEXT: testb $16, %al
4328 ; AVX512F-NEXT: je LBB23_10
4329 ; AVX512F-NEXT: LBB23_9: ## %cond.load10
4330 ; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
4331 ; AVX512F-NEXT: testb $32, %al
4332 ; AVX512F-NEXT: je LBB23_12
4333 ; AVX512F-NEXT: LBB23_11: ## %cond.load13
4334 ; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
4335 ; AVX512F-NEXT: testb $64, %al
4336 ; AVX512F-NEXT: je LBB23_14
4337 ; AVX512F-NEXT: LBB23_13: ## %cond.load16
4338 ; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
4339 ; AVX512F-NEXT: testb %al, %al
4340 ; AVX512F-NEXT: jns LBB23_16
4341 ; AVX512F-NEXT: LBB23_15: ## %cond.load19
4342 ; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
4343 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
4344 ; AVX512F-NEXT: je LBB23_18
4345 ; AVX512F-NEXT: LBB23_17: ## %cond.load22
4346 ; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
4347 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
4348 ; AVX512F-NEXT: je LBB23_20
4349 ; AVX512F-NEXT: LBB23_19: ## %cond.load25
4350 ; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
4351 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
4352 ; AVX512F-NEXT: je LBB23_22
4353 ; AVX512F-NEXT: LBB23_21: ## %cond.load28
4354 ; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
4355 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
4356 ; AVX512F-NEXT: je LBB23_24
4357 ; AVX512F-NEXT: LBB23_23: ## %cond.load31
4358 ; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
4359 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
4360 ; AVX512F-NEXT: je LBB23_26
4361 ; AVX512F-NEXT: LBB23_25: ## %cond.load34
4362 ; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
4363 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
4364 ; AVX512F-NEXT: je LBB23_28
4365 ; AVX512F-NEXT: LBB23_27: ## %cond.load37
4366 ; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
4367 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
4368 ; AVX512F-NEXT: je LBB23_30
4369 ; AVX512F-NEXT: LBB23_29: ## %cond.load40
4370 ; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
4371 ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000
4372 ; AVX512F-NEXT: je LBB23_32
4373 ; AVX512F-NEXT: LBB23_31: ## %cond.load43
4374 ; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
4375 ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
4376 ; AVX512F-NEXT: retq
4378 ; AVX512VLDQ-LABEL: load_v16i8_v16i8:
4379 ; AVX512VLDQ: ## %bb.0:
4380 ; AVX512VLDQ-NEXT: vpmovmskb %xmm0, %eax
4381 ; AVX512VLDQ-NEXT: testb $1, %al
4382 ; AVX512VLDQ-NEXT: jne LBB23_1
4383 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
4384 ; AVX512VLDQ-NEXT: testb $2, %al
4385 ; AVX512VLDQ-NEXT: jne LBB23_3
4386 ; AVX512VLDQ-NEXT: LBB23_4: ## %else2
4387 ; AVX512VLDQ-NEXT: testb $4, %al
4388 ; AVX512VLDQ-NEXT: jne LBB23_5
4389 ; AVX512VLDQ-NEXT: LBB23_6: ## %else5
4390 ; AVX512VLDQ-NEXT: testb $8, %al
4391 ; AVX512VLDQ-NEXT: jne LBB23_7
4392 ; AVX512VLDQ-NEXT: LBB23_8: ## %else8
4393 ; AVX512VLDQ-NEXT: testb $16, %al
4394 ; AVX512VLDQ-NEXT: jne LBB23_9
4395 ; AVX512VLDQ-NEXT: LBB23_10: ## %else11
4396 ; AVX512VLDQ-NEXT: testb $32, %al
4397 ; AVX512VLDQ-NEXT: jne LBB23_11
4398 ; AVX512VLDQ-NEXT: LBB23_12: ## %else14
4399 ; AVX512VLDQ-NEXT: testb $64, %al
4400 ; AVX512VLDQ-NEXT: jne LBB23_13
4401 ; AVX512VLDQ-NEXT: LBB23_14: ## %else17
4402 ; AVX512VLDQ-NEXT: testb %al, %al
4403 ; AVX512VLDQ-NEXT: js LBB23_15
4404 ; AVX512VLDQ-NEXT: LBB23_16: ## %else20
4405 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
4406 ; AVX512VLDQ-NEXT: jne LBB23_17
4407 ; AVX512VLDQ-NEXT: LBB23_18: ## %else23
4408 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
4409 ; AVX512VLDQ-NEXT: jne LBB23_19
4410 ; AVX512VLDQ-NEXT: LBB23_20: ## %else26
4411 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
4412 ; AVX512VLDQ-NEXT: jne LBB23_21
4413 ; AVX512VLDQ-NEXT: LBB23_22: ## %else29
4414 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
4415 ; AVX512VLDQ-NEXT: jne LBB23_23
4416 ; AVX512VLDQ-NEXT: LBB23_24: ## %else32
4417 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
4418 ; AVX512VLDQ-NEXT: jne LBB23_25
4419 ; AVX512VLDQ-NEXT: LBB23_26: ## %else35
4420 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
4421 ; AVX512VLDQ-NEXT: jne LBB23_27
4422 ; AVX512VLDQ-NEXT: LBB23_28: ## %else38
4423 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
4424 ; AVX512VLDQ-NEXT: jne LBB23_29
4425 ; AVX512VLDQ-NEXT: LBB23_30: ## %else41
4426 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
4427 ; AVX512VLDQ-NEXT: jne LBB23_31
4428 ; AVX512VLDQ-NEXT: LBB23_32: ## %else44
4429 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
4430 ; AVX512VLDQ-NEXT: retq
4431 ; AVX512VLDQ-NEXT: LBB23_1: ## %cond.load
4432 ; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1
4433 ; AVX512VLDQ-NEXT: testb $2, %al
4434 ; AVX512VLDQ-NEXT: je LBB23_4
4435 ; AVX512VLDQ-NEXT: LBB23_3: ## %cond.load1
4436 ; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
4437 ; AVX512VLDQ-NEXT: testb $4, %al
4438 ; AVX512VLDQ-NEXT: je LBB23_6
4439 ; AVX512VLDQ-NEXT: LBB23_5: ## %cond.load4
4440 ; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
4441 ; AVX512VLDQ-NEXT: testb $8, %al
4442 ; AVX512VLDQ-NEXT: je LBB23_8
4443 ; AVX512VLDQ-NEXT: LBB23_7: ## %cond.load7
4444 ; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
4445 ; AVX512VLDQ-NEXT: testb $16, %al
4446 ; AVX512VLDQ-NEXT: je LBB23_10
4447 ; AVX512VLDQ-NEXT: LBB23_9: ## %cond.load10
4448 ; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
4449 ; AVX512VLDQ-NEXT: testb $32, %al
4450 ; AVX512VLDQ-NEXT: je LBB23_12
4451 ; AVX512VLDQ-NEXT: LBB23_11: ## %cond.load13
4452 ; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
4453 ; AVX512VLDQ-NEXT: testb $64, %al
4454 ; AVX512VLDQ-NEXT: je LBB23_14
4455 ; AVX512VLDQ-NEXT: LBB23_13: ## %cond.load16
4456 ; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
4457 ; AVX512VLDQ-NEXT: testb %al, %al
4458 ; AVX512VLDQ-NEXT: jns LBB23_16
4459 ; AVX512VLDQ-NEXT: LBB23_15: ## %cond.load19
4460 ; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
4461 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
4462 ; AVX512VLDQ-NEXT: je LBB23_18
4463 ; AVX512VLDQ-NEXT: LBB23_17: ## %cond.load22
4464 ; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
4465 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
4466 ; AVX512VLDQ-NEXT: je LBB23_20
4467 ; AVX512VLDQ-NEXT: LBB23_19: ## %cond.load25
4468 ; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
4469 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
4470 ; AVX512VLDQ-NEXT: je LBB23_22
4471 ; AVX512VLDQ-NEXT: LBB23_21: ## %cond.load28
4472 ; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
4473 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
4474 ; AVX512VLDQ-NEXT: je LBB23_24
4475 ; AVX512VLDQ-NEXT: LBB23_23: ## %cond.load31
4476 ; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
4477 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
4478 ; AVX512VLDQ-NEXT: je LBB23_26
4479 ; AVX512VLDQ-NEXT: LBB23_25: ## %cond.load34
4480 ; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
4481 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
4482 ; AVX512VLDQ-NEXT: je LBB23_28
4483 ; AVX512VLDQ-NEXT: LBB23_27: ## %cond.load37
4484 ; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
4485 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
4486 ; AVX512VLDQ-NEXT: je LBB23_30
4487 ; AVX512VLDQ-NEXT: LBB23_29: ## %cond.load40
4488 ; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
4489 ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000
4490 ; AVX512VLDQ-NEXT: je LBB23_32
4491 ; AVX512VLDQ-NEXT: LBB23_31: ## %cond.load43
4492 ; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
4493 ; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0
4494 ; AVX512VLDQ-NEXT: retq
4496 ; AVX512VLBW-LABEL: load_v16i8_v16i8:
4497 ; AVX512VLBW: ## %bb.0:
4498 ; AVX512VLBW-NEXT: vpmovb2m %xmm0, %k1
4499 ; AVX512VLBW-NEXT: vpblendmb (%rdi), %xmm1, %xmm0 {%k1}
4500 ; AVX512VLBW-NEXT: retq
4502 ; X86-AVX512-LABEL: load_v16i8_v16i8:
4503 ; X86-AVX512: ## %bb.0:
4504 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
4505 ; X86-AVX512-NEXT: vpmovb2m %xmm0, %k1
4506 ; X86-AVX512-NEXT: vpblendmb (%eax), %xmm1, %xmm0 {%k1}
4507 ; X86-AVX512-NEXT: retl
4508 %mask = icmp slt <16 x i8> %trigger, zeroinitializer
4509 %res = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x i8> %dst)
4513 define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %dst) {
4514 ; SSE2-LABEL: load_v32i8_v32i8:
4516 ; SSE2-NEXT: pmovmskb %xmm0, %ecx
4517 ; SSE2-NEXT: pmovmskb %xmm1, %eax
4518 ; SSE2-NEXT: shll $16, %eax
4519 ; SSE2-NEXT: orl %ecx, %eax
4520 ; SSE2-NEXT: testb $1, %al
4521 ; SSE2-NEXT: jne LBB24_1
4522 ; SSE2-NEXT: ## %bb.2: ## %else
4523 ; SSE2-NEXT: testb $2, %al
4524 ; SSE2-NEXT: jne LBB24_3
4525 ; SSE2-NEXT: LBB24_4: ## %else2
4526 ; SSE2-NEXT: testb $4, %al
4527 ; SSE2-NEXT: jne LBB24_5
4528 ; SSE2-NEXT: LBB24_6: ## %else5
4529 ; SSE2-NEXT: testb $8, %al
4530 ; SSE2-NEXT: jne LBB24_7
4531 ; SSE2-NEXT: LBB24_8: ## %else8
4532 ; SSE2-NEXT: testb $16, %al
4533 ; SSE2-NEXT: jne LBB24_9
4534 ; SSE2-NEXT: LBB24_10: ## %else11
4535 ; SSE2-NEXT: testb $32, %al
4536 ; SSE2-NEXT: jne LBB24_11
4537 ; SSE2-NEXT: LBB24_12: ## %else14
4538 ; SSE2-NEXT: testb $64, %al
4539 ; SSE2-NEXT: jne LBB24_13
4540 ; SSE2-NEXT: LBB24_14: ## %else17
4541 ; SSE2-NEXT: testb %al, %al
4542 ; SSE2-NEXT: js LBB24_15
4543 ; SSE2-NEXT: LBB24_16: ## %else20
4544 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
4545 ; SSE2-NEXT: jne LBB24_17
4546 ; SSE2-NEXT: LBB24_18: ## %else23
4547 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
4548 ; SSE2-NEXT: jne LBB24_19
4549 ; SSE2-NEXT: LBB24_20: ## %else26
4550 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
4551 ; SSE2-NEXT: jne LBB24_21
4552 ; SSE2-NEXT: LBB24_22: ## %else29
4553 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
4554 ; SSE2-NEXT: jne LBB24_23
4555 ; SSE2-NEXT: LBB24_24: ## %else32
4556 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
4557 ; SSE2-NEXT: jne LBB24_25
4558 ; SSE2-NEXT: LBB24_26: ## %else35
4559 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
4560 ; SSE2-NEXT: jne LBB24_27
4561 ; SSE2-NEXT: LBB24_28: ## %else38
4562 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
4563 ; SSE2-NEXT: jne LBB24_29
4564 ; SSE2-NEXT: LBB24_30: ## %else41
4565 ; SSE2-NEXT: testw %ax, %ax
4566 ; SSE2-NEXT: js LBB24_31
4567 ; SSE2-NEXT: LBB24_32: ## %else44
4568 ; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
4569 ; SSE2-NEXT: jne LBB24_33
4570 ; SSE2-NEXT: LBB24_34: ## %else47
4571 ; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
4572 ; SSE2-NEXT: jne LBB24_35
4573 ; SSE2-NEXT: LBB24_36: ## %else50
4574 ; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
4575 ; SSE2-NEXT: jne LBB24_37
4576 ; SSE2-NEXT: LBB24_38: ## %else53
4577 ; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
4578 ; SSE2-NEXT: jne LBB24_39
4579 ; SSE2-NEXT: LBB24_40: ## %else56
4580 ; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
4581 ; SSE2-NEXT: jne LBB24_41
4582 ; SSE2-NEXT: LBB24_42: ## %else59
4583 ; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
4584 ; SSE2-NEXT: jne LBB24_43
4585 ; SSE2-NEXT: LBB24_44: ## %else62
4586 ; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
4587 ; SSE2-NEXT: jne LBB24_45
4588 ; SSE2-NEXT: LBB24_46: ## %else65
4589 ; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
4590 ; SSE2-NEXT: jne LBB24_47
4591 ; SSE2-NEXT: LBB24_48: ## %else68
4592 ; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
4593 ; SSE2-NEXT: jne LBB24_49
4594 ; SSE2-NEXT: LBB24_50: ## %else71
4595 ; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
4596 ; SSE2-NEXT: jne LBB24_51
4597 ; SSE2-NEXT: LBB24_52: ## %else74
4598 ; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
4599 ; SSE2-NEXT: jne LBB24_53
4600 ; SSE2-NEXT: LBB24_54: ## %else77
4601 ; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
4602 ; SSE2-NEXT: jne LBB24_55
4603 ; SSE2-NEXT: LBB24_56: ## %else80
4604 ; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
4605 ; SSE2-NEXT: jne LBB24_57
4606 ; SSE2-NEXT: LBB24_58: ## %else83
4607 ; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
4608 ; SSE2-NEXT: jne LBB24_59
4609 ; SSE2-NEXT: LBB24_60: ## %else86
4610 ; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
4611 ; SSE2-NEXT: jne LBB24_61
4612 ; SSE2-NEXT: LBB24_62: ## %else89
4613 ; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
4614 ; SSE2-NEXT: je LBB24_64
4615 ; SSE2-NEXT: LBB24_63: ## %cond.load91
4616 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
4617 ; SSE2-NEXT: movzbl 31(%rdi), %eax
4618 ; SSE2-NEXT: movd %eax, %xmm0
4619 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4620 ; SSE2-NEXT: por %xmm0, %xmm3
4621 ; SSE2-NEXT: LBB24_64: ## %else92
4622 ; SSE2-NEXT: movdqa %xmm2, %xmm0
4623 ; SSE2-NEXT: movdqa %xmm3, %xmm1
4625 ; SSE2-NEXT: LBB24_1: ## %cond.load
4626 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4627 ; SSE2-NEXT: pand %xmm0, %xmm2
4628 ; SSE2-NEXT: movzbl (%rdi), %ecx
4629 ; SSE2-NEXT: movd %ecx, %xmm1
4630 ; SSE2-NEXT: pandn %xmm1, %xmm0
4631 ; SSE2-NEXT: por %xmm0, %xmm2
4632 ; SSE2-NEXT: testb $2, %al
4633 ; SSE2-NEXT: je LBB24_4
4634 ; SSE2-NEXT: LBB24_3: ## %cond.load1
4635 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4636 ; SSE2-NEXT: pand %xmm0, %xmm2
4637 ; SSE2-NEXT: movzbl 1(%rdi), %ecx
4638 ; SSE2-NEXT: movd %ecx, %xmm1
4639 ; SSE2-NEXT: psllw $8, %xmm1
4640 ; SSE2-NEXT: pandn %xmm1, %xmm0
4641 ; SSE2-NEXT: por %xmm0, %xmm2
4642 ; SSE2-NEXT: testb $4, %al
4643 ; SSE2-NEXT: je LBB24_6
4644 ; SSE2-NEXT: LBB24_5: ## %cond.load4
4645 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
4646 ; SSE2-NEXT: pand %xmm0, %xmm2
4647 ; SSE2-NEXT: movzbl 2(%rdi), %ecx
4648 ; SSE2-NEXT: movd %ecx, %xmm1
4649 ; SSE2-NEXT: pslld $16, %xmm1
4650 ; SSE2-NEXT: pandn %xmm1, %xmm0
4651 ; SSE2-NEXT: por %xmm0, %xmm2
4652 ; SSE2-NEXT: testb $8, %al
4653 ; SSE2-NEXT: je LBB24_8
4654 ; SSE2-NEXT: LBB24_7: ## %cond.load7
4655 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
4656 ; SSE2-NEXT: pand %xmm0, %xmm2
4657 ; SSE2-NEXT: movzbl 3(%rdi), %ecx
4658 ; SSE2-NEXT: movd %ecx, %xmm1
4659 ; SSE2-NEXT: pslld $24, %xmm1
4660 ; SSE2-NEXT: pandn %xmm1, %xmm0
4661 ; SSE2-NEXT: por %xmm0, %xmm2
4662 ; SSE2-NEXT: testb $16, %al
4663 ; SSE2-NEXT: je LBB24_10
4664 ; SSE2-NEXT: LBB24_9: ## %cond.load10
4665 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
4666 ; SSE2-NEXT: pand %xmm0, %xmm2
4667 ; SSE2-NEXT: movzbl 4(%rdi), %ecx
4668 ; SSE2-NEXT: movd %ecx, %xmm1
4669 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4670 ; SSE2-NEXT: pandn %xmm1, %xmm0
4671 ; SSE2-NEXT: por %xmm0, %xmm2
4672 ; SSE2-NEXT: testb $32, %al
4673 ; SSE2-NEXT: je LBB24_12
4674 ; SSE2-NEXT: LBB24_11: ## %cond.load13
4675 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
4676 ; SSE2-NEXT: pand %xmm0, %xmm2
4677 ; SSE2-NEXT: movzbl 5(%rdi), %ecx
4678 ; SSE2-NEXT: movd %ecx, %xmm1
4679 ; SSE2-NEXT: psllq $40, %xmm1
4680 ; SSE2-NEXT: pandn %xmm1, %xmm0
4681 ; SSE2-NEXT: por %xmm0, %xmm2
4682 ; SSE2-NEXT: testb $64, %al
4683 ; SSE2-NEXT: je LBB24_14
4684 ; SSE2-NEXT: LBB24_13: ## %cond.load16
4685 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
4686 ; SSE2-NEXT: pand %xmm0, %xmm2
4687 ; SSE2-NEXT: movzbl 6(%rdi), %ecx
4688 ; SSE2-NEXT: movd %ecx, %xmm1
4689 ; SSE2-NEXT: psllq $48, %xmm1
4690 ; SSE2-NEXT: pandn %xmm1, %xmm0
4691 ; SSE2-NEXT: por %xmm0, %xmm2
4692 ; SSE2-NEXT: testb %al, %al
4693 ; SSE2-NEXT: jns LBB24_16
4694 ; SSE2-NEXT: LBB24_15: ## %cond.load19
4695 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
4696 ; SSE2-NEXT: pand %xmm0, %xmm2
4697 ; SSE2-NEXT: movzbl 7(%rdi), %ecx
4698 ; SSE2-NEXT: movd %ecx, %xmm1
4699 ; SSE2-NEXT: psllq $56, %xmm1
4700 ; SSE2-NEXT: pandn %xmm1, %xmm0
4701 ; SSE2-NEXT: por %xmm0, %xmm2
4702 ; SSE2-NEXT: testl $256, %eax ## imm = 0x100
4703 ; SSE2-NEXT: je LBB24_18
4704 ; SSE2-NEXT: LBB24_17: ## %cond.load22
4705 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
4706 ; SSE2-NEXT: pand %xmm0, %xmm2
4707 ; SSE2-NEXT: movzbl 8(%rdi), %ecx
4708 ; SSE2-NEXT: movd %ecx, %xmm1
4709 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4710 ; SSE2-NEXT: pandn %xmm1, %xmm0
4711 ; SSE2-NEXT: por %xmm0, %xmm2
4712 ; SSE2-NEXT: testl $512, %eax ## imm = 0x200
4713 ; SSE2-NEXT: je LBB24_20
4714 ; SSE2-NEXT: LBB24_19: ## %cond.load25
4715 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
4716 ; SSE2-NEXT: pand %xmm0, %xmm2
4717 ; SSE2-NEXT: movzbl 9(%rdi), %ecx
4718 ; SSE2-NEXT: movd %ecx, %xmm1
4719 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
4720 ; SSE2-NEXT: pandn %xmm1, %xmm0
4721 ; SSE2-NEXT: por %xmm0, %xmm2
4722 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
4723 ; SSE2-NEXT: je LBB24_22
4724 ; SSE2-NEXT: LBB24_21: ## %cond.load28
4725 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
4726 ; SSE2-NEXT: pand %xmm0, %xmm2
4727 ; SSE2-NEXT: movzbl 10(%rdi), %ecx
4728 ; SSE2-NEXT: movd %ecx, %xmm1
4729 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4730 ; SSE2-NEXT: pandn %xmm1, %xmm0
4731 ; SSE2-NEXT: por %xmm0, %xmm2
4732 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
4733 ; SSE2-NEXT: je LBB24_24
4734 ; SSE2-NEXT: LBB24_23: ## %cond.load31
4735 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
4736 ; SSE2-NEXT: pand %xmm0, %xmm2
4737 ; SSE2-NEXT: movzbl 11(%rdi), %ecx
4738 ; SSE2-NEXT: movd %ecx, %xmm1
4739 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
4740 ; SSE2-NEXT: pandn %xmm1, %xmm0
4741 ; SSE2-NEXT: por %xmm0, %xmm2
4742 ; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000
4743 ; SSE2-NEXT: je LBB24_26
4744 ; SSE2-NEXT: LBB24_25: ## %cond.load34
4745 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
4746 ; SSE2-NEXT: pand %xmm0, %xmm2
4747 ; SSE2-NEXT: movzbl 12(%rdi), %ecx
4748 ; SSE2-NEXT: movd %ecx, %xmm1
4749 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4750 ; SSE2-NEXT: pandn %xmm1, %xmm0
4751 ; SSE2-NEXT: por %xmm0, %xmm2
4752 ; SSE2-NEXT: testl $8192, %eax ## imm = 0x2000
4753 ; SSE2-NEXT: je LBB24_28
4754 ; SSE2-NEXT: LBB24_27: ## %cond.load37
4755 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
4756 ; SSE2-NEXT: pand %xmm0, %xmm2
4757 ; SSE2-NEXT: movzbl 13(%rdi), %ecx
4758 ; SSE2-NEXT: movd %ecx, %xmm1
4759 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
4760 ; SSE2-NEXT: pandn %xmm1, %xmm0
4761 ; SSE2-NEXT: por %xmm0, %xmm2
4762 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
4763 ; SSE2-NEXT: je LBB24_30
4764 ; SSE2-NEXT: LBB24_29: ## %cond.load40
4765 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4766 ; SSE2-NEXT: pand %xmm0, %xmm2
4767 ; SSE2-NEXT: movzbl 14(%rdi), %ecx
4768 ; SSE2-NEXT: movd %ecx, %xmm1
4769 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
4770 ; SSE2-NEXT: pandn %xmm1, %xmm0
4771 ; SSE2-NEXT: por %xmm0, %xmm2
4772 ; SSE2-NEXT: testw %ax, %ax
4773 ; SSE2-NEXT: jns LBB24_32
4774 ; SSE2-NEXT: LBB24_31: ## %cond.load43
4775 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
4776 ; SSE2-NEXT: movzbl 15(%rdi), %ecx
4777 ; SSE2-NEXT: movd %ecx, %xmm0
4778 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
4779 ; SSE2-NEXT: por %xmm0, %xmm2
4780 ; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000
4781 ; SSE2-NEXT: je LBB24_34
4782 ; SSE2-NEXT: LBB24_33: ## %cond.load46
4783 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4784 ; SSE2-NEXT: pand %xmm0, %xmm3
4785 ; SSE2-NEXT: movzbl 16(%rdi), %ecx
4786 ; SSE2-NEXT: movd %ecx, %xmm1
4787 ; SSE2-NEXT: pandn %xmm1, %xmm0
4788 ; SSE2-NEXT: por %xmm0, %xmm3
4789 ; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000
4790 ; SSE2-NEXT: je LBB24_36
4791 ; SSE2-NEXT: LBB24_35: ## %cond.load49
4792 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4793 ; SSE2-NEXT: pand %xmm0, %xmm3
4794 ; SSE2-NEXT: movzbl 17(%rdi), %ecx
4795 ; SSE2-NEXT: movd %ecx, %xmm1
4796 ; SSE2-NEXT: psllw $8, %xmm1
4797 ; SSE2-NEXT: pandn %xmm1, %xmm0
4798 ; SSE2-NEXT: por %xmm0, %xmm3
4799 ; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
4800 ; SSE2-NEXT: je LBB24_38
4801 ; SSE2-NEXT: LBB24_37: ## %cond.load52
4802 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
4803 ; SSE2-NEXT: pand %xmm0, %xmm3
4804 ; SSE2-NEXT: movzbl 18(%rdi), %ecx
4805 ; SSE2-NEXT: movd %ecx, %xmm1
4806 ; SSE2-NEXT: pslld $16, %xmm1
4807 ; SSE2-NEXT: pandn %xmm1, %xmm0
4808 ; SSE2-NEXT: por %xmm0, %xmm3
4809 ; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
4810 ; SSE2-NEXT: je LBB24_40
4811 ; SSE2-NEXT: LBB24_39: ## %cond.load55
4812 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
4813 ; SSE2-NEXT: pand %xmm0, %xmm3
4814 ; SSE2-NEXT: movzbl 19(%rdi), %ecx
4815 ; SSE2-NEXT: movd %ecx, %xmm1
4816 ; SSE2-NEXT: pslld $24, %xmm1
4817 ; SSE2-NEXT: pandn %xmm1, %xmm0
4818 ; SSE2-NEXT: por %xmm0, %xmm3
4819 ; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000
4820 ; SSE2-NEXT: je LBB24_42
4821 ; SSE2-NEXT: LBB24_41: ## %cond.load58
4822 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
4823 ; SSE2-NEXT: pand %xmm0, %xmm3
4824 ; SSE2-NEXT: movzbl 20(%rdi), %ecx
4825 ; SSE2-NEXT: movd %ecx, %xmm1
4826 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4827 ; SSE2-NEXT: pandn %xmm1, %xmm0
4828 ; SSE2-NEXT: por %xmm0, %xmm3
4829 ; SSE2-NEXT: testl $2097152, %eax ## imm = 0x200000
4830 ; SSE2-NEXT: je LBB24_44
4831 ; SSE2-NEXT: LBB24_43: ## %cond.load61
4832 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
4833 ; SSE2-NEXT: pand %xmm0, %xmm3
4834 ; SSE2-NEXT: movzbl 21(%rdi), %ecx
4835 ; SSE2-NEXT: movd %ecx, %xmm1
4836 ; SSE2-NEXT: psllq $40, %xmm1
4837 ; SSE2-NEXT: pandn %xmm1, %xmm0
4838 ; SSE2-NEXT: por %xmm0, %xmm3
4839 ; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
4840 ; SSE2-NEXT: je LBB24_46
4841 ; SSE2-NEXT: LBB24_45: ## %cond.load64
4842 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
4843 ; SSE2-NEXT: pand %xmm0, %xmm3
4844 ; SSE2-NEXT: movzbl 22(%rdi), %ecx
4845 ; SSE2-NEXT: movd %ecx, %xmm1
4846 ; SSE2-NEXT: psllq $48, %xmm1
4847 ; SSE2-NEXT: pandn %xmm1, %xmm0
4848 ; SSE2-NEXT: por %xmm0, %xmm3
4849 ; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
4850 ; SSE2-NEXT: je LBB24_48
4851 ; SSE2-NEXT: LBB24_47: ## %cond.load67
4852 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
4853 ; SSE2-NEXT: pand %xmm0, %xmm3
4854 ; SSE2-NEXT: movzbl 23(%rdi), %ecx
4855 ; SSE2-NEXT: movd %ecx, %xmm1
4856 ; SSE2-NEXT: psllq $56, %xmm1
4857 ; SSE2-NEXT: pandn %xmm1, %xmm0
4858 ; SSE2-NEXT: por %xmm0, %xmm3
4859 ; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000
4860 ; SSE2-NEXT: je LBB24_50
4861 ; SSE2-NEXT: LBB24_49: ## %cond.load70
4862 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
4863 ; SSE2-NEXT: pand %xmm0, %xmm3
4864 ; SSE2-NEXT: movzbl 24(%rdi), %ecx
4865 ; SSE2-NEXT: movd %ecx, %xmm1
4866 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4867 ; SSE2-NEXT: pandn %xmm1, %xmm0
4868 ; SSE2-NEXT: por %xmm0, %xmm3
4869 ; SSE2-NEXT: testl $33554432, %eax ## imm = 0x2000000
4870 ; SSE2-NEXT: je LBB24_52
4871 ; SSE2-NEXT: LBB24_51: ## %cond.load73
4872 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
4873 ; SSE2-NEXT: pand %xmm0, %xmm3
4874 ; SSE2-NEXT: movzbl 25(%rdi), %ecx
4875 ; SSE2-NEXT: movd %ecx, %xmm1
4876 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
4877 ; SSE2-NEXT: pandn %xmm1, %xmm0
4878 ; SSE2-NEXT: por %xmm0, %xmm3
4879 ; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
4880 ; SSE2-NEXT: je LBB24_54
4881 ; SSE2-NEXT: LBB24_53: ## %cond.load76
4882 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
4883 ; SSE2-NEXT: pand %xmm0, %xmm3
4884 ; SSE2-NEXT: movzbl 26(%rdi), %ecx
4885 ; SSE2-NEXT: movd %ecx, %xmm1
4886 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4887 ; SSE2-NEXT: pandn %xmm1, %xmm0
4888 ; SSE2-NEXT: por %xmm0, %xmm3
4889 ; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
4890 ; SSE2-NEXT: je LBB24_56
4891 ; SSE2-NEXT: LBB24_55: ## %cond.load79
4892 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
4893 ; SSE2-NEXT: pand %xmm0, %xmm3
4894 ; SSE2-NEXT: movzbl 27(%rdi), %ecx
4895 ; SSE2-NEXT: movd %ecx, %xmm1
4896 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
4897 ; SSE2-NEXT: pandn %xmm1, %xmm0
4898 ; SSE2-NEXT: por %xmm0, %xmm3
4899 ; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000
4900 ; SSE2-NEXT: je LBB24_58
4901 ; SSE2-NEXT: LBB24_57: ## %cond.load82
4902 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
4903 ; SSE2-NEXT: pand %xmm0, %xmm3
4904 ; SSE2-NEXT: movzbl 28(%rdi), %ecx
4905 ; SSE2-NEXT: movd %ecx, %xmm1
4906 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4907 ; SSE2-NEXT: pandn %xmm1, %xmm0
4908 ; SSE2-NEXT: por %xmm0, %xmm3
4909 ; SSE2-NEXT: testl $536870912, %eax ## imm = 0x20000000
4910 ; SSE2-NEXT: je LBB24_60
4911 ; SSE2-NEXT: LBB24_59: ## %cond.load85
4912 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
4913 ; SSE2-NEXT: pand %xmm0, %xmm3
4914 ; SSE2-NEXT: movzbl 29(%rdi), %ecx
4915 ; SSE2-NEXT: movd %ecx, %xmm1
4916 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
4917 ; SSE2-NEXT: pandn %xmm1, %xmm0
4918 ; SSE2-NEXT: por %xmm0, %xmm3
4919 ; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
4920 ; SSE2-NEXT: je LBB24_62
4921 ; SSE2-NEXT: LBB24_61: ## %cond.load88
4922 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
4923 ; SSE2-NEXT: pand %xmm0, %xmm3
4924 ; SSE2-NEXT: movzbl 30(%rdi), %ecx
4925 ; SSE2-NEXT: movd %ecx, %xmm1
4926 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
4927 ; SSE2-NEXT: pandn %xmm1, %xmm0
4928 ; SSE2-NEXT: por %xmm0, %xmm3
4929 ; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
4930 ; SSE2-NEXT: jne LBB24_63
4931 ; SSE2-NEXT: jmp LBB24_64
4933 ; SSE42-LABEL: load_v32i8_v32i8:
4935 ; SSE42-NEXT: pmovmskb %xmm0, %ecx
4936 ; SSE42-NEXT: pmovmskb %xmm1, %eax
4937 ; SSE42-NEXT: shll $16, %eax
4938 ; SSE42-NEXT: orl %ecx, %eax
4939 ; SSE42-NEXT: testb $1, %al
4940 ; SSE42-NEXT: jne LBB24_1
4941 ; SSE42-NEXT: ## %bb.2: ## %else
4942 ; SSE42-NEXT: testb $2, %al
4943 ; SSE42-NEXT: jne LBB24_3
4944 ; SSE42-NEXT: LBB24_4: ## %else2
4945 ; SSE42-NEXT: testb $4, %al
4946 ; SSE42-NEXT: jne LBB24_5
4947 ; SSE42-NEXT: LBB24_6: ## %else5
4948 ; SSE42-NEXT: testb $8, %al
4949 ; SSE42-NEXT: jne LBB24_7
4950 ; SSE42-NEXT: LBB24_8: ## %else8
4951 ; SSE42-NEXT: testb $16, %al
4952 ; SSE42-NEXT: jne LBB24_9
4953 ; SSE42-NEXT: LBB24_10: ## %else11
4954 ; SSE42-NEXT: testb $32, %al
4955 ; SSE42-NEXT: jne LBB24_11
4956 ; SSE42-NEXT: LBB24_12: ## %else14
4957 ; SSE42-NEXT: testb $64, %al
4958 ; SSE42-NEXT: jne LBB24_13
4959 ; SSE42-NEXT: LBB24_14: ## %else17
4960 ; SSE42-NEXT: testb %al, %al
4961 ; SSE42-NEXT: js LBB24_15
4962 ; SSE42-NEXT: LBB24_16: ## %else20
4963 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
4964 ; SSE42-NEXT: jne LBB24_17
4965 ; SSE42-NEXT: LBB24_18: ## %else23
4966 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
4967 ; SSE42-NEXT: jne LBB24_19
4968 ; SSE42-NEXT: LBB24_20: ## %else26
4969 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
4970 ; SSE42-NEXT: jne LBB24_21
4971 ; SSE42-NEXT: LBB24_22: ## %else29
4972 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
4973 ; SSE42-NEXT: jne LBB24_23
4974 ; SSE42-NEXT: LBB24_24: ## %else32
4975 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
4976 ; SSE42-NEXT: jne LBB24_25
4977 ; SSE42-NEXT: LBB24_26: ## %else35
4978 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
4979 ; SSE42-NEXT: jne LBB24_27
4980 ; SSE42-NEXT: LBB24_28: ## %else38
4981 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
4982 ; SSE42-NEXT: jne LBB24_29
4983 ; SSE42-NEXT: LBB24_30: ## %else41
4984 ; SSE42-NEXT: testw %ax, %ax
4985 ; SSE42-NEXT: js LBB24_31
4986 ; SSE42-NEXT: LBB24_32: ## %else44
4987 ; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000
4988 ; SSE42-NEXT: jne LBB24_33
4989 ; SSE42-NEXT: LBB24_34: ## %else47
4990 ; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000
4991 ; SSE42-NEXT: jne LBB24_35
4992 ; SSE42-NEXT: LBB24_36: ## %else50
4993 ; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000
4994 ; SSE42-NEXT: jne LBB24_37
4995 ; SSE42-NEXT: LBB24_38: ## %else53
4996 ; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000
4997 ; SSE42-NEXT: jne LBB24_39
4998 ; SSE42-NEXT: LBB24_40: ## %else56
4999 ; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000
5000 ; SSE42-NEXT: jne LBB24_41
5001 ; SSE42-NEXT: LBB24_42: ## %else59
5002 ; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000
5003 ; SSE42-NEXT: jne LBB24_43
5004 ; SSE42-NEXT: LBB24_44: ## %else62
5005 ; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000
5006 ; SSE42-NEXT: jne LBB24_45
5007 ; SSE42-NEXT: LBB24_46: ## %else65
5008 ; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000
5009 ; SSE42-NEXT: jne LBB24_47
5010 ; SSE42-NEXT: LBB24_48: ## %else68
5011 ; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000
5012 ; SSE42-NEXT: jne LBB24_49
5013 ; SSE42-NEXT: LBB24_50: ## %else71
5014 ; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000
5015 ; SSE42-NEXT: jne LBB24_51
5016 ; SSE42-NEXT: LBB24_52: ## %else74
5017 ; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000
5018 ; SSE42-NEXT: jne LBB24_53
5019 ; SSE42-NEXT: LBB24_54: ## %else77
5020 ; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000
5021 ; SSE42-NEXT: jne LBB24_55
5022 ; SSE42-NEXT: LBB24_56: ## %else80
5023 ; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000
5024 ; SSE42-NEXT: jne LBB24_57
5025 ; SSE42-NEXT: LBB24_58: ## %else83
5026 ; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000
5027 ; SSE42-NEXT: jne LBB24_59
5028 ; SSE42-NEXT: LBB24_60: ## %else86
5029 ; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5030 ; SSE42-NEXT: jne LBB24_61
5031 ; SSE42-NEXT: LBB24_62: ## %else89
5032 ; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5033 ; SSE42-NEXT: je LBB24_64
5034 ; SSE42-NEXT: LBB24_63: ## %cond.load91
5035 ; SSE42-NEXT: pinsrb $15, 31(%rdi), %xmm3
5036 ; SSE42-NEXT: LBB24_64: ## %else92
5037 ; SSE42-NEXT: movdqa %xmm2, %xmm0
5038 ; SSE42-NEXT: movdqa %xmm3, %xmm1
5040 ; SSE42-NEXT: LBB24_1: ## %cond.load
5041 ; SSE42-NEXT: pinsrb $0, (%rdi), %xmm2
5042 ; SSE42-NEXT: testb $2, %al
5043 ; SSE42-NEXT: je LBB24_4
5044 ; SSE42-NEXT: LBB24_3: ## %cond.load1
5045 ; SSE42-NEXT: pinsrb $1, 1(%rdi), %xmm2
5046 ; SSE42-NEXT: testb $4, %al
5047 ; SSE42-NEXT: je LBB24_6
5048 ; SSE42-NEXT: LBB24_5: ## %cond.load4
5049 ; SSE42-NEXT: pinsrb $2, 2(%rdi), %xmm2
5050 ; SSE42-NEXT: testb $8, %al
5051 ; SSE42-NEXT: je LBB24_8
5052 ; SSE42-NEXT: LBB24_7: ## %cond.load7
5053 ; SSE42-NEXT: pinsrb $3, 3(%rdi), %xmm2
5054 ; SSE42-NEXT: testb $16, %al
5055 ; SSE42-NEXT: je LBB24_10
5056 ; SSE42-NEXT: LBB24_9: ## %cond.load10
5057 ; SSE42-NEXT: pinsrb $4, 4(%rdi), %xmm2
5058 ; SSE42-NEXT: testb $32, %al
5059 ; SSE42-NEXT: je LBB24_12
5060 ; SSE42-NEXT: LBB24_11: ## %cond.load13
5061 ; SSE42-NEXT: pinsrb $5, 5(%rdi), %xmm2
5062 ; SSE42-NEXT: testb $64, %al
5063 ; SSE42-NEXT: je LBB24_14
5064 ; SSE42-NEXT: LBB24_13: ## %cond.load16
5065 ; SSE42-NEXT: pinsrb $6, 6(%rdi), %xmm2
5066 ; SSE42-NEXT: testb %al, %al
5067 ; SSE42-NEXT: jns LBB24_16
5068 ; SSE42-NEXT: LBB24_15: ## %cond.load19
5069 ; SSE42-NEXT: pinsrb $7, 7(%rdi), %xmm2
5070 ; SSE42-NEXT: testl $256, %eax ## imm = 0x100
5071 ; SSE42-NEXT: je LBB24_18
5072 ; SSE42-NEXT: LBB24_17: ## %cond.load22
5073 ; SSE42-NEXT: pinsrb $8, 8(%rdi), %xmm2
5074 ; SSE42-NEXT: testl $512, %eax ## imm = 0x200
5075 ; SSE42-NEXT: je LBB24_20
5076 ; SSE42-NEXT: LBB24_19: ## %cond.load25
5077 ; SSE42-NEXT: pinsrb $9, 9(%rdi), %xmm2
5078 ; SSE42-NEXT: testl $1024, %eax ## imm = 0x400
5079 ; SSE42-NEXT: je LBB24_22
5080 ; SSE42-NEXT: LBB24_21: ## %cond.load28
5081 ; SSE42-NEXT: pinsrb $10, 10(%rdi), %xmm2
5082 ; SSE42-NEXT: testl $2048, %eax ## imm = 0x800
5083 ; SSE42-NEXT: je LBB24_24
5084 ; SSE42-NEXT: LBB24_23: ## %cond.load31
5085 ; SSE42-NEXT: pinsrb $11, 11(%rdi), %xmm2
5086 ; SSE42-NEXT: testl $4096, %eax ## imm = 0x1000
5087 ; SSE42-NEXT: je LBB24_26
5088 ; SSE42-NEXT: LBB24_25: ## %cond.load34
5089 ; SSE42-NEXT: pinsrb $12, 12(%rdi), %xmm2
5090 ; SSE42-NEXT: testl $8192, %eax ## imm = 0x2000
5091 ; SSE42-NEXT: je LBB24_28
5092 ; SSE42-NEXT: LBB24_27: ## %cond.load37
5093 ; SSE42-NEXT: pinsrb $13, 13(%rdi), %xmm2
5094 ; SSE42-NEXT: testl $16384, %eax ## imm = 0x4000
5095 ; SSE42-NEXT: je LBB24_30
5096 ; SSE42-NEXT: LBB24_29: ## %cond.load40
5097 ; SSE42-NEXT: pinsrb $14, 14(%rdi), %xmm2
5098 ; SSE42-NEXT: testw %ax, %ax
5099 ; SSE42-NEXT: jns LBB24_32
5100 ; SSE42-NEXT: LBB24_31: ## %cond.load43
5101 ; SSE42-NEXT: pinsrb $15, 15(%rdi), %xmm2
5102 ; SSE42-NEXT: testl $65536, %eax ## imm = 0x10000
5103 ; SSE42-NEXT: je LBB24_34
5104 ; SSE42-NEXT: LBB24_33: ## %cond.load46
5105 ; SSE42-NEXT: pinsrb $0, 16(%rdi), %xmm3
5106 ; SSE42-NEXT: testl $131072, %eax ## imm = 0x20000
5107 ; SSE42-NEXT: je LBB24_36
5108 ; SSE42-NEXT: LBB24_35: ## %cond.load49
5109 ; SSE42-NEXT: pinsrb $1, 17(%rdi), %xmm3
5110 ; SSE42-NEXT: testl $262144, %eax ## imm = 0x40000
5111 ; SSE42-NEXT: je LBB24_38
5112 ; SSE42-NEXT: LBB24_37: ## %cond.load52
5113 ; SSE42-NEXT: pinsrb $2, 18(%rdi), %xmm3
5114 ; SSE42-NEXT: testl $524288, %eax ## imm = 0x80000
5115 ; SSE42-NEXT: je LBB24_40
5116 ; SSE42-NEXT: LBB24_39: ## %cond.load55
5117 ; SSE42-NEXT: pinsrb $3, 19(%rdi), %xmm3
5118 ; SSE42-NEXT: testl $1048576, %eax ## imm = 0x100000
5119 ; SSE42-NEXT: je LBB24_42
5120 ; SSE42-NEXT: LBB24_41: ## %cond.load58
5121 ; SSE42-NEXT: pinsrb $4, 20(%rdi), %xmm3
5122 ; SSE42-NEXT: testl $2097152, %eax ## imm = 0x200000
5123 ; SSE42-NEXT: je LBB24_44
5124 ; SSE42-NEXT: LBB24_43: ## %cond.load61
5125 ; SSE42-NEXT: pinsrb $5, 21(%rdi), %xmm3
5126 ; SSE42-NEXT: testl $4194304, %eax ## imm = 0x400000
5127 ; SSE42-NEXT: je LBB24_46
5128 ; SSE42-NEXT: LBB24_45: ## %cond.load64
5129 ; SSE42-NEXT: pinsrb $6, 22(%rdi), %xmm3
5130 ; SSE42-NEXT: testl $8388608, %eax ## imm = 0x800000
5131 ; SSE42-NEXT: je LBB24_48
5132 ; SSE42-NEXT: LBB24_47: ## %cond.load67
5133 ; SSE42-NEXT: pinsrb $7, 23(%rdi), %xmm3
5134 ; SSE42-NEXT: testl $16777216, %eax ## imm = 0x1000000
5135 ; SSE42-NEXT: je LBB24_50
5136 ; SSE42-NEXT: LBB24_49: ## %cond.load70
5137 ; SSE42-NEXT: pinsrb $8, 24(%rdi), %xmm3
5138 ; SSE42-NEXT: testl $33554432, %eax ## imm = 0x2000000
5139 ; SSE42-NEXT: je LBB24_52
5140 ; SSE42-NEXT: LBB24_51: ## %cond.load73
5141 ; SSE42-NEXT: pinsrb $9, 25(%rdi), %xmm3
5142 ; SSE42-NEXT: testl $67108864, %eax ## imm = 0x4000000
5143 ; SSE42-NEXT: je LBB24_54
5144 ; SSE42-NEXT: LBB24_53: ## %cond.load76
5145 ; SSE42-NEXT: pinsrb $10, 26(%rdi), %xmm3
5146 ; SSE42-NEXT: testl $134217728, %eax ## imm = 0x8000000
5147 ; SSE42-NEXT: je LBB24_56
5148 ; SSE42-NEXT: LBB24_55: ## %cond.load79
5149 ; SSE42-NEXT: pinsrb $11, 27(%rdi), %xmm3
5150 ; SSE42-NEXT: testl $268435456, %eax ## imm = 0x10000000
5151 ; SSE42-NEXT: je LBB24_58
5152 ; SSE42-NEXT: LBB24_57: ## %cond.load82
5153 ; SSE42-NEXT: pinsrb $12, 28(%rdi), %xmm3
5154 ; SSE42-NEXT: testl $536870912, %eax ## imm = 0x20000000
5155 ; SSE42-NEXT: je LBB24_60
5156 ; SSE42-NEXT: LBB24_59: ## %cond.load85
5157 ; SSE42-NEXT: pinsrb $13, 29(%rdi), %xmm3
5158 ; SSE42-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5159 ; SSE42-NEXT: je LBB24_62
5160 ; SSE42-NEXT: LBB24_61: ## %cond.load88
5161 ; SSE42-NEXT: pinsrb $14, 30(%rdi), %xmm3
5162 ; SSE42-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5163 ; SSE42-NEXT: jne LBB24_63
5164 ; SSE42-NEXT: jmp LBB24_64
5166 ; AVX1-LABEL: load_v32i8_v32i8:
5168 ; AVX1-NEXT: vpmovmskb %xmm0, %ecx
5169 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
5170 ; AVX1-NEXT: vpmovmskb %xmm0, %eax
5171 ; AVX1-NEXT: shll $16, %eax
5172 ; AVX1-NEXT: orl %ecx, %eax
5173 ; AVX1-NEXT: testb $1, %al
5174 ; AVX1-NEXT: jne LBB24_1
5175 ; AVX1-NEXT: ## %bb.2: ## %else
5176 ; AVX1-NEXT: testb $2, %al
5177 ; AVX1-NEXT: jne LBB24_3
5178 ; AVX1-NEXT: LBB24_4: ## %else2
5179 ; AVX1-NEXT: testb $4, %al
5180 ; AVX1-NEXT: jne LBB24_5
5181 ; AVX1-NEXT: LBB24_6: ## %else5
5182 ; AVX1-NEXT: testb $8, %al
5183 ; AVX1-NEXT: jne LBB24_7
5184 ; AVX1-NEXT: LBB24_8: ## %else8
5185 ; AVX1-NEXT: testb $16, %al
5186 ; AVX1-NEXT: jne LBB24_9
5187 ; AVX1-NEXT: LBB24_10: ## %else11
5188 ; AVX1-NEXT: testb $32, %al
5189 ; AVX1-NEXT: jne LBB24_11
5190 ; AVX1-NEXT: LBB24_12: ## %else14
5191 ; AVX1-NEXT: testb $64, %al
5192 ; AVX1-NEXT: jne LBB24_13
5193 ; AVX1-NEXT: LBB24_14: ## %else17
5194 ; AVX1-NEXT: testb %al, %al
5195 ; AVX1-NEXT: js LBB24_15
5196 ; AVX1-NEXT: LBB24_16: ## %else20
5197 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
5198 ; AVX1-NEXT: jne LBB24_17
5199 ; AVX1-NEXT: LBB24_18: ## %else23
5200 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
5201 ; AVX1-NEXT: jne LBB24_19
5202 ; AVX1-NEXT: LBB24_20: ## %else26
5203 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
5204 ; AVX1-NEXT: jne LBB24_21
5205 ; AVX1-NEXT: LBB24_22: ## %else29
5206 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
5207 ; AVX1-NEXT: jne LBB24_23
5208 ; AVX1-NEXT: LBB24_24: ## %else32
5209 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
5210 ; AVX1-NEXT: jne LBB24_25
5211 ; AVX1-NEXT: LBB24_26: ## %else35
5212 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
5213 ; AVX1-NEXT: jne LBB24_27
5214 ; AVX1-NEXT: LBB24_28: ## %else38
5215 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
5216 ; AVX1-NEXT: jne LBB24_29
5217 ; AVX1-NEXT: LBB24_30: ## %else41
5218 ; AVX1-NEXT: testw %ax, %ax
5219 ; AVX1-NEXT: js LBB24_31
5220 ; AVX1-NEXT: LBB24_32: ## %else44
5221 ; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
5222 ; AVX1-NEXT: jne LBB24_33
5223 ; AVX1-NEXT: LBB24_34: ## %else47
5224 ; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
5225 ; AVX1-NEXT: jne LBB24_35
5226 ; AVX1-NEXT: LBB24_36: ## %else50
5227 ; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
5228 ; AVX1-NEXT: jne LBB24_37
5229 ; AVX1-NEXT: LBB24_38: ## %else53
5230 ; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
5231 ; AVX1-NEXT: jne LBB24_39
5232 ; AVX1-NEXT: LBB24_40: ## %else56
5233 ; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
5234 ; AVX1-NEXT: jne LBB24_41
5235 ; AVX1-NEXT: LBB24_42: ## %else59
5236 ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
5237 ; AVX1-NEXT: jne LBB24_43
5238 ; AVX1-NEXT: LBB24_44: ## %else62
5239 ; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
5240 ; AVX1-NEXT: jne LBB24_45
5241 ; AVX1-NEXT: LBB24_46: ## %else65
5242 ; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
5243 ; AVX1-NEXT: jne LBB24_47
5244 ; AVX1-NEXT: LBB24_48: ## %else68
5245 ; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
5246 ; AVX1-NEXT: jne LBB24_49
5247 ; AVX1-NEXT: LBB24_50: ## %else71
5248 ; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
5249 ; AVX1-NEXT: jne LBB24_51
5250 ; AVX1-NEXT: LBB24_52: ## %else74
5251 ; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
5252 ; AVX1-NEXT: jne LBB24_53
5253 ; AVX1-NEXT: LBB24_54: ## %else77
5254 ; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
5255 ; AVX1-NEXT: jne LBB24_55
5256 ; AVX1-NEXT: LBB24_56: ## %else80
5257 ; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
5258 ; AVX1-NEXT: jne LBB24_57
5259 ; AVX1-NEXT: LBB24_58: ## %else83
5260 ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
5261 ; AVX1-NEXT: jne LBB24_59
5262 ; AVX1-NEXT: LBB24_60: ## %else86
5263 ; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5264 ; AVX1-NEXT: jne LBB24_61
5265 ; AVX1-NEXT: LBB24_62: ## %else89
5266 ; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5267 ; AVX1-NEXT: jne LBB24_63
5268 ; AVX1-NEXT: LBB24_64: ## %else92
5269 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
5271 ; AVX1-NEXT: LBB24_1: ## %cond.load
5272 ; AVX1-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5273 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5274 ; AVX1-NEXT: testb $2, %al
5275 ; AVX1-NEXT: je LBB24_4
5276 ; AVX1-NEXT: LBB24_3: ## %cond.load1
5277 ; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5278 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5279 ; AVX1-NEXT: testb $4, %al
5280 ; AVX1-NEXT: je LBB24_6
5281 ; AVX1-NEXT: LBB24_5: ## %cond.load4
5282 ; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5283 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5284 ; AVX1-NEXT: testb $8, %al
5285 ; AVX1-NEXT: je LBB24_8
5286 ; AVX1-NEXT: LBB24_7: ## %cond.load7
5287 ; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5288 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5289 ; AVX1-NEXT: testb $16, %al
5290 ; AVX1-NEXT: je LBB24_10
5291 ; AVX1-NEXT: LBB24_9: ## %cond.load10
5292 ; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5293 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5294 ; AVX1-NEXT: testb $32, %al
5295 ; AVX1-NEXT: je LBB24_12
5296 ; AVX1-NEXT: LBB24_11: ## %cond.load13
5297 ; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5298 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5299 ; AVX1-NEXT: testb $64, %al
5300 ; AVX1-NEXT: je LBB24_14
5301 ; AVX1-NEXT: LBB24_13: ## %cond.load16
5302 ; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5303 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5304 ; AVX1-NEXT: testb %al, %al
5305 ; AVX1-NEXT: jns LBB24_16
5306 ; AVX1-NEXT: LBB24_15: ## %cond.load19
5307 ; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5308 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5309 ; AVX1-NEXT: testl $256, %eax ## imm = 0x100
5310 ; AVX1-NEXT: je LBB24_18
5311 ; AVX1-NEXT: LBB24_17: ## %cond.load22
5312 ; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5313 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5314 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200
5315 ; AVX1-NEXT: je LBB24_20
5316 ; AVX1-NEXT: LBB24_19: ## %cond.load25
5317 ; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5318 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5319 ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
5320 ; AVX1-NEXT: je LBB24_22
5321 ; AVX1-NEXT: LBB24_21: ## %cond.load28
5322 ; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5323 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5324 ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
5325 ; AVX1-NEXT: je LBB24_24
5326 ; AVX1-NEXT: LBB24_23: ## %cond.load31
5327 ; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5328 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5329 ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
5330 ; AVX1-NEXT: je LBB24_26
5331 ; AVX1-NEXT: LBB24_25: ## %cond.load34
5332 ; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5333 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5334 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
5335 ; AVX1-NEXT: je LBB24_28
5336 ; AVX1-NEXT: LBB24_27: ## %cond.load37
5337 ; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5338 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5339 ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
5340 ; AVX1-NEXT: je LBB24_30
5341 ; AVX1-NEXT: LBB24_29: ## %cond.load40
5342 ; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5343 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5344 ; AVX1-NEXT: testw %ax, %ax
5345 ; AVX1-NEXT: jns LBB24_32
5346 ; AVX1-NEXT: LBB24_31: ## %cond.load43
5347 ; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5348 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5349 ; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
5350 ; AVX1-NEXT: je LBB24_34
5351 ; AVX1-NEXT: LBB24_33: ## %cond.load46
5352 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5353 ; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5354 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5355 ; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
5356 ; AVX1-NEXT: je LBB24_36
5357 ; AVX1-NEXT: LBB24_35: ## %cond.load49
5358 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5359 ; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5360 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5361 ; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
5362 ; AVX1-NEXT: je LBB24_38
5363 ; AVX1-NEXT: LBB24_37: ## %cond.load52
5364 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5365 ; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5366 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5367 ; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
5368 ; AVX1-NEXT: je LBB24_40
5369 ; AVX1-NEXT: LBB24_39: ## %cond.load55
5370 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5371 ; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5372 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5373 ; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
5374 ; AVX1-NEXT: je LBB24_42
5375 ; AVX1-NEXT: LBB24_41: ## %cond.load58
5376 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5377 ; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5378 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5379 ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000
5380 ; AVX1-NEXT: je LBB24_44
5381 ; AVX1-NEXT: LBB24_43: ## %cond.load61
5382 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5383 ; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5384 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5385 ; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
5386 ; AVX1-NEXT: je LBB24_46
5387 ; AVX1-NEXT: LBB24_45: ## %cond.load64
5388 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5389 ; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5390 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5391 ; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
5392 ; AVX1-NEXT: je LBB24_48
5393 ; AVX1-NEXT: LBB24_47: ## %cond.load67
5394 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5395 ; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5396 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5397 ; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
5398 ; AVX1-NEXT: je LBB24_50
5399 ; AVX1-NEXT: LBB24_49: ## %cond.load70
5400 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5401 ; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5402 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5403 ; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
5404 ; AVX1-NEXT: je LBB24_52
5405 ; AVX1-NEXT: LBB24_51: ## %cond.load73
5406 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5407 ; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5408 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5409 ; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
5410 ; AVX1-NEXT: je LBB24_54
5411 ; AVX1-NEXT: LBB24_53: ## %cond.load76
5412 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5413 ; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5414 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5415 ; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
5416 ; AVX1-NEXT: je LBB24_56
5417 ; AVX1-NEXT: LBB24_55: ## %cond.load79
5418 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5419 ; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5420 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5421 ; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
5422 ; AVX1-NEXT: je LBB24_58
5423 ; AVX1-NEXT: LBB24_57: ## %cond.load82
5424 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5425 ; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5426 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5427 ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000
5428 ; AVX1-NEXT: je LBB24_60
5429 ; AVX1-NEXT: LBB24_59: ## %cond.load85
5430 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5431 ; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5432 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5433 ; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5434 ; AVX1-NEXT: je LBB24_62
5435 ; AVX1-NEXT: LBB24_61: ## %cond.load88
5436 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5437 ; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5438 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5439 ; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5440 ; AVX1-NEXT: je LBB24_64
5441 ; AVX1-NEXT: LBB24_63: ## %cond.load91
5442 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
5443 ; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5444 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5445 ; AVX1-NEXT: vmovaps %ymm1, %ymm0
5448 ; AVX2-LABEL: load_v32i8_v32i8:
5450 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
5451 ; AVX2-NEXT: testb $1, %al
5452 ; AVX2-NEXT: jne LBB24_1
5453 ; AVX2-NEXT: ## %bb.2: ## %else
5454 ; AVX2-NEXT: testb $2, %al
5455 ; AVX2-NEXT: jne LBB24_3
5456 ; AVX2-NEXT: LBB24_4: ## %else2
5457 ; AVX2-NEXT: testb $4, %al
5458 ; AVX2-NEXT: jne LBB24_5
5459 ; AVX2-NEXT: LBB24_6: ## %else5
5460 ; AVX2-NEXT: testb $8, %al
5461 ; AVX2-NEXT: jne LBB24_7
5462 ; AVX2-NEXT: LBB24_8: ## %else8
5463 ; AVX2-NEXT: testb $16, %al
5464 ; AVX2-NEXT: jne LBB24_9
5465 ; AVX2-NEXT: LBB24_10: ## %else11
5466 ; AVX2-NEXT: testb $32, %al
5467 ; AVX2-NEXT: jne LBB24_11
5468 ; AVX2-NEXT: LBB24_12: ## %else14
5469 ; AVX2-NEXT: testb $64, %al
5470 ; AVX2-NEXT: jne LBB24_13
5471 ; AVX2-NEXT: LBB24_14: ## %else17
5472 ; AVX2-NEXT: testb %al, %al
5473 ; AVX2-NEXT: js LBB24_15
5474 ; AVX2-NEXT: LBB24_16: ## %else20
5475 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
5476 ; AVX2-NEXT: jne LBB24_17
5477 ; AVX2-NEXT: LBB24_18: ## %else23
5478 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
5479 ; AVX2-NEXT: jne LBB24_19
5480 ; AVX2-NEXT: LBB24_20: ## %else26
5481 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
5482 ; AVX2-NEXT: jne LBB24_21
5483 ; AVX2-NEXT: LBB24_22: ## %else29
5484 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
5485 ; AVX2-NEXT: jne LBB24_23
5486 ; AVX2-NEXT: LBB24_24: ## %else32
5487 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
5488 ; AVX2-NEXT: jne LBB24_25
5489 ; AVX2-NEXT: LBB24_26: ## %else35
5490 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
5491 ; AVX2-NEXT: jne LBB24_27
5492 ; AVX2-NEXT: LBB24_28: ## %else38
5493 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
5494 ; AVX2-NEXT: jne LBB24_29
5495 ; AVX2-NEXT: LBB24_30: ## %else41
5496 ; AVX2-NEXT: testw %ax, %ax
5497 ; AVX2-NEXT: js LBB24_31
5498 ; AVX2-NEXT: LBB24_32: ## %else44
5499 ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
5500 ; AVX2-NEXT: jne LBB24_33
5501 ; AVX2-NEXT: LBB24_34: ## %else47
5502 ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
5503 ; AVX2-NEXT: jne LBB24_35
5504 ; AVX2-NEXT: LBB24_36: ## %else50
5505 ; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
5506 ; AVX2-NEXT: jne LBB24_37
5507 ; AVX2-NEXT: LBB24_38: ## %else53
5508 ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
5509 ; AVX2-NEXT: jne LBB24_39
5510 ; AVX2-NEXT: LBB24_40: ## %else56
5511 ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
5512 ; AVX2-NEXT: jne LBB24_41
5513 ; AVX2-NEXT: LBB24_42: ## %else59
5514 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
5515 ; AVX2-NEXT: jne LBB24_43
5516 ; AVX2-NEXT: LBB24_44: ## %else62
5517 ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
5518 ; AVX2-NEXT: jne LBB24_45
5519 ; AVX2-NEXT: LBB24_46: ## %else65
5520 ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
5521 ; AVX2-NEXT: jne LBB24_47
5522 ; AVX2-NEXT: LBB24_48: ## %else68
5523 ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
5524 ; AVX2-NEXT: jne LBB24_49
5525 ; AVX2-NEXT: LBB24_50: ## %else71
5526 ; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
5527 ; AVX2-NEXT: jne LBB24_51
5528 ; AVX2-NEXT: LBB24_52: ## %else74
5529 ; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
5530 ; AVX2-NEXT: jne LBB24_53
5531 ; AVX2-NEXT: LBB24_54: ## %else77
5532 ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
5533 ; AVX2-NEXT: jne LBB24_55
5534 ; AVX2-NEXT: LBB24_56: ## %else80
5535 ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
5536 ; AVX2-NEXT: jne LBB24_57
5537 ; AVX2-NEXT: LBB24_58: ## %else83
5538 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
5539 ; AVX2-NEXT: jne LBB24_59
5540 ; AVX2-NEXT: LBB24_60: ## %else86
5541 ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5542 ; AVX2-NEXT: jne LBB24_61
5543 ; AVX2-NEXT: LBB24_62: ## %else89
5544 ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5545 ; AVX2-NEXT: jne LBB24_63
5546 ; AVX2-NEXT: LBB24_64: ## %else92
5547 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
5549 ; AVX2-NEXT: LBB24_1: ## %cond.load
5550 ; AVX2-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5551 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5552 ; AVX2-NEXT: testb $2, %al
5553 ; AVX2-NEXT: je LBB24_4
5554 ; AVX2-NEXT: LBB24_3: ## %cond.load1
5555 ; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5556 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5557 ; AVX2-NEXT: testb $4, %al
5558 ; AVX2-NEXT: je LBB24_6
5559 ; AVX2-NEXT: LBB24_5: ## %cond.load4
5560 ; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5561 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5562 ; AVX2-NEXT: testb $8, %al
5563 ; AVX2-NEXT: je LBB24_8
5564 ; AVX2-NEXT: LBB24_7: ## %cond.load7
5565 ; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5566 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5567 ; AVX2-NEXT: testb $16, %al
5568 ; AVX2-NEXT: je LBB24_10
5569 ; AVX2-NEXT: LBB24_9: ## %cond.load10
5570 ; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5571 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5572 ; AVX2-NEXT: testb $32, %al
5573 ; AVX2-NEXT: je LBB24_12
5574 ; AVX2-NEXT: LBB24_11: ## %cond.load13
5575 ; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5576 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5577 ; AVX2-NEXT: testb $64, %al
5578 ; AVX2-NEXT: je LBB24_14
5579 ; AVX2-NEXT: LBB24_13: ## %cond.load16
5580 ; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5581 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5582 ; AVX2-NEXT: testb %al, %al
5583 ; AVX2-NEXT: jns LBB24_16
5584 ; AVX2-NEXT: LBB24_15: ## %cond.load19
5585 ; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5586 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5587 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100
5588 ; AVX2-NEXT: je LBB24_18
5589 ; AVX2-NEXT: LBB24_17: ## %cond.load22
5590 ; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5591 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5592 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200
5593 ; AVX2-NEXT: je LBB24_20
5594 ; AVX2-NEXT: LBB24_19: ## %cond.load25
5595 ; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5596 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5597 ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
5598 ; AVX2-NEXT: je LBB24_22
5599 ; AVX2-NEXT: LBB24_21: ## %cond.load28
5600 ; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5601 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5602 ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
5603 ; AVX2-NEXT: je LBB24_24
5604 ; AVX2-NEXT: LBB24_23: ## %cond.load31
5605 ; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5606 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5607 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
5608 ; AVX2-NEXT: je LBB24_26
5609 ; AVX2-NEXT: LBB24_25: ## %cond.load34
5610 ; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5611 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5612 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
5613 ; AVX2-NEXT: je LBB24_28
5614 ; AVX2-NEXT: LBB24_27: ## %cond.load37
5615 ; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5616 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5617 ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
5618 ; AVX2-NEXT: je LBB24_30
5619 ; AVX2-NEXT: LBB24_29: ## %cond.load40
5620 ; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5621 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5622 ; AVX2-NEXT: testw %ax, %ax
5623 ; AVX2-NEXT: jns LBB24_32
5624 ; AVX2-NEXT: LBB24_31: ## %cond.load43
5625 ; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5626 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5627 ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
5628 ; AVX2-NEXT: je LBB24_34
5629 ; AVX2-NEXT: LBB24_33: ## %cond.load46
5630 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5631 ; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5632 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5633 ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
5634 ; AVX2-NEXT: je LBB24_36
5635 ; AVX2-NEXT: LBB24_35: ## %cond.load49
5636 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5637 ; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5638 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5639 ; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
5640 ; AVX2-NEXT: je LBB24_38
5641 ; AVX2-NEXT: LBB24_37: ## %cond.load52
5642 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5643 ; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5644 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5645 ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
5646 ; AVX2-NEXT: je LBB24_40
5647 ; AVX2-NEXT: LBB24_39: ## %cond.load55
5648 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5649 ; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5650 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5651 ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
5652 ; AVX2-NEXT: je LBB24_42
5653 ; AVX2-NEXT: LBB24_41: ## %cond.load58
5654 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5655 ; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5656 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5657 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000
5658 ; AVX2-NEXT: je LBB24_44
5659 ; AVX2-NEXT: LBB24_43: ## %cond.load61
5660 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5661 ; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5662 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5663 ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
5664 ; AVX2-NEXT: je LBB24_46
5665 ; AVX2-NEXT: LBB24_45: ## %cond.load64
5666 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5667 ; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5668 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5669 ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
5670 ; AVX2-NEXT: je LBB24_48
5671 ; AVX2-NEXT: LBB24_47: ## %cond.load67
5672 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5673 ; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5674 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5675 ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
5676 ; AVX2-NEXT: je LBB24_50
5677 ; AVX2-NEXT: LBB24_49: ## %cond.load70
5678 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5679 ; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5680 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5681 ; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
5682 ; AVX2-NEXT: je LBB24_52
5683 ; AVX2-NEXT: LBB24_51: ## %cond.load73
5684 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5685 ; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5686 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5687 ; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
5688 ; AVX2-NEXT: je LBB24_54
5689 ; AVX2-NEXT: LBB24_53: ## %cond.load76
5690 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5691 ; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5692 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5693 ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
5694 ; AVX2-NEXT: je LBB24_56
5695 ; AVX2-NEXT: LBB24_55: ## %cond.load79
5696 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5697 ; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5698 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5699 ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
5700 ; AVX2-NEXT: je LBB24_58
5701 ; AVX2-NEXT: LBB24_57: ## %cond.load82
5702 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5703 ; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5704 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5705 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000
5706 ; AVX2-NEXT: je LBB24_60
5707 ; AVX2-NEXT: LBB24_59: ## %cond.load85
5708 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5709 ; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5710 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5711 ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5712 ; AVX2-NEXT: je LBB24_62
5713 ; AVX2-NEXT: LBB24_61: ## %cond.load88
5714 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5715 ; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5716 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5717 ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5718 ; AVX2-NEXT: je LBB24_64
5719 ; AVX2-NEXT: LBB24_63: ## %cond.load91
5720 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
5721 ; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
5722 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5723 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
5726 ; AVX512F-LABEL: load_v32i8_v32i8:
5727 ; AVX512F: ## %bb.0:
5728 ; AVX512F-NEXT: vpmovmskb %ymm0, %eax
5729 ; AVX512F-NEXT: testb $1, %al
5730 ; AVX512F-NEXT: jne LBB24_1
5731 ; AVX512F-NEXT: ## %bb.2: ## %else
5732 ; AVX512F-NEXT: testb $2, %al
5733 ; AVX512F-NEXT: jne LBB24_3
5734 ; AVX512F-NEXT: LBB24_4: ## %else2
5735 ; AVX512F-NEXT: testb $4, %al
5736 ; AVX512F-NEXT: jne LBB24_5
5737 ; AVX512F-NEXT: LBB24_6: ## %else5
5738 ; AVX512F-NEXT: testb $8, %al
5739 ; AVX512F-NEXT: jne LBB24_7
5740 ; AVX512F-NEXT: LBB24_8: ## %else8
5741 ; AVX512F-NEXT: testb $16, %al
5742 ; AVX512F-NEXT: jne LBB24_9
5743 ; AVX512F-NEXT: LBB24_10: ## %else11
5744 ; AVX512F-NEXT: testb $32, %al
5745 ; AVX512F-NEXT: jne LBB24_11
5746 ; AVX512F-NEXT: LBB24_12: ## %else14
5747 ; AVX512F-NEXT: testb $64, %al
5748 ; AVX512F-NEXT: jne LBB24_13
5749 ; AVX512F-NEXT: LBB24_14: ## %else17
5750 ; AVX512F-NEXT: testb %al, %al
5751 ; AVX512F-NEXT: js LBB24_15
5752 ; AVX512F-NEXT: LBB24_16: ## %else20
5753 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
5754 ; AVX512F-NEXT: jne LBB24_17
5755 ; AVX512F-NEXT: LBB24_18: ## %else23
5756 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
5757 ; AVX512F-NEXT: jne LBB24_19
5758 ; AVX512F-NEXT: LBB24_20: ## %else26
5759 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
5760 ; AVX512F-NEXT: jne LBB24_21
5761 ; AVX512F-NEXT: LBB24_22: ## %else29
5762 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
5763 ; AVX512F-NEXT: jne LBB24_23
5764 ; AVX512F-NEXT: LBB24_24: ## %else32
5765 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
5766 ; AVX512F-NEXT: jne LBB24_25
5767 ; AVX512F-NEXT: LBB24_26: ## %else35
5768 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
5769 ; AVX512F-NEXT: jne LBB24_27
5770 ; AVX512F-NEXT: LBB24_28: ## %else38
5771 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
5772 ; AVX512F-NEXT: jne LBB24_29
5773 ; AVX512F-NEXT: LBB24_30: ## %else41
5774 ; AVX512F-NEXT: testw %ax, %ax
5775 ; AVX512F-NEXT: js LBB24_31
5776 ; AVX512F-NEXT: LBB24_32: ## %else44
5777 ; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
5778 ; AVX512F-NEXT: jne LBB24_33
5779 ; AVX512F-NEXT: LBB24_34: ## %else47
5780 ; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
5781 ; AVX512F-NEXT: jne LBB24_35
5782 ; AVX512F-NEXT: LBB24_36: ## %else50
5783 ; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
5784 ; AVX512F-NEXT: jne LBB24_37
5785 ; AVX512F-NEXT: LBB24_38: ## %else53
5786 ; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
5787 ; AVX512F-NEXT: jne LBB24_39
5788 ; AVX512F-NEXT: LBB24_40: ## %else56
5789 ; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
5790 ; AVX512F-NEXT: jne LBB24_41
5791 ; AVX512F-NEXT: LBB24_42: ## %else59
5792 ; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
5793 ; AVX512F-NEXT: jne LBB24_43
5794 ; AVX512F-NEXT: LBB24_44: ## %else62
5795 ; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
5796 ; AVX512F-NEXT: jne LBB24_45
5797 ; AVX512F-NEXT: LBB24_46: ## %else65
5798 ; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
5799 ; AVX512F-NEXT: jne LBB24_47
5800 ; AVX512F-NEXT: LBB24_48: ## %else68
5801 ; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
5802 ; AVX512F-NEXT: jne LBB24_49
5803 ; AVX512F-NEXT: LBB24_50: ## %else71
5804 ; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
5805 ; AVX512F-NEXT: jne LBB24_51
5806 ; AVX512F-NEXT: LBB24_52: ## %else74
5807 ; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
5808 ; AVX512F-NEXT: jne LBB24_53
5809 ; AVX512F-NEXT: LBB24_54: ## %else77
5810 ; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
5811 ; AVX512F-NEXT: jne LBB24_55
5812 ; AVX512F-NEXT: LBB24_56: ## %else80
5813 ; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
5814 ; AVX512F-NEXT: jne LBB24_57
5815 ; AVX512F-NEXT: LBB24_58: ## %else83
5816 ; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
5817 ; AVX512F-NEXT: jne LBB24_59
5818 ; AVX512F-NEXT: LBB24_60: ## %else86
5819 ; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5820 ; AVX512F-NEXT: jne LBB24_61
5821 ; AVX512F-NEXT: LBB24_62: ## %else89
5822 ; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5823 ; AVX512F-NEXT: jne LBB24_63
5824 ; AVX512F-NEXT: LBB24_64: ## %else92
5825 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
5826 ; AVX512F-NEXT: retq
5827 ; AVX512F-NEXT: LBB24_1: ## %cond.load
5828 ; AVX512F-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
5829 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5830 ; AVX512F-NEXT: testb $2, %al
5831 ; AVX512F-NEXT: je LBB24_4
5832 ; AVX512F-NEXT: LBB24_3: ## %cond.load1
5833 ; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
5834 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5835 ; AVX512F-NEXT: testb $4, %al
5836 ; AVX512F-NEXT: je LBB24_6
5837 ; AVX512F-NEXT: LBB24_5: ## %cond.load4
5838 ; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
5839 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5840 ; AVX512F-NEXT: testb $8, %al
5841 ; AVX512F-NEXT: je LBB24_8
5842 ; AVX512F-NEXT: LBB24_7: ## %cond.load7
5843 ; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
5844 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5845 ; AVX512F-NEXT: testb $16, %al
5846 ; AVX512F-NEXT: je LBB24_10
5847 ; AVX512F-NEXT: LBB24_9: ## %cond.load10
5848 ; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
5849 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5850 ; AVX512F-NEXT: testb $32, %al
5851 ; AVX512F-NEXT: je LBB24_12
5852 ; AVX512F-NEXT: LBB24_11: ## %cond.load13
5853 ; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
5854 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5855 ; AVX512F-NEXT: testb $64, %al
5856 ; AVX512F-NEXT: je LBB24_14
5857 ; AVX512F-NEXT: LBB24_13: ## %cond.load16
5858 ; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
5859 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5860 ; AVX512F-NEXT: testb %al, %al
5861 ; AVX512F-NEXT: jns LBB24_16
5862 ; AVX512F-NEXT: LBB24_15: ## %cond.load19
5863 ; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
5864 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5865 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100
5866 ; AVX512F-NEXT: je LBB24_18
5867 ; AVX512F-NEXT: LBB24_17: ## %cond.load22
5868 ; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
5869 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5870 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200
5871 ; AVX512F-NEXT: je LBB24_20
5872 ; AVX512F-NEXT: LBB24_19: ## %cond.load25
5873 ; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
5874 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5875 ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400
5876 ; AVX512F-NEXT: je LBB24_22
5877 ; AVX512F-NEXT: LBB24_21: ## %cond.load28
5878 ; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
5879 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5880 ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800
5881 ; AVX512F-NEXT: je LBB24_24
5882 ; AVX512F-NEXT: LBB24_23: ## %cond.load31
5883 ; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
5884 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5885 ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000
5886 ; AVX512F-NEXT: je LBB24_26
5887 ; AVX512F-NEXT: LBB24_25: ## %cond.load34
5888 ; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
5889 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5890 ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000
5891 ; AVX512F-NEXT: je LBB24_28
5892 ; AVX512F-NEXT: LBB24_27: ## %cond.load37
5893 ; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
5894 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5895 ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000
5896 ; AVX512F-NEXT: je LBB24_30
5897 ; AVX512F-NEXT: LBB24_29: ## %cond.load40
5898 ; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
5899 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5900 ; AVX512F-NEXT: testw %ax, %ax
5901 ; AVX512F-NEXT: jns LBB24_32
5902 ; AVX512F-NEXT: LBB24_31: ## %cond.load43
5903 ; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
5904 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5905 ; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
5906 ; AVX512F-NEXT: je LBB24_34
5907 ; AVX512F-NEXT: LBB24_33: ## %cond.load46
5908 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5909 ; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
5910 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5911 ; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
5912 ; AVX512F-NEXT: je LBB24_36
5913 ; AVX512F-NEXT: LBB24_35: ## %cond.load49
5914 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5915 ; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
5916 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5917 ; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
5918 ; AVX512F-NEXT: je LBB24_38
5919 ; AVX512F-NEXT: LBB24_37: ## %cond.load52
5920 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5921 ; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
5922 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5923 ; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
5924 ; AVX512F-NEXT: je LBB24_40
5925 ; AVX512F-NEXT: LBB24_39: ## %cond.load55
5926 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5927 ; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
5928 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5929 ; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
5930 ; AVX512F-NEXT: je LBB24_42
5931 ; AVX512F-NEXT: LBB24_41: ## %cond.load58
5932 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5933 ; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
5934 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5935 ; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
5936 ; AVX512F-NEXT: je LBB24_44
5937 ; AVX512F-NEXT: LBB24_43: ## %cond.load61
5938 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5939 ; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
5940 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5941 ; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
5942 ; AVX512F-NEXT: je LBB24_46
5943 ; AVX512F-NEXT: LBB24_45: ## %cond.load64
5944 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5945 ; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
5946 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5947 ; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
5948 ; AVX512F-NEXT: je LBB24_48
5949 ; AVX512F-NEXT: LBB24_47: ## %cond.load67
5950 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5951 ; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
5952 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5953 ; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
5954 ; AVX512F-NEXT: je LBB24_50
5955 ; AVX512F-NEXT: LBB24_49: ## %cond.load70
5956 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5957 ; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
5958 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5959 ; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
5960 ; AVX512F-NEXT: je LBB24_52
5961 ; AVX512F-NEXT: LBB24_51: ## %cond.load73
5962 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5963 ; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
5964 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5965 ; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
5966 ; AVX512F-NEXT: je LBB24_54
5967 ; AVX512F-NEXT: LBB24_53: ## %cond.load76
5968 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5969 ; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
5970 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5971 ; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
5972 ; AVX512F-NEXT: je LBB24_56
5973 ; AVX512F-NEXT: LBB24_55: ## %cond.load79
5974 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5975 ; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
5976 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5977 ; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
5978 ; AVX512F-NEXT: je LBB24_58
5979 ; AVX512F-NEXT: LBB24_57: ## %cond.load82
5980 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5981 ; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
5982 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5983 ; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
5984 ; AVX512F-NEXT: je LBB24_60
5985 ; AVX512F-NEXT: LBB24_59: ## %cond.load85
5986 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5987 ; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
5988 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5989 ; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
5990 ; AVX512F-NEXT: je LBB24_62
5991 ; AVX512F-NEXT: LBB24_61: ## %cond.load88
5992 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5993 ; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
5994 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
5995 ; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
5996 ; AVX512F-NEXT: je LBB24_64
5997 ; AVX512F-NEXT: LBB24_63: ## %cond.load91
5998 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
5999 ; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
6000 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6001 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
6002 ; AVX512F-NEXT: retq
6004 ; AVX512VLDQ-LABEL: load_v32i8_v32i8:
6005 ; AVX512VLDQ: ## %bb.0:
6006 ; AVX512VLDQ-NEXT: vpmovmskb %ymm0, %eax
6007 ; AVX512VLDQ-NEXT: testb $1, %al
6008 ; AVX512VLDQ-NEXT: jne LBB24_1
6009 ; AVX512VLDQ-NEXT: ## %bb.2: ## %else
6010 ; AVX512VLDQ-NEXT: testb $2, %al
6011 ; AVX512VLDQ-NEXT: jne LBB24_3
6012 ; AVX512VLDQ-NEXT: LBB24_4: ## %else2
6013 ; AVX512VLDQ-NEXT: testb $4, %al
6014 ; AVX512VLDQ-NEXT: jne LBB24_5
6015 ; AVX512VLDQ-NEXT: LBB24_6: ## %else5
6016 ; AVX512VLDQ-NEXT: testb $8, %al
6017 ; AVX512VLDQ-NEXT: jne LBB24_7
6018 ; AVX512VLDQ-NEXT: LBB24_8: ## %else8
6019 ; AVX512VLDQ-NEXT: testb $16, %al
6020 ; AVX512VLDQ-NEXT: jne LBB24_9
6021 ; AVX512VLDQ-NEXT: LBB24_10: ## %else11
6022 ; AVX512VLDQ-NEXT: testb $32, %al
6023 ; AVX512VLDQ-NEXT: jne LBB24_11
6024 ; AVX512VLDQ-NEXT: LBB24_12: ## %else14
6025 ; AVX512VLDQ-NEXT: testb $64, %al
6026 ; AVX512VLDQ-NEXT: jne LBB24_13
6027 ; AVX512VLDQ-NEXT: LBB24_14: ## %else17
6028 ; AVX512VLDQ-NEXT: testb %al, %al
6029 ; AVX512VLDQ-NEXT: js LBB24_15
6030 ; AVX512VLDQ-NEXT: LBB24_16: ## %else20
6031 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
6032 ; AVX512VLDQ-NEXT: jne LBB24_17
6033 ; AVX512VLDQ-NEXT: LBB24_18: ## %else23
6034 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
6035 ; AVX512VLDQ-NEXT: jne LBB24_19
6036 ; AVX512VLDQ-NEXT: LBB24_20: ## %else26
6037 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
6038 ; AVX512VLDQ-NEXT: jne LBB24_21
6039 ; AVX512VLDQ-NEXT: LBB24_22: ## %else29
6040 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
6041 ; AVX512VLDQ-NEXT: jne LBB24_23
6042 ; AVX512VLDQ-NEXT: LBB24_24: ## %else32
6043 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
6044 ; AVX512VLDQ-NEXT: jne LBB24_25
6045 ; AVX512VLDQ-NEXT: LBB24_26: ## %else35
6046 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
6047 ; AVX512VLDQ-NEXT: jne LBB24_27
6048 ; AVX512VLDQ-NEXT: LBB24_28: ## %else38
6049 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
6050 ; AVX512VLDQ-NEXT: jne LBB24_29
6051 ; AVX512VLDQ-NEXT: LBB24_30: ## %else41
6052 ; AVX512VLDQ-NEXT: testw %ax, %ax
6053 ; AVX512VLDQ-NEXT: js LBB24_31
6054 ; AVX512VLDQ-NEXT: LBB24_32: ## %else44
6055 ; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
6056 ; AVX512VLDQ-NEXT: jne LBB24_33
6057 ; AVX512VLDQ-NEXT: LBB24_34: ## %else47
6058 ; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
6059 ; AVX512VLDQ-NEXT: jne LBB24_35
6060 ; AVX512VLDQ-NEXT: LBB24_36: ## %else50
6061 ; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
6062 ; AVX512VLDQ-NEXT: jne LBB24_37
6063 ; AVX512VLDQ-NEXT: LBB24_38: ## %else53
6064 ; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
6065 ; AVX512VLDQ-NEXT: jne LBB24_39
6066 ; AVX512VLDQ-NEXT: LBB24_40: ## %else56
6067 ; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
6068 ; AVX512VLDQ-NEXT: jne LBB24_41
6069 ; AVX512VLDQ-NEXT: LBB24_42: ## %else59
6070 ; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
6071 ; AVX512VLDQ-NEXT: jne LBB24_43
6072 ; AVX512VLDQ-NEXT: LBB24_44: ## %else62
6073 ; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
6074 ; AVX512VLDQ-NEXT: jne LBB24_45
6075 ; AVX512VLDQ-NEXT: LBB24_46: ## %else65
6076 ; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
6077 ; AVX512VLDQ-NEXT: jne LBB24_47
6078 ; AVX512VLDQ-NEXT: LBB24_48: ## %else68
6079 ; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
6080 ; AVX512VLDQ-NEXT: jne LBB24_49
6081 ; AVX512VLDQ-NEXT: LBB24_50: ## %else71
6082 ; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
6083 ; AVX512VLDQ-NEXT: jne LBB24_51
6084 ; AVX512VLDQ-NEXT: LBB24_52: ## %else74
6085 ; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
6086 ; AVX512VLDQ-NEXT: jne LBB24_53
6087 ; AVX512VLDQ-NEXT: LBB24_54: ## %else77
6088 ; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
6089 ; AVX512VLDQ-NEXT: jne LBB24_55
6090 ; AVX512VLDQ-NEXT: LBB24_56: ## %else80
6091 ; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
6092 ; AVX512VLDQ-NEXT: jne LBB24_57
6093 ; AVX512VLDQ-NEXT: LBB24_58: ## %else83
6094 ; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
6095 ; AVX512VLDQ-NEXT: jne LBB24_59
6096 ; AVX512VLDQ-NEXT: LBB24_60: ## %else86
6097 ; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
6098 ; AVX512VLDQ-NEXT: jne LBB24_61
6099 ; AVX512VLDQ-NEXT: LBB24_62: ## %else89
6100 ; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
6101 ; AVX512VLDQ-NEXT: jne LBB24_63
6102 ; AVX512VLDQ-NEXT: LBB24_64: ## %else92
6103 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
6104 ; AVX512VLDQ-NEXT: retq
6105 ; AVX512VLDQ-NEXT: LBB24_1: ## %cond.load
6106 ; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm0
6107 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6108 ; AVX512VLDQ-NEXT: testb $2, %al
6109 ; AVX512VLDQ-NEXT: je LBB24_4
6110 ; AVX512VLDQ-NEXT: LBB24_3: ## %cond.load1
6111 ; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm0
6112 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6113 ; AVX512VLDQ-NEXT: testb $4, %al
6114 ; AVX512VLDQ-NEXT: je LBB24_6
6115 ; AVX512VLDQ-NEXT: LBB24_5: ## %cond.load4
6116 ; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm0
6117 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6118 ; AVX512VLDQ-NEXT: testb $8, %al
6119 ; AVX512VLDQ-NEXT: je LBB24_8
6120 ; AVX512VLDQ-NEXT: LBB24_7: ## %cond.load7
6121 ; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm0
6122 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6123 ; AVX512VLDQ-NEXT: testb $16, %al
6124 ; AVX512VLDQ-NEXT: je LBB24_10
6125 ; AVX512VLDQ-NEXT: LBB24_9: ## %cond.load10
6126 ; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm0
6127 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6128 ; AVX512VLDQ-NEXT: testb $32, %al
6129 ; AVX512VLDQ-NEXT: je LBB24_12
6130 ; AVX512VLDQ-NEXT: LBB24_11: ## %cond.load13
6131 ; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm0
6132 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6133 ; AVX512VLDQ-NEXT: testb $64, %al
6134 ; AVX512VLDQ-NEXT: je LBB24_14
6135 ; AVX512VLDQ-NEXT: LBB24_13: ## %cond.load16
6136 ; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm0
6137 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6138 ; AVX512VLDQ-NEXT: testb %al, %al
6139 ; AVX512VLDQ-NEXT: jns LBB24_16
6140 ; AVX512VLDQ-NEXT: LBB24_15: ## %cond.load19
6141 ; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm0
6142 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6143 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100
6144 ; AVX512VLDQ-NEXT: je LBB24_18
6145 ; AVX512VLDQ-NEXT: LBB24_17: ## %cond.load22
6146 ; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm0
6147 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6148 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200
6149 ; AVX512VLDQ-NEXT: je LBB24_20
6150 ; AVX512VLDQ-NEXT: LBB24_19: ## %cond.load25
6151 ; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm0
6152 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6153 ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400
6154 ; AVX512VLDQ-NEXT: je LBB24_22
6155 ; AVX512VLDQ-NEXT: LBB24_21: ## %cond.load28
6156 ; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm0
6157 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6158 ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800
6159 ; AVX512VLDQ-NEXT: je LBB24_24
6160 ; AVX512VLDQ-NEXT: LBB24_23: ## %cond.load31
6161 ; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm0
6162 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6163 ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000
6164 ; AVX512VLDQ-NEXT: je LBB24_26
6165 ; AVX512VLDQ-NEXT: LBB24_25: ## %cond.load34
6166 ; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm0
6167 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6168 ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000
6169 ; AVX512VLDQ-NEXT: je LBB24_28
6170 ; AVX512VLDQ-NEXT: LBB24_27: ## %cond.load37
6171 ; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm0
6172 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6173 ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000
6174 ; AVX512VLDQ-NEXT: je LBB24_30
6175 ; AVX512VLDQ-NEXT: LBB24_29: ## %cond.load40
6176 ; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm0
6177 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6178 ; AVX512VLDQ-NEXT: testw %ax, %ax
6179 ; AVX512VLDQ-NEXT: jns LBB24_32
6180 ; AVX512VLDQ-NEXT: LBB24_31: ## %cond.load43
6181 ; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm0
6182 ; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6183 ; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
6184 ; AVX512VLDQ-NEXT: je LBB24_34
6185 ; AVX512VLDQ-NEXT: LBB24_33: ## %cond.load46
6186 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6187 ; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
6188 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6189 ; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
6190 ; AVX512VLDQ-NEXT: je LBB24_36
6191 ; AVX512VLDQ-NEXT: LBB24_35: ## %cond.load49
6192 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6193 ; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
6194 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6195 ; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
6196 ; AVX512VLDQ-NEXT: je LBB24_38
6197 ; AVX512VLDQ-NEXT: LBB24_37: ## %cond.load52
6198 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6199 ; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
6200 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6201 ; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
6202 ; AVX512VLDQ-NEXT: je LBB24_40
6203 ; AVX512VLDQ-NEXT: LBB24_39: ## %cond.load55
6204 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6205 ; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
6206 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6207 ; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
6208 ; AVX512VLDQ-NEXT: je LBB24_42
6209 ; AVX512VLDQ-NEXT: LBB24_41: ## %cond.load58
6210 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6211 ; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
6212 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6213 ; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
6214 ; AVX512VLDQ-NEXT: je LBB24_44
6215 ; AVX512VLDQ-NEXT: LBB24_43: ## %cond.load61
6216 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6217 ; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
6218 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6219 ; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
6220 ; AVX512VLDQ-NEXT: je LBB24_46
6221 ; AVX512VLDQ-NEXT: LBB24_45: ## %cond.load64
6222 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6223 ; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
6224 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6225 ; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
6226 ; AVX512VLDQ-NEXT: je LBB24_48
6227 ; AVX512VLDQ-NEXT: LBB24_47: ## %cond.load67
6228 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6229 ; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
6230 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6231 ; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
6232 ; AVX512VLDQ-NEXT: je LBB24_50
6233 ; AVX512VLDQ-NEXT: LBB24_49: ## %cond.load70
6234 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6235 ; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
6236 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6237 ; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
6238 ; AVX512VLDQ-NEXT: je LBB24_52
6239 ; AVX512VLDQ-NEXT: LBB24_51: ## %cond.load73
6240 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6241 ; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
6242 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6243 ; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
6244 ; AVX512VLDQ-NEXT: je LBB24_54
6245 ; AVX512VLDQ-NEXT: LBB24_53: ## %cond.load76
6246 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6247 ; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
6248 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6249 ; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
6250 ; AVX512VLDQ-NEXT: je LBB24_56
6251 ; AVX512VLDQ-NEXT: LBB24_55: ## %cond.load79
6252 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6253 ; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
6254 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6255 ; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
6256 ; AVX512VLDQ-NEXT: je LBB24_58
6257 ; AVX512VLDQ-NEXT: LBB24_57: ## %cond.load82
6258 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6259 ; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
6260 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6261 ; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
6262 ; AVX512VLDQ-NEXT: je LBB24_60
6263 ; AVX512VLDQ-NEXT: LBB24_59: ## %cond.load85
6264 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6265 ; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
6266 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6267 ; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
6268 ; AVX512VLDQ-NEXT: je LBB24_62
6269 ; AVX512VLDQ-NEXT: LBB24_61: ## %cond.load88
6270 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6271 ; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
6272 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6273 ; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
6274 ; AVX512VLDQ-NEXT: je LBB24_64
6275 ; AVX512VLDQ-NEXT: LBB24_63: ## %cond.load91
6276 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
6277 ; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
6278 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
6279 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
6280 ; AVX512VLDQ-NEXT: retq
6282 ; AVX512VLBW-LABEL: load_v32i8_v32i8:
6283 ; AVX512VLBW: ## %bb.0:
6284 ; AVX512VLBW-NEXT: vpmovb2m %ymm0, %k1
6285 ; AVX512VLBW-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
6286 ; AVX512VLBW-NEXT: retq
6288 ; X86-AVX512-LABEL: load_v32i8_v32i8:
6289 ; X86-AVX512: ## %bb.0:
6290 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6291 ; X86-AVX512-NEXT: vpmovb2m %ymm0, %k1
6292 ; X86-AVX512-NEXT: vpblendmb (%eax), %ymm1, %ymm0 {%k1}
6293 ; X86-AVX512-NEXT: retl
6294 %mask = icmp slt <32 x i8> %trigger, zeroinitializer
6295 %res = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x i8> %dst)
6299 ;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
6301 ; 128-bit FP vectors are supported with AVX.
6303 define <4 x float> @mload_constmask_v4f32(ptr %addr, <4 x float> %dst) {
6304 ; SSE2-LABEL: mload_constmask_v4f32:
6306 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6307 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6308 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6309 ; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
6310 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6311 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
6314 ; SSE42-LABEL: mload_constmask_v4f32:
6316 ; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6317 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
6318 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
6319 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
6322 ; AVX1OR2-LABEL: mload_constmask_v4f32:
6323 ; AVX1OR2: ## %bb.0:
6324 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
6325 ; AVX1OR2-NEXT: retq
6327 ; AVX512F-LABEL: mload_constmask_v4f32:
6328 ; AVX512F: ## %bb.0:
6329 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
6330 ; AVX512F-NEXT: movw $13, %ax
6331 ; AVX512F-NEXT: kmovw %eax, %k1
6332 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
6333 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
6334 ; AVX512F-NEXT: vzeroupper
6335 ; AVX512F-NEXT: retq
6337 ; AVX512VLDQ-LABEL: mload_constmask_v4f32:
6338 ; AVX512VLDQ: ## %bb.0:
6339 ; AVX512VLDQ-NEXT: movb $13, %al
6340 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6341 ; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1}
6342 ; AVX512VLDQ-NEXT: retq
6344 ; AVX512VLBW-LABEL: mload_constmask_v4f32:
6345 ; AVX512VLBW: ## %bb.0:
6346 ; AVX512VLBW-NEXT: movb $13, %al
6347 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6348 ; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1}
6349 ; AVX512VLBW-NEXT: retq
6351 ; X86-AVX512-LABEL: mload_constmask_v4f32:
6352 ; X86-AVX512: ## %bb.0:
6353 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6354 ; X86-AVX512-NEXT: movb $13, %cl
6355 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6356 ; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1}
6357 ; X86-AVX512-NEXT: retl
6358 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
6359 ret <4 x float> %res
6362 define <4 x float> @mload_constmask_v4f32_all(ptr %addr) {
6363 ; SSE-LABEL: mload_constmask_v4f32_all:
6365 ; SSE-NEXT: movups (%rdi), %xmm0
6368 ; AVX-LABEL: mload_constmask_v4f32_all:
6370 ; AVX-NEXT: vmovups (%rdi), %xmm0
6373 ; X86-AVX512-LABEL: mload_constmask_v4f32_all:
6374 ; X86-AVX512: ## %bb.0:
6375 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6376 ; X86-AVX512-NEXT: vmovups (%eax), %xmm0
6377 ; X86-AVX512-NEXT: retl
6378 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
6379 ret <4 x float> %res
6382 define <2 x double> @mload_constmask_v2f64(ptr %addr, <2 x double> %dst) {
6383 ; SSE-LABEL: mload_constmask_v2f64:
6385 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
6388 ; AVX-LABEL: mload_constmask_v2f64:
6390 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
6393 ; X86-AVX512-LABEL: mload_constmask_v2f64:
6394 ; X86-AVX512: ## %bb.0:
6395 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6396 ; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
6397 ; X86-AVX512-NEXT: retl
6398 %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
6399 ret <2 x double> %res
6402 ; 128-bit integer vectors are supported with AVX2.
6404 define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) {
6405 ; SSE2-LABEL: mload_constmask_v4i32:
6407 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6408 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
6409 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6410 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6411 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
6412 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
6413 ; SSE2-NEXT: movaps %xmm1, %xmm0
6416 ; SSE42-LABEL: mload_constmask_v4i32:
6418 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
6419 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
6420 ; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
6423 ; AVX1-LABEL: mload_constmask_v4i32:
6425 ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
6426 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
6427 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
6430 ; AVX2-LABEL: mload_constmask_v4i32:
6432 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
6433 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
6434 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
6437 ; AVX512F-LABEL: mload_constmask_v4i32:
6438 ; AVX512F: ## %bb.0:
6439 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
6440 ; AVX512F-NEXT: movw $14, %ax
6441 ; AVX512F-NEXT: kmovw %eax, %k1
6442 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
6443 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
6444 ; AVX512F-NEXT: vzeroupper
6445 ; AVX512F-NEXT: retq
6447 ; AVX512VLDQ-LABEL: mload_constmask_v4i32:
6448 ; AVX512VLDQ: ## %bb.0:
6449 ; AVX512VLDQ-NEXT: movb $14, %al
6450 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6451 ; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
6452 ; AVX512VLDQ-NEXT: retq
6454 ; AVX512VLBW-LABEL: mload_constmask_v4i32:
6455 ; AVX512VLBW: ## %bb.0:
6456 ; AVX512VLBW-NEXT: movb $14, %al
6457 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6458 ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
6459 ; AVX512VLBW-NEXT: retq
6461 ; X86-AVX512-LABEL: mload_constmask_v4i32:
6462 ; X86-AVX512: ## %bb.0:
6463 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6464 ; X86-AVX512-NEXT: movb $14, %cl
6465 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6466 ; X86-AVX512-NEXT: vmovdqu32 (%eax), %xmm0 {%k1}
6467 ; X86-AVX512-NEXT: retl
6468 %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
6472 define <2 x i64> @mload_constmask_v2i64(ptr %addr, <2 x i64> %dst) {
6473 ; SSE2-LABEL: mload_constmask_v2i64:
6475 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
6476 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6479 ; SSE42-LABEL: mload_constmask_v2i64:
6481 ; SSE42-NEXT: pinsrq $1, 8(%rdi), %xmm0
6484 ; AVX-LABEL: mload_constmask_v2i64:
6486 ; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
6489 ; X86-AVX512-LABEL: mload_constmask_v2i64:
6490 ; X86-AVX512: ## %bb.0:
6491 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6492 ; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
6493 ; X86-AVX512-NEXT: retl
6494 %res = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
6498 ; 256-bit FP vectors are supported with AVX.
6500 define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
6501 ; SSE2-LABEL: mload_constmask_v8f32:
6503 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6504 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6505 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
6506 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6507 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
6508 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
6509 ; SSE2-NEXT: movaps %xmm2, %xmm0
6512 ; SSE42-LABEL: mload_constmask_v8f32:
6514 ; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6515 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6516 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
6517 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
6520 ; AVX1OR2-LABEL: mload_constmask_v8f32:
6521 ; AVX1OR2: ## %bb.0:
6522 ; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
6523 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
6524 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
6525 ; AVX1OR2-NEXT: retq
6527 ; AVX512F-LABEL: mload_constmask_v8f32:
6528 ; AVX512F: ## %bb.0:
6529 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6530 ; AVX512F-NEXT: movw $7, %ax
6531 ; AVX512F-NEXT: kmovw %eax, %k1
6532 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
6533 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6534 ; AVX512F-NEXT: retq
6536 ; AVX512VLDQ-LABEL: mload_constmask_v8f32:
6537 ; AVX512VLDQ: ## %bb.0:
6538 ; AVX512VLDQ-NEXT: movb $7, %al
6539 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6540 ; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1}
6541 ; AVX512VLDQ-NEXT: retq
6543 ; AVX512VLBW-LABEL: mload_constmask_v8f32:
6544 ; AVX512VLBW: ## %bb.0:
6545 ; AVX512VLBW-NEXT: movb $7, %al
6546 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6547 ; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1}
6548 ; AVX512VLBW-NEXT: retq
6550 ; X86-AVX512-LABEL: mload_constmask_v8f32:
6551 ; X86-AVX512: ## %bb.0:
6552 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6553 ; X86-AVX512-NEXT: movb $7, %cl
6554 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6555 ; X86-AVX512-NEXT: vmovups (%eax), %ymm0 {%k1}
6556 ; X86-AVX512-NEXT: retl
6557 %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
6558 ret <8 x float> %res
6561 define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
6562 ; SSE2-LABEL: mload_constmask_v8f32_zero:
6564 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6565 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
6566 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
6567 ; SSE2-NEXT: xorps %xmm1, %xmm1
6570 ; SSE42-LABEL: mload_constmask_v8f32_zero:
6572 ; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
6573 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero
6574 ; SSE42-NEXT: xorps %xmm1, %xmm1
6577 ; AVX1OR2-LABEL: mload_constmask_v8f32_zero:
6578 ; AVX1OR2: ## %bb.0:
6579 ; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0]
6580 ; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
6581 ; AVX1OR2-NEXT: retq
6583 ; AVX512F-LABEL: mload_constmask_v8f32_zero:
6584 ; AVX512F: ## %bb.0:
6585 ; AVX512F-NEXT: movw $7, %ax
6586 ; AVX512F-NEXT: kmovw %eax, %k1
6587 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
6588 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6589 ; AVX512F-NEXT: retq
6591 ; AVX512VLDQ-LABEL: mload_constmask_v8f32_zero:
6592 ; AVX512VLDQ: ## %bb.0:
6593 ; AVX512VLDQ-NEXT: movb $7, %al
6594 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6595 ; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
6596 ; AVX512VLDQ-NEXT: retq
6598 ; AVX512VLBW-LABEL: mload_constmask_v8f32_zero:
6599 ; AVX512VLBW: ## %bb.0:
6600 ; AVX512VLBW-NEXT: movb $7, %al
6601 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6602 ; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
6603 ; AVX512VLBW-NEXT: retq
6605 ; X86-AVX512-LABEL: mload_constmask_v8f32_zero:
6606 ; X86-AVX512: ## %bb.0:
6607 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6608 ; X86-AVX512-NEXT: movb $7, %cl
6609 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6610 ; X86-AVX512-NEXT: vmovups (%eax), %ymm0 {%k1} {z}
6611 ; X86-AVX512-NEXT: retl
6612 %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
6613 ret <8 x float> %res
6616 define <4 x double> @mload_constmask_v4f64(ptr %addr, <4 x double> %dst) {
6617 ; SSE-LABEL: mload_constmask_v4f64:
6619 ; SSE-NEXT: movups (%rdi), %xmm0
6620 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
6623 ; AVX1OR2-LABEL: mload_constmask_v4f64:
6624 ; AVX1OR2: ## %bb.0:
6625 ; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
6626 ; AVX1OR2-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
6627 ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
6628 ; AVX1OR2-NEXT: retq
6630 ; AVX512F-LABEL: mload_constmask_v4f64:
6631 ; AVX512F: ## %bb.0:
6632 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6633 ; AVX512F-NEXT: movb $7, %al
6634 ; AVX512F-NEXT: kmovw %eax, %k1
6635 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6636 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6637 ; AVX512F-NEXT: retq
6639 ; AVX512VLDQ-LABEL: mload_constmask_v4f64:
6640 ; AVX512VLDQ: ## %bb.0:
6641 ; AVX512VLDQ-NEXT: movb $7, %al
6642 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6643 ; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1}
6644 ; AVX512VLDQ-NEXT: retq
6646 ; AVX512VLBW-LABEL: mload_constmask_v4f64:
6647 ; AVX512VLBW: ## %bb.0:
6648 ; AVX512VLBW-NEXT: movb $7, %al
6649 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6650 ; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1}
6651 ; AVX512VLBW-NEXT: retq
6653 ; X86-AVX512-LABEL: mload_constmask_v4f64:
6654 ; X86-AVX512: ## %bb.0:
6655 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6656 ; X86-AVX512-NEXT: movb $7, %cl
6657 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6658 ; X86-AVX512-NEXT: vmovupd (%eax), %ymm0 {%k1}
6659 ; X86-AVX512-NEXT: retl
6660 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
6661 ret <4 x double> %res
6664 ; 256-bit integer vectors are supported with AVX2.
6666 define <8 x i32> @mload_constmask_v8i32(ptr %addr, <8 x i32> %dst) {
6667 ; SSE2-LABEL: mload_constmask_v8i32:
6669 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6670 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
6671 ; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
6672 ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
6673 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0]
6674 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2]
6675 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6676 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6677 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
6678 ; SSE2-NEXT: movaps %xmm2, %xmm0
6681 ; SSE42-LABEL: mload_constmask_v8i32:
6683 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0
6684 ; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
6685 ; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
6686 ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1
6689 ; AVX1OR2-LABEL: mload_constmask_v8i32:
6690 ; AVX1OR2: ## %bb.0:
6691 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
6692 ; AVX1OR2-NEXT: retq
6694 ; AVX512F-LABEL: mload_constmask_v8i32:
6695 ; AVX512F: ## %bb.0:
6696 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6697 ; AVX512F-NEXT: movw $135, %ax
6698 ; AVX512F-NEXT: kmovw %eax, %k1
6699 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
6700 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6701 ; AVX512F-NEXT: retq
6703 ; AVX512VLDQ-LABEL: mload_constmask_v8i32:
6704 ; AVX512VLDQ: ## %bb.0:
6705 ; AVX512VLDQ-NEXT: movb $-121, %al
6706 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6707 ; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
6708 ; AVX512VLDQ-NEXT: retq
6710 ; AVX512VLBW-LABEL: mload_constmask_v8i32:
6711 ; AVX512VLBW: ## %bb.0:
6712 ; AVX512VLBW-NEXT: movb $-121, %al
6713 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6714 ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
6715 ; AVX512VLBW-NEXT: retq
6717 ; X86-AVX512-LABEL: mload_constmask_v8i32:
6718 ; X86-AVX512: ## %bb.0:
6719 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6720 ; X86-AVX512-NEXT: movb $-121, %cl
6721 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6722 ; X86-AVX512-NEXT: vmovdqu32 (%eax), %ymm0 {%k1}
6723 ; X86-AVX512-NEXT: retl
6724 %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
6728 define <4 x i64> @mload_constmask_v4i64(ptr %addr, <4 x i64> %dst) {
6729 ; SSE2-LABEL: mload_constmask_v4i64:
6731 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
6732 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
6733 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6736 ; SSE42-LABEL: mload_constmask_v4i64:
6738 ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm0
6739 ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm1
6742 ; AVX1OR2-LABEL: mload_constmask_v4i64:
6743 ; AVX1OR2: ## %bb.0:
6744 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
6745 ; AVX1OR2-NEXT: retq
6747 ; AVX512F-LABEL: mload_constmask_v4i64:
6748 ; AVX512F: ## %bb.0:
6749 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
6750 ; AVX512F-NEXT: movb $9, %al
6751 ; AVX512F-NEXT: kmovw %eax, %k1
6752 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1}
6753 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6754 ; AVX512F-NEXT: retq
6756 ; AVX512VLDQ-LABEL: mload_constmask_v4i64:
6757 ; AVX512VLDQ: ## %bb.0:
6758 ; AVX512VLDQ-NEXT: movb $9, %al
6759 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6760 ; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
6761 ; AVX512VLDQ-NEXT: retq
6763 ; AVX512VLBW-LABEL: mload_constmask_v4i64:
6764 ; AVX512VLBW: ## %bb.0:
6765 ; AVX512VLBW-NEXT: movb $9, %al
6766 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6767 ; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
6768 ; AVX512VLBW-NEXT: retq
6770 ; X86-AVX512-LABEL: mload_constmask_v4i64:
6771 ; X86-AVX512: ## %bb.0:
6772 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6773 ; X86-AVX512-NEXT: movb $9, %cl
6774 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6775 ; X86-AVX512-NEXT: vmovdqu64 (%eax), %ymm0 {%k1}
6776 ; X86-AVX512-NEXT: retl
6777 %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
6781 ; 512-bit FP vectors are supported with AVX512.
6783 define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) {
6784 ; SSE-LABEL: mload_constmask_v8f64:
6786 ; SSE-NEXT: movups (%rdi), %xmm0
6787 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
6788 ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
6791 ; AVX1OR2-LABEL: mload_constmask_v8f64:
6792 ; AVX1OR2: ## %bb.0:
6793 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
6794 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
6795 ; AVX1OR2-NEXT: retq
6797 ; AVX512F-LABEL: mload_constmask_v8f64:
6798 ; AVX512F: ## %bb.0:
6799 ; AVX512F-NEXT: movb $-121, %al
6800 ; AVX512F-NEXT: kmovw %eax, %k1
6801 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6802 ; AVX512F-NEXT: retq
6804 ; AVX512VLDQ-LABEL: mload_constmask_v8f64:
6805 ; AVX512VLDQ: ## %bb.0:
6806 ; AVX512VLDQ-NEXT: movb $-121, %al
6807 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6808 ; AVX512VLDQ-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6809 ; AVX512VLDQ-NEXT: retq
6811 ; AVX512VLBW-LABEL: mload_constmask_v8f64:
6812 ; AVX512VLBW: ## %bb.0:
6813 ; AVX512VLBW-NEXT: movb $-121, %al
6814 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6815 ; AVX512VLBW-NEXT: vmovupd (%rdi), %zmm0 {%k1}
6816 ; AVX512VLBW-NEXT: retq
6818 ; X86-AVX512-LABEL: mload_constmask_v8f64:
6819 ; X86-AVX512: ## %bb.0:
6820 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6821 ; X86-AVX512-NEXT: movb $-121, %cl
6822 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6823 ; X86-AVX512-NEXT: vmovupd (%eax), %zmm0 {%k1}
6824 ; X86-AVX512-NEXT: retl
6825 %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
6826 ret <8 x double> %res
6829 ; Make sure we detect the mask is all ones after type
6830 ; legalization to use an unmasked load for some of the avx512 instructions.
6831 define <16 x double> @mload_constmask_v16f64_allones_split(ptr %addr, <16 x double> %dst) {
6832 ; SSE-LABEL: mload_constmask_v16f64_allones_split:
6834 ; SSE-NEXT: movq %rdi, %rax
6835 ; SSE-NEXT: movups (%rsi), %xmm0
6836 ; SSE-NEXT: movups 16(%rsi), %xmm1
6837 ; SSE-NEXT: movups 32(%rsi), %xmm2
6838 ; SSE-NEXT: movups 48(%rsi), %xmm3
6839 ; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
6840 ; SSE-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
6841 ; SSE-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
6842 ; SSE-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
6843 ; SSE-NEXT: movaps %xmm7, 112(%rdi)
6844 ; SSE-NEXT: movaps %xmm6, 96(%rdi)
6845 ; SSE-NEXT: movaps %xmm5, 80(%rdi)
6846 ; SSE-NEXT: movaps %xmm4, 64(%rdi)
6847 ; SSE-NEXT: movaps %xmm3, 48(%rdi)
6848 ; SSE-NEXT: movaps %xmm2, 32(%rdi)
6849 ; SSE-NEXT: movaps %xmm1, 16(%rdi)
6850 ; SSE-NEXT: movaps %xmm0, (%rdi)
6853 ; AVX1OR2-LABEL: mload_constmask_v16f64_allones_split:
6854 ; AVX1OR2: ## %bb.0:
6855 ; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
6856 ; AVX1OR2-NEXT: ## ymm0 = mem[0,1,0,1]
6857 ; AVX1OR2-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1
6858 ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
6859 ; AVX1OR2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0
6860 ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
6861 ; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
6862 ; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1
6863 ; AVX1OR2-NEXT: retq
6865 ; AVX512F-LABEL: mload_constmask_v16f64_allones_split:
6866 ; AVX512F: ## %bb.0:
6867 ; AVX512F-NEXT: movb $85, %al
6868 ; AVX512F-NEXT: kmovw %eax, %k1
6869 ; AVX512F-NEXT: vmovupd 64(%rdi), %zmm1 {%k1}
6870 ; AVX512F-NEXT: vmovups (%rdi), %zmm0
6871 ; AVX512F-NEXT: retq
6873 ; AVX512VLDQ-LABEL: mload_constmask_v16f64_allones_split:
6874 ; AVX512VLDQ: ## %bb.0:
6875 ; AVX512VLDQ-NEXT: movb $85, %al
6876 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6877 ; AVX512VLDQ-NEXT: vmovupd 64(%rdi), %zmm1 {%k1}
6878 ; AVX512VLDQ-NEXT: vmovups (%rdi), %zmm0
6879 ; AVX512VLDQ-NEXT: retq
6881 ; AVX512VLBW-LABEL: mload_constmask_v16f64_allones_split:
6882 ; AVX512VLBW: ## %bb.0:
6883 ; AVX512VLBW-NEXT: movb $85, %al
6884 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6885 ; AVX512VLBW-NEXT: vmovupd 64(%rdi), %zmm1 {%k1}
6886 ; AVX512VLBW-NEXT: vmovups (%rdi), %zmm0
6887 ; AVX512VLBW-NEXT: retq
6889 ; X86-AVX512-LABEL: mload_constmask_v16f64_allones_split:
6890 ; X86-AVX512: ## %bb.0:
6891 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6892 ; X86-AVX512-NEXT: movb $85, %cl
6893 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6894 ; X86-AVX512-NEXT: vmovupd 64(%eax), %zmm1 {%k1}
6895 ; X86-AVX512-NEXT: vmovups (%eax), %zmm0
6896 ; X86-AVX512-NEXT: retl
6897 %res = call <16 x double> @llvm.masked.load.v16f64.p0(ptr %addr, i32 4, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x double> %dst)
6898 ret <16 x double> %res
6901 ; If the pass-through operand is undef, no blend is needed.
6903 define <4 x double> @mload_constmask_v4f64_undef_passthrough(ptr %addr) {
6904 ; SSE-LABEL: mload_constmask_v4f64_undef_passthrough:
6906 ; SSE-NEXT: movups (%rdi), %xmm0
6907 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
6910 ; AVX1OR2-LABEL: mload_constmask_v4f64_undef_passthrough:
6911 ; AVX1OR2: ## %bb.0:
6912 ; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
6913 ; AVX1OR2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
6914 ; AVX1OR2-NEXT: retq
6916 ; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
6917 ; AVX512F: ## %bb.0:
6918 ; AVX512F-NEXT: movb $7, %al
6919 ; AVX512F-NEXT: kmovw %eax, %k1
6920 ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
6921 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6922 ; AVX512F-NEXT: retq
6924 ; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough:
6925 ; AVX512VLDQ: ## %bb.0:
6926 ; AVX512VLDQ-NEXT: movb $7, %al
6927 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6928 ; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
6929 ; AVX512VLDQ-NEXT: retq
6931 ; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
6932 ; AVX512VLBW: ## %bb.0:
6933 ; AVX512VLBW-NEXT: movb $7, %al
6934 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6935 ; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
6936 ; AVX512VLBW-NEXT: retq
6938 ; X86-AVX512-LABEL: mload_constmask_v4f64_undef_passthrough:
6939 ; X86-AVX512: ## %bb.0:
6940 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6941 ; X86-AVX512-NEXT: movb $7, %cl
6942 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6943 ; X86-AVX512-NEXT: vmovupd (%eax), %ymm0 {%k1} {z}
6944 ; X86-AVX512-NEXT: retl
6945 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
6946 ret <4 x double> %res
6949 define <4 x i64> @mload_constmask_v4i64_undef_passthrough(ptr %addr) {
6950 ; SSE-LABEL: mload_constmask_v4i64_undef_passthrough:
6952 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
6953 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
6954 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6957 ; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
6959 ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
6960 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
6963 ; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
6965 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
6966 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
6969 ; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
6970 ; AVX512F: ## %bb.0:
6971 ; AVX512F-NEXT: movb $6, %al
6972 ; AVX512F-NEXT: kmovw %eax, %k1
6973 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
6974 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
6975 ; AVX512F-NEXT: retq
6977 ; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough:
6978 ; AVX512VLDQ: ## %bb.0:
6979 ; AVX512VLDQ-NEXT: movb $6, %al
6980 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
6981 ; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
6982 ; AVX512VLDQ-NEXT: retq
6984 ; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
6985 ; AVX512VLBW: ## %bb.0:
6986 ; AVX512VLBW-NEXT: movb $6, %al
6987 ; AVX512VLBW-NEXT: kmovd %eax, %k1
6988 ; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
6989 ; AVX512VLBW-NEXT: retq
6991 ; X86-AVX512-LABEL: mload_constmask_v4i64_undef_passthrough:
6992 ; X86-AVX512: ## %bb.0:
6993 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
6994 ; X86-AVX512-NEXT: movb $6, %cl
6995 ; X86-AVX512-NEXT: kmovd %ecx, %k1
6996 ; X86-AVX512-NEXT: vmovdqu64 (%eax), %ymm0 {%k1} {z}
6997 ; X86-AVX512-NEXT: retl
6998 %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
7002 ; When only one element of the mask is set, reduce to a scalar load.
7004 define <4 x i32> @load_one_mask_bit_set1(ptr %addr, <4 x i32> %val) {
7005 ; SSE2-LABEL: load_one_mask_bit_set1:
7007 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
7008 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
7011 ; SSE42-LABEL: load_one_mask_bit_set1:
7013 ; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0
7016 ; AVX-LABEL: load_one_mask_bit_set1:
7018 ; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
7021 ; X86-AVX512-LABEL: load_one_mask_bit_set1:
7022 ; X86-AVX512: ## %bb.0:
7023 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7024 ; X86-AVX512-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
7025 ; X86-AVX512-NEXT: retl
7026 %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
7030 ; Choose a different element to show that the correct address offset is produced.
7032 define <4 x float> @load_one_mask_bit_set2(ptr %addr, <4 x float> %val) {
7033 ; SSE2-LABEL: load_one_mask_bit_set2:
7035 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
7036 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
7037 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
7040 ; SSE42-LABEL: load_one_mask_bit_set2:
7042 ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
7045 ; AVX-LABEL: load_one_mask_bit_set2:
7047 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
7050 ; X86-AVX512-LABEL: load_one_mask_bit_set2:
7051 ; X86-AVX512: ## %bb.0:
7052 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7053 ; X86-AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
7054 ; X86-AVX512-NEXT: retl
7055 %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
7056 ret <4 x float> %res
7059 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
7061 define <4 x i64> @load_one_mask_bit_set3(ptr %addr, <4 x i64> %val) {
7062 ; SSE2-LABEL: load_one_mask_bit_set3:
7064 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
7067 ; SSE42-LABEL: load_one_mask_bit_set3:
7069 ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm1
7072 ; AVX-LABEL: load_one_mask_bit_set3:
7074 ; AVX-NEXT: vbroadcastsd 16(%rdi), %ymm1
7075 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
7078 ; X86-AVX512-LABEL: load_one_mask_bit_set3:
7079 ; X86-AVX512: ## %bb.0:
7080 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7081 ; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %ymm1
7082 ; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
7083 ; X86-AVX512-NEXT: retl
7084 %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
7088 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
7090 define <4 x double> @load_one_mask_bit_set4(ptr %addr, <4 x double> %val) {
7091 ; SSE-LABEL: load_one_mask_bit_set4:
7093 ; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
7096 ; AVX-LABEL: load_one_mask_bit_set4:
7098 ; AVX-NEXT: vbroadcastsd 24(%rdi), %ymm1
7099 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7102 ; X86-AVX512-LABEL: load_one_mask_bit_set4:
7103 ; X86-AVX512: ## %bb.0:
7104 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7105 ; X86-AVX512-NEXT: vbroadcastsd 24(%eax), %ymm1
7106 ; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7107 ; X86-AVX512-NEXT: retl
7108 %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
7109 ret <4 x double> %res
7112 ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
7114 define <8 x double> @load_one_mask_bit_set5(ptr %addr, <8 x double> %val) {
7115 ; SSE-LABEL: load_one_mask_bit_set5:
7117 ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
7120 ; AVX1OR2-LABEL: load_one_mask_bit_set5:
7121 ; AVX1OR2: ## %bb.0:
7122 ; AVX1OR2-NEXT: vbroadcastsd 56(%rdi), %ymm2
7123 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
7124 ; AVX1OR2-NEXT: retq
7126 ; AVX512F-LABEL: load_one_mask_bit_set5:
7127 ; AVX512F: ## %bb.0:
7128 ; AVX512F-NEXT: movb $-128, %al
7129 ; AVX512F-NEXT: kmovw %eax, %k1
7130 ; AVX512F-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
7131 ; AVX512F-NEXT: retq
7133 ; AVX512VLDQ-LABEL: load_one_mask_bit_set5:
7134 ; AVX512VLDQ: ## %bb.0:
7135 ; AVX512VLDQ-NEXT: movb $-128, %al
7136 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
7137 ; AVX512VLDQ-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
7138 ; AVX512VLDQ-NEXT: retq
7140 ; AVX512VLBW-LABEL: load_one_mask_bit_set5:
7141 ; AVX512VLBW: ## %bb.0:
7142 ; AVX512VLBW-NEXT: movb $-128, %al
7143 ; AVX512VLBW-NEXT: kmovd %eax, %k1
7144 ; AVX512VLBW-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
7145 ; AVX512VLBW-NEXT: retq
7147 ; X86-AVX512-LABEL: load_one_mask_bit_set5:
7148 ; X86-AVX512: ## %bb.0:
7149 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7150 ; X86-AVX512-NEXT: movb $-128, %cl
7151 ; X86-AVX512-NEXT: kmovd %ecx, %k1
7152 ; X86-AVX512-NEXT: vbroadcastsd 56(%eax), %zmm0 {%k1}
7153 ; X86-AVX512-NEXT: retl
7154 %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
7155 ret <8 x double> %res
7158 define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) {
7159 ; SSE2-LABEL: load_one_mask_bit_set6:
7161 ; SSE2-NEXT: movq %rdi, %rax
7162 ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
7163 ; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
7164 ; SSE2-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero
7165 ; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0]
7166 ; SSE2-NEXT: movaps %xmm7, 112(%rdi)
7167 ; SSE2-NEXT: movaps %xmm5, 80(%rdi)
7168 ; SSE2-NEXT: movaps %xmm4, 64(%rdi)
7169 ; SSE2-NEXT: movaps %xmm3, 48(%rdi)
7170 ; SSE2-NEXT: movaps %xmm2, 32(%rdi)
7171 ; SSE2-NEXT: movaps %xmm1, 16(%rdi)
7172 ; SSE2-NEXT: movaps %xmm0, (%rdi)
7173 ; SSE2-NEXT: movaps %xmm6, 96(%rdi)
7176 ; SSE42-LABEL: load_one_mask_bit_set6:
7178 ; SSE42-NEXT: movq %rdi, %rax
7179 ; SSE42-NEXT: pinsrq $0, 16(%rsi), %xmm1
7180 ; SSE42-NEXT: pinsrq $0, 80(%rsi), %xmm5
7181 ; SSE42-NEXT: pinsrq $1, 104(%rsi), %xmm6
7182 ; SSE42-NEXT: movaps %xmm7, 112(%rdi)
7183 ; SSE42-NEXT: movdqa %xmm6, 96(%rdi)
7184 ; SSE42-NEXT: movdqa %xmm5, 80(%rdi)
7185 ; SSE42-NEXT: movaps %xmm4, 64(%rdi)
7186 ; SSE42-NEXT: movaps %xmm3, 48(%rdi)
7187 ; SSE42-NEXT: movaps %xmm2, 32(%rdi)
7188 ; SSE42-NEXT: movdqa %xmm1, 16(%rdi)
7189 ; SSE42-NEXT: movaps %xmm0, (%rdi)
7192 ; AVX1-LABEL: load_one_mask_bit_set6:
7194 ; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,18446744073709551615,0]
7195 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm5
7196 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3]
7197 ; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm4, %ymm4
7198 ; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
7199 ; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [0,18446744073709551615,0,0]
7200 ; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm4, %ymm4
7201 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
7204 ; AVX2-LABEL: load_one_mask_bit_set6:
7206 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,18446744073709551615,0]
7207 ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm4, %ymm5
7208 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
7209 ; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm4, %ymm4
7210 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
7211 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,18446744073709551615,0,0]
7212 ; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm4, %ymm4
7213 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
7216 ; AVX512F-LABEL: load_one_mask_bit_set6:
7217 ; AVX512F: ## %bb.0:
7218 ; AVX512F-NEXT: movb $4, %al
7219 ; AVX512F-NEXT: kmovw %eax, %k1
7220 ; AVX512F-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
7221 ; AVX512F-NEXT: movb $36, %al
7222 ; AVX512F-NEXT: kmovw %eax, %k1
7223 ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
7224 ; AVX512F-NEXT: retq
7226 ; AVX512VLDQ-LABEL: load_one_mask_bit_set6:
7227 ; AVX512VLDQ: ## %bb.0:
7228 ; AVX512VLDQ-NEXT: movb $4, %al
7229 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
7230 ; AVX512VLDQ-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
7231 ; AVX512VLDQ-NEXT: movb $36, %al
7232 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
7233 ; AVX512VLDQ-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
7234 ; AVX512VLDQ-NEXT: retq
7236 ; AVX512VLBW-LABEL: load_one_mask_bit_set6:
7237 ; AVX512VLBW: ## %bb.0:
7238 ; AVX512VLBW-NEXT: movb $4, %al
7239 ; AVX512VLBW-NEXT: kmovd %eax, %k1
7240 ; AVX512VLBW-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
7241 ; AVX512VLBW-NEXT: movb $36, %al
7242 ; AVX512VLBW-NEXT: kmovd %eax, %k1
7243 ; AVX512VLBW-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
7244 ; AVX512VLBW-NEXT: retq
7246 ; X86-AVX512-LABEL: load_one_mask_bit_set6:
7247 ; X86-AVX512: ## %bb.0:
7248 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7249 ; X86-AVX512-NEXT: movb $4, %cl
7250 ; X86-AVX512-NEXT: kmovd %ecx, %k1
7251 ; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %zmm0 {%k1}
7252 ; X86-AVX512-NEXT: movb $36, %cl
7253 ; X86-AVX512-NEXT: kmovd %ecx, %k1
7254 ; X86-AVX512-NEXT: vmovdqu64 64(%eax), %zmm1 {%k1}
7255 ; X86-AVX512-NEXT: retl
7256 %res = call <16 x i64> @llvm.masked.load.v16i64.p0(ptr %addr, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>, <16 x i64> %val)
7260 define i32 @pr38986(i1 %c, ptr %p) {
7261 ; SSE-LABEL: pr38986:
7263 ; SSE-NEXT: testb $1, %dil
7264 ; SSE-NEXT: ## implicit-def: $eax
7265 ; SSE-NEXT: je LBB45_2
7266 ; SSE-NEXT: ## %bb.1: ## %cond.load
7267 ; SSE-NEXT: movl (%rsi), %eax
7268 ; SSE-NEXT: LBB45_2: ## %else
7271 ; AVX-LABEL: pr38986:
7273 ; AVX-NEXT: testb $1, %dil
7274 ; AVX-NEXT: ## implicit-def: $eax
7275 ; AVX-NEXT: je LBB45_2
7276 ; AVX-NEXT: ## %bb.1: ## %cond.load
7277 ; AVX-NEXT: movl (%rsi), %eax
7278 ; AVX-NEXT: LBB45_2: ## %else
7281 ; X86-AVX512-LABEL: pr38986:
7282 ; X86-AVX512: ## %bb.0:
7283 ; X86-AVX512-NEXT: testb $1, {{[0-9]+}}(%esp)
7284 ; X86-AVX512-NEXT: ## implicit-def: $eax
7285 ; X86-AVX512-NEXT: je LBB45_2
7286 ; X86-AVX512-NEXT: ## %bb.1: ## %cond.load
7287 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
7288 ; X86-AVX512-NEXT: movl (%eax), %eax
7289 ; X86-AVX512-NEXT: LBB45_2: ## %else
7290 ; X86-AVX512-NEXT: retl
7291 %vc = insertelement <1 x i1> undef, i1 %c, i32 0
7292 %L = call <1 x i32> @llvm.masked.load.v1i32.p0 (ptr %p, i32 4, <1 x i1> %vc, <1 x i32> undef)
7293 %ret = bitcast <1 x i32> %L to i32
7297 define <2 x double> @zero_mask(ptr %addr, <2 x double> %dst) {
7298 ; SSE-LABEL: zero_mask:
7302 ; AVX-LABEL: zero_mask:
7306 ; X86-AVX512-LABEL: zero_mask:
7307 ; X86-AVX512: ## %bb.0:
7308 ; X86-AVX512-NEXT: retl
7309 %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> zeroinitializer, <2 x double> %dst)
7310 ret <2 x double> %res
7313 declare <16 x double> @llvm.masked.load.v16f64.p0(ptr, i32, <16 x i1>, <16 x double>)
7314 declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
7315 declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
7316 declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
7317 declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
7319 declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
7320 declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
7321 declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
7322 declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
7324 declare <16 x i64> @llvm.masked.load.v16i64.p0(ptr, i32, <16 x i1>, <16 x i64>)
7325 declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
7326 declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
7327 declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
7328 declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
7330 declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
7331 declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
7332 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
7333 declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
7334 declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
7336 declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
7337 declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
7338 declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
7339 declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
7341 declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
7342 declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
7343 declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
7344 declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)