1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlvbmi2-builtins.c
7 define <2 x i64> @test_mm_mask_compress_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__D) {
8 ; X86-LABEL: test_mm_mask_compress_epi16:
9 ; X86: # %bb.0: # %entry
10 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
11 ; X86-NEXT: kmovd %eax, %k1
12 ; X86-NEXT: vpcompressw %xmm1, %xmm0 {%k1}
15 ; X64-LABEL: test_mm_mask_compress_epi16:
16 ; X64: # %bb.0: # %entry
17 ; X64-NEXT: kmovd %edi, %k1
18 ; X64-NEXT: vpcompressw %xmm1, %xmm0 {%k1}
21 %0 = bitcast <2 x i64> %__D to <8 x i16>
22 %1 = bitcast <2 x i64> %__S to <8 x i16>
23 %2 = tail call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %0, <8 x i16> %1, i8 %__U)
24 %3 = bitcast <8 x i16> %2 to <2 x i64>
28 define <2 x i64> @test_mm_maskz_compress_epi16(i8 zeroext %__U, <2 x i64> %__D) {
29 ; X86-LABEL: test_mm_maskz_compress_epi16:
30 ; X86: # %bb.0: # %entry
31 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
32 ; X86-NEXT: kmovd %eax, %k1
33 ; X86-NEXT: vpcompressw %xmm0, %xmm0 {%k1} {z}
36 ; X64-LABEL: test_mm_maskz_compress_epi16:
37 ; X64: # %bb.0: # %entry
38 ; X64-NEXT: kmovd %edi, %k1
39 ; X64-NEXT: vpcompressw %xmm0, %xmm0 {%k1} {z}
42 %0 = bitcast <2 x i64> %__D to <8 x i16>
43 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %0, <8 x i16> zeroinitializer, i8 %__U)
44 %2 = bitcast <8 x i16> %1 to <2 x i64>
48 define <2 x i64> @test_mm_mask_compress_epi8(<2 x i64> %__S, i16 zeroext %__U, <2 x i64> %__D) {
49 ; X86-LABEL: test_mm_mask_compress_epi8:
50 ; X86: # %bb.0: # %entry
51 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
52 ; X86-NEXT: vpcompressb %xmm1, %xmm0 {%k1}
55 ; X64-LABEL: test_mm_mask_compress_epi8:
56 ; X64: # %bb.0: # %entry
57 ; X64-NEXT: kmovd %edi, %k1
58 ; X64-NEXT: vpcompressb %xmm1, %xmm0 {%k1}
61 %0 = bitcast <2 x i64> %__D to <16 x i8>
62 %1 = bitcast <2 x i64> %__S to <16 x i8>
63 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %0, <16 x i8> %1, i16 %__U)
64 %3 = bitcast <16 x i8> %2 to <2 x i64>
68 define <2 x i64> @test_mm_maskz_compress_epi8(i16 zeroext %__U, <2 x i64> %__D) {
69 ; X86-LABEL: test_mm_maskz_compress_epi8:
70 ; X86: # %bb.0: # %entry
71 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
72 ; X86-NEXT: vpcompressb %xmm0, %xmm0 {%k1} {z}
75 ; X64-LABEL: test_mm_maskz_compress_epi8:
76 ; X64: # %bb.0: # %entry
77 ; X64-NEXT: kmovd %edi, %k1
78 ; X64-NEXT: vpcompressb %xmm0, %xmm0 {%k1} {z}
81 %0 = bitcast <2 x i64> %__D to <16 x i8>
82 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %0, <16 x i8> zeroinitializer, i16 %__U)
83 %2 = bitcast <16 x i8> %1 to <2 x i64>
87 define void @test_mm_mask_compressstoreu_epi16(i8* %__P, i8 zeroext %__U, <2 x i64> %__D) {
88 ; X86-LABEL: test_mm_mask_compressstoreu_epi16:
89 ; X86: # %bb.0: # %entry
90 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
91 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
92 ; X86-NEXT: kmovd %eax, %k1
93 ; X86-NEXT: vpcompressw %xmm0, (%ecx) {%k1}
96 ; X64-LABEL: test_mm_mask_compressstoreu_epi16:
97 ; X64: # %bb.0: # %entry
98 ; X64-NEXT: kmovd %esi, %k1
99 ; X64-NEXT: vpcompressw %xmm0, (%rdi) {%k1}
102 %0 = bitcast <2 x i64> %__D to <8 x i16>
103 %1 = bitcast i8* %__P to i16*
104 %2 = bitcast i8 %__U to <8 x i1>
105 tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %0, i16* %1, <8 x i1> %2)
109 define void @test_mm_mask_compressstoreu_epi8(i8* %__P, i16 zeroext %__U, <2 x i64> %__D) {
110 ; X86-LABEL: test_mm_mask_compressstoreu_epi8:
111 ; X86: # %bb.0: # %entry
112 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
113 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
114 ; X86-NEXT: vpcompressb %xmm0, (%eax) {%k1}
117 ; X64-LABEL: test_mm_mask_compressstoreu_epi8:
118 ; X64: # %bb.0: # %entry
119 ; X64-NEXT: kmovd %esi, %k1
120 ; X64-NEXT: vpcompressb %xmm0, (%rdi) {%k1}
123 %0 = bitcast <2 x i64> %__D to <16 x i8>
124 %1 = bitcast i16 %__U to <16 x i1>
125 tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %0, i8* %__P, <16 x i1> %1)
129 define <2 x i64> @test_mm_mask_expand_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__D) {
130 ; X86-LABEL: test_mm_mask_expand_epi16:
131 ; X86: # %bb.0: # %entry
132 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
133 ; X86-NEXT: kmovd %eax, %k1
134 ; X86-NEXT: vpexpandw %xmm1, %xmm0 {%k1}
137 ; X64-LABEL: test_mm_mask_expand_epi16:
138 ; X64: # %bb.0: # %entry
139 ; X64-NEXT: kmovd %edi, %k1
140 ; X64-NEXT: vpexpandw %xmm1, %xmm0 {%k1}
143 %0 = bitcast <2 x i64> %__D to <8 x i16>
144 %1 = bitcast <2 x i64> %__S to <8 x i16>
145 %2 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %0, <8 x i16> %1, i8 %__U)
146 %3 = bitcast <8 x i16> %2 to <2 x i64>
150 define <2 x i64> @test_mm_maskz_expand_epi16(i8 zeroext %__U, <2 x i64> %__D) {
151 ; X86-LABEL: test_mm_maskz_expand_epi16:
152 ; X86: # %bb.0: # %entry
153 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
154 ; X86-NEXT: kmovd %eax, %k1
155 ; X86-NEXT: vpexpandw %xmm0, %xmm0 {%k1} {z}
158 ; X64-LABEL: test_mm_maskz_expand_epi16:
159 ; X64: # %bb.0: # %entry
160 ; X64-NEXT: kmovd %edi, %k1
161 ; X64-NEXT: vpexpandw %xmm0, %xmm0 {%k1} {z}
164 %0 = bitcast <2 x i64> %__D to <8 x i16>
165 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %0, <8 x i16> zeroinitializer, i8 %__U)
166 %2 = bitcast <8 x i16> %1 to <2 x i64>
170 define <2 x i64> @test_mm_mask_expand_epi8(<2 x i64> %__S, i16 zeroext %__U, <2 x i64> %__D) {
171 ; X86-LABEL: test_mm_mask_expand_epi8:
172 ; X86: # %bb.0: # %entry
173 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
174 ; X86-NEXT: vpexpandb %xmm1, %xmm0 {%k1}
177 ; X64-LABEL: test_mm_mask_expand_epi8:
178 ; X64: # %bb.0: # %entry
179 ; X64-NEXT: kmovd %edi, %k1
180 ; X64-NEXT: vpexpandb %xmm1, %xmm0 {%k1}
183 %0 = bitcast <2 x i64> %__D to <16 x i8>
184 %1 = bitcast <2 x i64> %__S to <16 x i8>
185 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %0, <16 x i8> %1, i16 %__U)
186 %3 = bitcast <16 x i8> %2 to <2 x i64>
190 define <2 x i64> @test_mm_maskz_expand_epi8(i16 zeroext %__U, <2 x i64> %__D) {
191 ; X86-LABEL: test_mm_maskz_expand_epi8:
192 ; X86: # %bb.0: # %entry
193 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
194 ; X86-NEXT: vpexpandb %xmm0, %xmm0 {%k1} {z}
197 ; X64-LABEL: test_mm_maskz_expand_epi8:
198 ; X64: # %bb.0: # %entry
199 ; X64-NEXT: kmovd %edi, %k1
200 ; X64-NEXT: vpexpandb %xmm0, %xmm0 {%k1} {z}
203 %0 = bitcast <2 x i64> %__D to <16 x i8>
204 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %0, <16 x i8> zeroinitializer, i16 %__U)
205 %2 = bitcast <16 x i8> %1 to <2 x i64>
209 define <2 x i64> @test_mm_mask_expandloadu_epi16(<2 x i64> %__S, i8 zeroext %__U, i8* readonly %__P) {
210 ; X86-LABEL: test_mm_mask_expandloadu_epi16:
211 ; X86: # %bb.0: # %entry
212 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
213 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
214 ; X86-NEXT: kmovd %ecx, %k1
215 ; X86-NEXT: vpexpandw (%eax), %xmm0 {%k1}
218 ; X64-LABEL: test_mm_mask_expandloadu_epi16:
219 ; X64: # %bb.0: # %entry
220 ; X64-NEXT: kmovd %edi, %k1
221 ; X64-NEXT: vpexpandw (%rsi), %xmm0 {%k1}
224 %0 = bitcast <2 x i64> %__S to <8 x i16>
225 %1 = bitcast i8* %__P to i16*
226 %2 = bitcast i8 %__U to <8 x i1>
227 %3 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %1, <8 x i1> %2, <8 x i16> %0)
228 %4 = bitcast <8 x i16> %3 to <2 x i64>
232 define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, i8* readonly %__P) {
233 ; X86-LABEL: test_mm_maskz_expandloadu_epi16:
234 ; X86: # %bb.0: # %entry
235 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
236 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
237 ; X86-NEXT: kmovd %ecx, %k1
238 ; X86-NEXT: vpexpandw (%eax), %xmm0 {%k1} {z}
241 ; X64-LABEL: test_mm_maskz_expandloadu_epi16:
242 ; X64: # %bb.0: # %entry
243 ; X64-NEXT: kmovd %edi, %k1
244 ; X64-NEXT: vpexpandw (%rsi), %xmm0 {%k1} {z}
247 %0 = bitcast i8* %__P to i16*
248 %1 = bitcast i8 %__U to <8 x i1>
249 %2 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %0, <8 x i1> %1, <8 x i16> zeroinitializer)
250 %3 = bitcast <8 x i16> %2 to <2 x i64>
254 define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U, i8* readonly %__P) {
255 ; X86-LABEL: test_mm_mask_expandloadu_epi8:
256 ; X86: # %bb.0: # %entry
257 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
258 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
259 ; X86-NEXT: vpexpandb (%eax), %xmm0 {%k1}
262 ; X64-LABEL: test_mm_mask_expandloadu_epi8:
263 ; X64: # %bb.0: # %entry
264 ; X64-NEXT: kmovd %edi, %k1
265 ; X64-NEXT: vpexpandb (%rsi), %xmm0 {%k1}
268 %0 = bitcast <2 x i64> %__S to <16 x i8>
269 %1 = bitcast i16 %__U to <16 x i1>
270 %2 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %1, <16 x i8> %0)
271 %3 = bitcast <16 x i8> %2 to <2 x i64>
275 define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, i8* readonly %__P) {
276 ; X86-LABEL: test_mm_maskz_expandloadu_epi8:
277 ; X86: # %bb.0: # %entry
278 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
279 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
280 ; X86-NEXT: vpexpandb (%eax), %xmm0 {%k1} {z}
283 ; X64-LABEL: test_mm_maskz_expandloadu_epi8:
284 ; X64: # %bb.0: # %entry
285 ; X64-NEXT: kmovd %edi, %k1
286 ; X64-NEXT: vpexpandb (%rsi), %xmm0 {%k1} {z}
289 %0 = bitcast i16 %__U to <16 x i1>
290 %1 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %0, <16 x i8> zeroinitializer)
291 %2 = bitcast <16 x i8> %1 to <2 x i64>
295 define <4 x i64> @test_mm256_mask_compress_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
296 ; X86-LABEL: test_mm256_mask_compress_epi16:
297 ; X86: # %bb.0: # %entry
298 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
299 ; X86-NEXT: vpcompressw %ymm1, %ymm0 {%k1}
302 ; X64-LABEL: test_mm256_mask_compress_epi16:
303 ; X64: # %bb.0: # %entry
304 ; X64-NEXT: kmovd %edi, %k1
305 ; X64-NEXT: vpcompressw %ymm1, %ymm0 {%k1}
308 %0 = bitcast <4 x i64> %__D to <16 x i16>
309 %1 = bitcast <4 x i64> %__S to <16 x i16>
310 %2 = tail call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %0, <16 x i16> %1, i16 %__U)
311 %3 = bitcast <16 x i16> %2 to <4 x i64>
315 define <4 x i64> @test_mm256_maskz_compress_epi16(i16 zeroext %__U, <4 x i64> %__D) {
316 ; X86-LABEL: test_mm256_maskz_compress_epi16:
317 ; X86: # %bb.0: # %entry
318 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
319 ; X86-NEXT: vpcompressw %ymm0, %ymm0 {%k1} {z}
322 ; X64-LABEL: test_mm256_maskz_compress_epi16:
323 ; X64: # %bb.0: # %entry
324 ; X64-NEXT: kmovd %edi, %k1
325 ; X64-NEXT: vpcompressw %ymm0, %ymm0 {%k1} {z}
328 %0 = bitcast <4 x i64> %__D to <16 x i16>
329 %1 = tail call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %0, <16 x i16> zeroinitializer, i16 %__U)
330 %2 = bitcast <16 x i16> %1 to <4 x i64>
334 define <4 x i64> @test_mm256_mask_compress_epi8(<4 x i64> %__S, i32 %__U, <4 x i64> %__D) {
335 ; X86-LABEL: test_mm256_mask_compress_epi8:
336 ; X86: # %bb.0: # %entry
337 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
338 ; X86-NEXT: vpcompressb %ymm1, %ymm0 {%k1}
341 ; X64-LABEL: test_mm256_mask_compress_epi8:
342 ; X64: # %bb.0: # %entry
343 ; X64-NEXT: kmovd %edi, %k1
344 ; X64-NEXT: vpcompressb %ymm1, %ymm0 {%k1}
347 %0 = bitcast <4 x i64> %__D to <32 x i8>
348 %1 = bitcast <4 x i64> %__S to <32 x i8>
349 %2 = tail call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %0, <32 x i8> %1, i32 %__U)
350 %3 = bitcast <32 x i8> %2 to <4 x i64>
354 define <4 x i64> @test_mm256_maskz_compress_epi8(i32 %__U, <4 x i64> %__D) {
355 ; X86-LABEL: test_mm256_maskz_compress_epi8:
356 ; X86: # %bb.0: # %entry
357 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
358 ; X86-NEXT: vpcompressb %ymm0, %ymm0 {%k1} {z}
361 ; X64-LABEL: test_mm256_maskz_compress_epi8:
362 ; X64: # %bb.0: # %entry
363 ; X64-NEXT: kmovd %edi, %k1
364 ; X64-NEXT: vpcompressb %ymm0, %ymm0 {%k1} {z}
367 %0 = bitcast <4 x i64> %__D to <32 x i8>
368 %1 = tail call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %0, <32 x i8> zeroinitializer, i32 %__U)
369 %2 = bitcast <32 x i8> %1 to <4 x i64>
373 define void @test_mm256_mask_compressstoreu_epi16(i8* %__P, i16 zeroext %__U, <4 x i64> %__D) {
374 ; X86-LABEL: test_mm256_mask_compressstoreu_epi16:
375 ; X86: # %bb.0: # %entry
376 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
377 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
378 ; X86-NEXT: vpcompressw %ymm0, (%eax) {%k1}
379 ; X86-NEXT: vzeroupper
382 ; X64-LABEL: test_mm256_mask_compressstoreu_epi16:
383 ; X64: # %bb.0: # %entry
384 ; X64-NEXT: kmovd %esi, %k1
385 ; X64-NEXT: vpcompressw %ymm0, (%rdi) {%k1}
386 ; X64-NEXT: vzeroupper
389 %0 = bitcast <4 x i64> %__D to <16 x i16>
390 %1 = bitcast i8* %__P to i16*
391 %2 = bitcast i16 %__U to <16 x i1>
392 tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %0, i16* %1, <16 x i1> %2)
396 define void @test_mm256_mask_compressstoreu_epi8(i8* %__P, i32 %__U, <4 x i64> %__D) {
397 ; X86-LABEL: test_mm256_mask_compressstoreu_epi8:
398 ; X86: # %bb.0: # %entry
399 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
400 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
401 ; X86-NEXT: vpcompressb %ymm0, (%eax) {%k1}
402 ; X86-NEXT: vzeroupper
405 ; X64-LABEL: test_mm256_mask_compressstoreu_epi8:
406 ; X64: # %bb.0: # %entry
407 ; X64-NEXT: kmovd %esi, %k1
408 ; X64-NEXT: vpcompressb %ymm0, (%rdi) {%k1}
409 ; X64-NEXT: vzeroupper
412 %0 = bitcast <4 x i64> %__D to <32 x i8>
413 %1 = bitcast i32 %__U to <32 x i1>
414 tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %0, i8* %__P, <32 x i1> %1)
418 define <4 x i64> @test_mm256_mask_expand_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
419 ; X86-LABEL: test_mm256_mask_expand_epi16:
420 ; X86: # %bb.0: # %entry
421 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
422 ; X86-NEXT: vpexpandw %ymm1, %ymm0 {%k1}
425 ; X64-LABEL: test_mm256_mask_expand_epi16:
426 ; X64: # %bb.0: # %entry
427 ; X64-NEXT: kmovd %edi, %k1
428 ; X64-NEXT: vpexpandw %ymm1, %ymm0 {%k1}
431 %0 = bitcast <4 x i64> %__D to <16 x i16>
432 %1 = bitcast <4 x i64> %__S to <16 x i16>
433 %2 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %0, <16 x i16> %1, i16 %__U)
434 %3 = bitcast <16 x i16> %2 to <4 x i64>
438 define <4 x i64> @test_mm256_maskz_expand_epi16(i16 zeroext %__U, <4 x i64> %__D) {
439 ; X86-LABEL: test_mm256_maskz_expand_epi16:
440 ; X86: # %bb.0: # %entry
441 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
442 ; X86-NEXT: vpexpandw %ymm0, %ymm0 {%k1} {z}
445 ; X64-LABEL: test_mm256_maskz_expand_epi16:
446 ; X64: # %bb.0: # %entry
447 ; X64-NEXT: kmovd %edi, %k1
448 ; X64-NEXT: vpexpandw %ymm0, %ymm0 {%k1} {z}
451 %0 = bitcast <4 x i64> %__D to <16 x i16>
452 %1 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %0, <16 x i16> zeroinitializer, i16 %__U)
453 %2 = bitcast <16 x i16> %1 to <4 x i64>
457 define <4 x i64> @test_mm256_mask_expand_epi8(<4 x i64> %__S, i32 %__U, <4 x i64> %__D) {
458 ; X86-LABEL: test_mm256_mask_expand_epi8:
459 ; X86: # %bb.0: # %entry
460 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
461 ; X86-NEXT: vpexpandb %ymm1, %ymm0 {%k1}
464 ; X64-LABEL: test_mm256_mask_expand_epi8:
465 ; X64: # %bb.0: # %entry
466 ; X64-NEXT: kmovd %edi, %k1
467 ; X64-NEXT: vpexpandb %ymm1, %ymm0 {%k1}
470 %0 = bitcast <4 x i64> %__D to <32 x i8>
471 %1 = bitcast <4 x i64> %__S to <32 x i8>
472 %2 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %0, <32 x i8> %1, i32 %__U)
473 %3 = bitcast <32 x i8> %2 to <4 x i64>
477 define <4 x i64> @test_mm256_maskz_expand_epi8(i32 %__U, <4 x i64> %__D) {
478 ; X86-LABEL: test_mm256_maskz_expand_epi8:
479 ; X86: # %bb.0: # %entry
480 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
481 ; X86-NEXT: vpexpandb %ymm0, %ymm0 {%k1} {z}
484 ; X64-LABEL: test_mm256_maskz_expand_epi8:
485 ; X64: # %bb.0: # %entry
486 ; X64-NEXT: kmovd %edi, %k1
487 ; X64-NEXT: vpexpandb %ymm0, %ymm0 {%k1} {z}
490 %0 = bitcast <4 x i64> %__D to <32 x i8>
491 %1 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %0, <32 x i8> zeroinitializer, i32 %__U)
492 %2 = bitcast <32 x i8> %1 to <4 x i64>
496 define <4 x i64> @test_mm256_mask_expandloadu_epi16(<4 x i64> %__S, i16 zeroext %__U, i8* readonly %__P) {
497 ; X86-LABEL: test_mm256_mask_expandloadu_epi16:
498 ; X86: # %bb.0: # %entry
499 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
500 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
501 ; X86-NEXT: vpexpandw (%eax), %ymm0 {%k1}
504 ; X64-LABEL: test_mm256_mask_expandloadu_epi16:
505 ; X64: # %bb.0: # %entry
506 ; X64-NEXT: kmovd %edi, %k1
507 ; X64-NEXT: vpexpandw (%rsi), %ymm0 {%k1}
510 %0 = bitcast <4 x i64> %__S to <16 x i16>
511 %1 = bitcast i8* %__P to i16*
512 %2 = bitcast i16 %__U to <16 x i1>
513 %3 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %1, <16 x i1> %2, <16 x i16> %0)
514 %4 = bitcast <16 x i16> %3 to <4 x i64>
518 define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, i8* readonly %__P) {
519 ; X86-LABEL: test_mm256_maskz_expandloadu_epi16:
520 ; X86: # %bb.0: # %entry
521 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
522 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
523 ; X86-NEXT: vpexpandw (%eax), %ymm0 {%k1} {z}
526 ; X64-LABEL: test_mm256_maskz_expandloadu_epi16:
527 ; X64: # %bb.0: # %entry
528 ; X64-NEXT: kmovd %edi, %k1
529 ; X64-NEXT: vpexpandw (%rsi), %ymm0 {%k1} {z}
532 %0 = bitcast i8* %__P to i16*
533 %1 = bitcast i16 %__U to <16 x i1>
534 %2 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %0, <16 x i1> %1, <16 x i16> zeroinitializer)
535 %3 = bitcast <16 x i16> %2 to <4 x i64>
539 define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, i8* readonly %__P) {
540 ; X86-LABEL: test_mm256_mask_expandloadu_epi8:
541 ; X86: # %bb.0: # %entry
542 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
543 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
544 ; X86-NEXT: vpexpandb (%eax), %ymm0 {%k1}
547 ; X64-LABEL: test_mm256_mask_expandloadu_epi8:
548 ; X64: # %bb.0: # %entry
549 ; X64-NEXT: kmovd %edi, %k1
550 ; X64-NEXT: vpexpandb (%rsi), %ymm0 {%k1}
553 %0 = bitcast <4 x i64> %__S to <32 x i8>
554 %1 = bitcast i32 %__U to <32 x i1>
555 %2 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %1, <32 x i8> %0)
556 %3 = bitcast <32 x i8> %2 to <4 x i64>
560 define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, i8* readonly %__P) {
561 ; X86-LABEL: test_mm256_maskz_expandloadu_epi8:
562 ; X86: # %bb.0: # %entry
563 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
564 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
565 ; X86-NEXT: vpexpandb (%eax), %ymm0 {%k1} {z}
568 ; X64-LABEL: test_mm256_maskz_expandloadu_epi8:
569 ; X64: # %bb.0: # %entry
570 ; X64-NEXT: kmovd %edi, %k1
571 ; X64-NEXT: vpexpandb (%rsi), %ymm0 {%k1} {z}
574 %0 = bitcast i32 %__U to <32 x i1>
575 %1 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %0, <32 x i8> zeroinitializer)
576 %2 = bitcast <32 x i8> %1 to <4 x i64>
580 define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
581 ; X86-LABEL: test_mm256_mask_shldi_epi64:
582 ; X86: # %bb.0: # %entry
583 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
584 ; X86-NEXT: kmovd %eax, %k1
585 ; X86-NEXT: vpshldq $47, %ymm2, %ymm1, %ymm0 {%k1}
588 ; X64-LABEL: test_mm256_mask_shldi_epi64:
589 ; X64: # %bb.0: # %entry
590 ; X64-NEXT: kmovd %edi, %k1
591 ; X64-NEXT: vpshldq $47, %ymm2, %ymm1, %ymm0 {%k1}
594 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 47, i64 47, i64 47, i64 47>)
595 %1 = bitcast i8 %__U to <8 x i1>
596 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
597 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
601 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
603 define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
604 ; X86-LABEL: test_mm256_maskz_shldi_epi64:
605 ; X86: # %bb.0: # %entry
606 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
607 ; X86-NEXT: kmovd %eax, %k1
608 ; X86-NEXT: vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
611 ; X64-LABEL: test_mm256_maskz_shldi_epi64:
612 ; X64: # %bb.0: # %entry
613 ; X64-NEXT: kmovd %edi, %k1
614 ; X64-NEXT: vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
617 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
618 %1 = bitcast i8 %__U to <8 x i1>
619 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
620 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
624 define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
625 ; CHECK-LABEL: test_mm256_shldi_epi64:
626 ; CHECK: # %bb.0: # %entry
627 ; CHECK-NEXT: vpshldq $31, %ymm1, %ymm0, %ymm0
628 ; CHECK-NEXT: ret{{[l|q]}}
630 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
634 define <2 x i64> @test_mm_mask_shldi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
635 ; X86-LABEL: test_mm_mask_shldi_epi64:
636 ; X86: # %bb.0: # %entry
637 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
638 ; X86-NEXT: kmovd %eax, %k1
639 ; X86-NEXT: vpshldq $47, %xmm2, %xmm1, %xmm0 {%k1}
642 ; X64-LABEL: test_mm_mask_shldi_epi64:
643 ; X64: # %bb.0: # %entry
644 ; X64-NEXT: kmovd %edi, %k1
645 ; X64-NEXT: vpshldq $47, %xmm2, %xmm1, %xmm0 {%k1}
648 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 47, i64 47>)
649 %1 = bitcast i8 %__U to <8 x i1>
650 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
651 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
655 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
657 define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
658 ; X86-LABEL: test_mm_maskz_shldi_epi64:
659 ; X86: # %bb.0: # %entry
660 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
661 ; X86-NEXT: kmovd %eax, %k1
662 ; X86-NEXT: vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
665 ; X64-LABEL: test_mm_maskz_shldi_epi64:
666 ; X64: # %bb.0: # %entry
667 ; X64-NEXT: kmovd %edi, %k1
668 ; X64-NEXT: vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
671 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 63, i64 63>)
672 %1 = bitcast i8 %__U to <8 x i1>
673 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
674 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
678 define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
679 ; CHECK-LABEL: test_mm_shldi_epi64:
680 ; CHECK: # %bb.0: # %entry
681 ; CHECK-NEXT: vpshldq $31, %xmm1, %xmm0, %xmm0
682 ; CHECK-NEXT: ret{{[l|q]}}
684 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 31, i64 31>)
688 define <4 x i64> @test_mm256_mask_shldi_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
689 ; X86-LABEL: test_mm256_mask_shldi_epi32:
690 ; X86: # %bb.0: # %entry
691 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
692 ; X86-NEXT: kmovd %eax, %k1
693 ; X86-NEXT: vpshldd $7, %ymm2, %ymm1, %ymm0 {%k1}
696 ; X64-LABEL: test_mm256_mask_shldi_epi32:
697 ; X64: # %bb.0: # %entry
698 ; X64-NEXT: kmovd %edi, %k1
699 ; X64-NEXT: vpshldd $7, %ymm2, %ymm1, %ymm0 {%k1}
702 %0 = bitcast <4 x i64> %__A to <8 x i32>
703 %1 = bitcast <4 x i64> %__B to <8 x i32>
704 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>)
705 %3 = bitcast <4 x i64> %__S to <8 x i32>
706 %4 = bitcast i8 %__U to <8 x i1>
707 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
708 %6 = bitcast <8 x i32> %5 to <4 x i64>
712 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
714 define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
715 ; X86-LABEL: test_mm256_maskz_shldi_epi32:
716 ; X86: # %bb.0: # %entry
717 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
718 ; X86-NEXT: kmovd %eax, %k1
719 ; X86-NEXT: vpshldd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
722 ; X64-LABEL: test_mm256_maskz_shldi_epi32:
723 ; X64: # %bb.0: # %entry
724 ; X64-NEXT: kmovd %edi, %k1
725 ; X64-NEXT: vpshldd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
728 %0 = bitcast <4 x i64> %__A to <8 x i32>
729 %1 = bitcast <4 x i64> %__B to <8 x i32>
730 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>)
731 %3 = bitcast i8 %__U to <8 x i1>
732 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
733 %5 = bitcast <8 x i32> %4 to <4 x i64>
737 define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
738 ; CHECK-LABEL: test_mm256_shldi_epi32:
739 ; CHECK: # %bb.0: # %entry
740 ; CHECK-NEXT: vpshldd $31, %ymm1, %ymm0, %ymm0
741 ; CHECK-NEXT: ret{{[l|q]}}
743 %0 = bitcast <4 x i64> %__A to <8 x i32>
744 %1 = bitcast <4 x i64> %__B to <8 x i32>
745 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>)
746 %3 = bitcast <8 x i32> %2 to <4 x i64>
750 define <2 x i64> @test_mm_mask_shldi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
751 ; X86-LABEL: test_mm_mask_shldi_epi32:
752 ; X86: # %bb.0: # %entry
753 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
754 ; X86-NEXT: kmovd %eax, %k1
755 ; X86-NEXT: vpshldd $7, %xmm2, %xmm1, %xmm0 {%k1}
758 ; X64-LABEL: test_mm_mask_shldi_epi32:
759 ; X64: # %bb.0: # %entry
760 ; X64-NEXT: kmovd %edi, %k1
761 ; X64-NEXT: vpshldd $7, %xmm2, %xmm1, %xmm0 {%k1}
764 %0 = bitcast <2 x i64> %__A to <4 x i32>
765 %1 = bitcast <2 x i64> %__B to <4 x i32>
766 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
767 %3 = bitcast <2 x i64> %__S to <4 x i32>
768 %4 = bitcast i8 %__U to <8 x i1>
769 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
770 %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
771 %6 = bitcast <4 x i32> %5 to <2 x i64>
775 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
777 define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
778 ; X86-LABEL: test_mm_maskz_shldi_epi32:
779 ; X86: # %bb.0: # %entry
780 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
781 ; X86-NEXT: kmovd %eax, %k1
782 ; X86-NEXT: vpshldd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
785 ; X64-LABEL: test_mm_maskz_shldi_epi32:
786 ; X64: # %bb.0: # %entry
787 ; X64-NEXT: kmovd %edi, %k1
788 ; X64-NEXT: vpshldd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
791 %0 = bitcast <2 x i64> %__A to <4 x i32>
792 %1 = bitcast <2 x i64> %__B to <4 x i32>
793 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
794 %3 = bitcast i8 %__U to <8 x i1>
795 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
796 %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
797 %5 = bitcast <4 x i32> %4 to <2 x i64>
801 define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
802 ; CHECK-LABEL: test_mm_shldi_epi32:
803 ; CHECK: # %bb.0: # %entry
804 ; CHECK-NEXT: vpshldd $31, %xmm1, %xmm0, %xmm0
805 ; CHECK-NEXT: ret{{[l|q]}}
807 %0 = bitcast <2 x i64> %__A to <4 x i32>
808 %1 = bitcast <2 x i64> %__B to <4 x i32>
809 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
810 %3 = bitcast <4 x i32> %2 to <2 x i64>
814 define <4 x i64> @test_mm256_mask_shldi_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
815 ; X86-LABEL: test_mm256_mask_shldi_epi16:
816 ; X86: # %bb.0: # %entry
817 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
818 ; X86-NEXT: vpshldw $3, %ymm2, %ymm1, %ymm0 {%k1}
821 ; X64-LABEL: test_mm256_mask_shldi_epi16:
822 ; X64: # %bb.0: # %entry
823 ; X64-NEXT: kmovd %edi, %k1
824 ; X64-NEXT: vpshldw $3, %ymm2, %ymm1, %ymm0 {%k1}
827 %0 = bitcast <4 x i64> %__A to <16 x i16>
828 %1 = bitcast <4 x i64> %__B to <16 x i16>
829 %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
830 %3 = bitcast <4 x i64> %__S to <16 x i16>
831 %4 = bitcast i16 %__U to <16 x i1>
832 %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
833 %6 = bitcast <16 x i16> %5 to <4 x i64>
837 declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
839 define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
840 ; X86-LABEL: test_mm256_maskz_shldi_epi16:
841 ; X86: # %bb.0: # %entry
842 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
843 ; X86-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
846 ; X64-LABEL: test_mm256_maskz_shldi_epi16:
847 ; X64: # %bb.0: # %entry
848 ; X64-NEXT: kmovd %edi, %k1
849 ; X64-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
852 %0 = bitcast <4 x i64> %__A to <16 x i16>
853 %1 = bitcast <4 x i64> %__B to <16 x i16>
854 %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
855 %3 = bitcast i16 %__U to <16 x i1>
856 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
857 %5 = bitcast <16 x i16> %4 to <4 x i64>
861 define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
862 ; CHECK-LABEL: test_mm256_shldi_epi16:
863 ; CHECK: # %bb.0: # %entry
864 ; CHECK-NEXT: vpshldw $15, %ymm1, %ymm0, %ymm0
865 ; CHECK-NEXT: ret{{[l|q]}}
867 %0 = bitcast <4 x i64> %__A to <16 x i16>
868 %1 = bitcast <4 x i64> %__B to <16 x i16>
869 %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
870 %3 = bitcast <16 x i16> %2 to <4 x i64>
874 define <2 x i64> @test_mm_mask_shldi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
875 ; X86-LABEL: test_mm_mask_shldi_epi16:
876 ; X86: # %bb.0: # %entry
877 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
878 ; X86-NEXT: kmovd %eax, %k1
879 ; X86-NEXT: vpshldw $3, %xmm2, %xmm1, %xmm0 {%k1}
882 ; X64-LABEL: test_mm_mask_shldi_epi16:
883 ; X64: # %bb.0: # %entry
884 ; X64-NEXT: kmovd %edi, %k1
885 ; X64-NEXT: vpshldw $3, %xmm2, %xmm1, %xmm0 {%k1}
888 %0 = bitcast <2 x i64> %__A to <8 x i16>
889 %1 = bitcast <2 x i64> %__B to <8 x i16>
890 %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
891 %3 = bitcast <2 x i64> %__S to <8 x i16>
892 %4 = bitcast i8 %__U to <8 x i1>
893 %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
894 %6 = bitcast <8 x i16> %5 to <2 x i64>
898 declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
900 define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
901 ; X86-LABEL: test_mm_maskz_shldi_epi16:
902 ; X86: # %bb.0: # %entry
903 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
904 ; X86-NEXT: kmovd %eax, %k1
905 ; X86-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
908 ; X64-LABEL: test_mm_maskz_shldi_epi16:
909 ; X64: # %bb.0: # %entry
910 ; X64-NEXT: kmovd %edi, %k1
911 ; X64-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
914 %0 = bitcast <2 x i64> %__A to <8 x i16>
915 %1 = bitcast <2 x i64> %__B to <8 x i16>
916 %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
917 %3 = bitcast i8 %__U to <8 x i1>
918 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
919 %5 = bitcast <8 x i16> %4 to <2 x i64>
923 define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
924 ; CHECK-LABEL: test_mm_shldi_epi16:
925 ; CHECK: # %bb.0: # %entry
926 ; CHECK-NEXT: vpshldw $15, %xmm1, %xmm0, %xmm0
927 ; CHECK-NEXT: ret{{[l|q]}}
929 %0 = bitcast <2 x i64> %__A to <8 x i16>
930 %1 = bitcast <2 x i64> %__B to <8 x i16>
931 %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
932 %3 = bitcast <8 x i16> %2 to <2 x i64>
936 define <4 x i64> @test_mm256_mask_shrdi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
937 ; X86-LABEL: test_mm256_mask_shrdi_epi64:
938 ; X86: # %bb.0: # %entry
939 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
940 ; X86-NEXT: kmovd %eax, %k1
941 ; X86-NEXT: vpshrdq $47, %ymm2, %ymm1, %ymm0 {%k1}
944 ; X64-LABEL: test_mm256_mask_shrdi_epi64:
945 ; X64: # %bb.0: # %entry
946 ; X64-NEXT: kmovd %edi, %k1
947 ; X64-NEXT: vpshrdq $47, %ymm2, %ymm1, %ymm0 {%k1}
950 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 47, i64 47, i64 47, i64 47>)
951 %1 = bitcast i8 %__U to <8 x i1>
952 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
953 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
957 declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
959 define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
960 ; X86-LABEL: test_mm256_maskz_shrdi_epi64:
961 ; X86: # %bb.0: # %entry
962 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
963 ; X86-NEXT: kmovd %eax, %k1
964 ; X86-NEXT: vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
967 ; X64-LABEL: test_mm256_maskz_shrdi_epi64:
968 ; X64: # %bb.0: # %entry
969 ; X64-NEXT: kmovd %edi, %k1
970 ; X64-NEXT: vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
973 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
974 %1 = bitcast i8 %__U to <8 x i1>
975 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
976 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
980 define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
981 ; CHECK-LABEL: test_mm256_shrdi_epi64:
982 ; CHECK: # %bb.0: # %entry
983 ; CHECK-NEXT: vpshrdq $31, %ymm1, %ymm0, %ymm0
984 ; CHECK-NEXT: ret{{[l|q]}}
986 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
990 define <2 x i64> @test_mm_mask_shrdi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
991 ; X86-LABEL: test_mm_mask_shrdi_epi64:
992 ; X86: # %bb.0: # %entry
993 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
994 ; X86-NEXT: kmovd %eax, %k1
995 ; X86-NEXT: vpshrdq $47, %xmm2, %xmm1, %xmm0 {%k1}
998 ; X64-LABEL: test_mm_mask_shrdi_epi64:
999 ; X64: # %bb.0: # %entry
1000 ; X64-NEXT: kmovd %edi, %k1
1001 ; X64-NEXT: vpshrdq $47, %xmm2, %xmm1, %xmm0 {%k1}
1004 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 47, i64 47>)
1005 %1 = bitcast i8 %__U to <8 x i1>
1006 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1007 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
1011 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
1013 define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1014 ; X86-LABEL: test_mm_maskz_shrdi_epi64:
1015 ; X86: # %bb.0: # %entry
1016 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1017 ; X86-NEXT: kmovd %eax, %k1
1018 ; X86-NEXT: vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
1021 ; X64-LABEL: test_mm_maskz_shrdi_epi64:
1022 ; X64: # %bb.0: # %entry
1023 ; X64-NEXT: kmovd %edi, %k1
1024 ; X64-NEXT: vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
1027 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 63, i64 63>)
1028 %1 = bitcast i8 %__U to <8 x i1>
1029 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1030 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
1034 define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
1035 ; CHECK-LABEL: test_mm_shrdi_epi64:
1036 ; CHECK: # %bb.0: # %entry
1037 ; CHECK-NEXT: vpshrdq $31, %xmm1, %xmm0, %xmm0
1038 ; CHECK-NEXT: ret{{[l|q]}}
1040 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 31, i64 31>)
1044 define <4 x i64> @test_mm256_mask_shrdi_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1045 ; X86-LABEL: test_mm256_mask_shrdi_epi32:
1046 ; X86: # %bb.0: # %entry
1047 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1048 ; X86-NEXT: kmovd %eax, %k1
1049 ; X86-NEXT: vpshrdd $7, %ymm2, %ymm1, %ymm0 {%k1}
1052 ; X64-LABEL: test_mm256_mask_shrdi_epi32:
1053 ; X64: # %bb.0: # %entry
1054 ; X64-NEXT: kmovd %edi, %k1
1055 ; X64-NEXT: vpshrdd $7, %ymm2, %ymm1, %ymm0 {%k1}
1058 %0 = bitcast <4 x i64> %__A to <8 x i32>
1059 %1 = bitcast <4 x i64> %__B to <8 x i32>
1060 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>)
1061 %3 = bitcast <4 x i64> %__S to <8 x i32>
1062 %4 = bitcast i8 %__U to <8 x i1>
1063 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
1064 %6 = bitcast <8 x i32> %5 to <4 x i64>
1068 declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
1070 define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1071 ; X86-LABEL: test_mm256_maskz_shrdi_epi32:
1072 ; X86: # %bb.0: # %entry
1073 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1074 ; X86-NEXT: kmovd %eax, %k1
1075 ; X86-NEXT: vpshrdd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
1078 ; X64-LABEL: test_mm256_maskz_shrdi_epi32:
1079 ; X64: # %bb.0: # %entry
1080 ; X64-NEXT: kmovd %edi, %k1
1081 ; X64-NEXT: vpshrdd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
1084 %0 = bitcast <4 x i64> %__A to <8 x i32>
1085 %1 = bitcast <4 x i64> %__B to <8 x i32>
1086 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>)
1087 %3 = bitcast i8 %__U to <8 x i1>
1088 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
1089 %5 = bitcast <8 x i32> %4 to <4 x i64>
1093 define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
1094 ; CHECK-LABEL: test_mm256_shrdi_epi32:
1095 ; CHECK: # %bb.0: # %entry
1096 ; CHECK-NEXT: vpshrdd $31, %ymm1, %ymm0, %ymm0
1097 ; CHECK-NEXT: ret{{[l|q]}}
1099 %0 = bitcast <4 x i64> %__A to <8 x i32>
1100 %1 = bitcast <4 x i64> %__B to <8 x i32>
1101 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>)
1102 %3 = bitcast <8 x i32> %2 to <4 x i64>
1106 define <2 x i64> @test_mm_mask_shrdi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1107 ; X86-LABEL: test_mm_mask_shrdi_epi32:
1108 ; X86: # %bb.0: # %entry
1109 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1110 ; X86-NEXT: kmovd %eax, %k1
1111 ; X86-NEXT: vpshrdd $7, %xmm2, %xmm1, %xmm0 {%k1}
1114 ; X64-LABEL: test_mm_mask_shrdi_epi32:
1115 ; X64: # %bb.0: # %entry
1116 ; X64-NEXT: kmovd %edi, %k1
1117 ; X64-NEXT: vpshrdd $7, %xmm2, %xmm1, %xmm0 {%k1}
1120 %0 = bitcast <2 x i64> %__A to <4 x i32>
1121 %1 = bitcast <2 x i64> %__B to <4 x i32>
1122 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
1123 %3 = bitcast <2 x i64> %__S to <4 x i32>
1124 %4 = bitcast i8 %__U to <8 x i1>
1125 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1126 %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
1127 %6 = bitcast <4 x i32> %5 to <2 x i64>
1131 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
1133 define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1134 ; X86-LABEL: test_mm_maskz_shrdi_epi32:
1135 ; X86: # %bb.0: # %entry
1136 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1137 ; X86-NEXT: kmovd %eax, %k1
1138 ; X86-NEXT: vpshrdd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
1141 ; X64-LABEL: test_mm_maskz_shrdi_epi32:
1142 ; X64: # %bb.0: # %entry
1143 ; X64-NEXT: kmovd %edi, %k1
1144 ; X64-NEXT: vpshrdd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
1147 %0 = bitcast <2 x i64> %__A to <4 x i32>
1148 %1 = bitcast <2 x i64> %__B to <4 x i32>
1149 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
1150 %3 = bitcast i8 %__U to <8 x i1>
1151 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1152 %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
1153 %5 = bitcast <4 x i32> %4 to <2 x i64>
1157 define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
1158 ; CHECK-LABEL: test_mm_shrdi_epi32:
1159 ; CHECK: # %bb.0: # %entry
1160 ; CHECK-NEXT: vpshrdd $31, %xmm1, %xmm0, %xmm0
1161 ; CHECK-NEXT: ret{{[l|q]}}
1163 %0 = bitcast <2 x i64> %__A to <4 x i32>
1164 %1 = bitcast <2 x i64> %__B to <4 x i32>
1165 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
1166 %3 = bitcast <4 x i32> %2 to <2 x i64>
1170 define <4 x i64> @test_mm256_mask_shrdi_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1171 ; X86-LABEL: test_mm256_mask_shrdi_epi16:
1172 ; X86: # %bb.0: # %entry
1173 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1174 ; X86-NEXT: vpshrdw $3, %ymm2, %ymm1, %ymm0 {%k1}
1177 ; X64-LABEL: test_mm256_mask_shrdi_epi16:
1178 ; X64: # %bb.0: # %entry
1179 ; X64-NEXT: kmovd %edi, %k1
1180 ; X64-NEXT: vpshrdw $3, %ymm2, %ymm1, %ymm0 {%k1}
1183 %0 = bitcast <4 x i64> %__A to <16 x i16>
1184 %1 = bitcast <4 x i64> %__B to <16 x i16>
1185 %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
1186 %3 = bitcast <4 x i64> %__S to <16 x i16>
1187 %4 = bitcast i16 %__U to <16 x i1>
1188 %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
1189 %6 = bitcast <16 x i16> %5 to <4 x i64>
1193 declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
1195 define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1196 ; X86-LABEL: test_mm256_maskz_shrdi_epi16:
1197 ; X86: # %bb.0: # %entry
1198 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1199 ; X86-NEXT: vpshrdw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
1202 ; X64-LABEL: test_mm256_maskz_shrdi_epi16:
1203 ; X64: # %bb.0: # %entry
1204 ; X64-NEXT: kmovd %edi, %k1
1205 ; X64-NEXT: vpshrdw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
1208 %0 = bitcast <4 x i64> %__A to <16 x i16>
1209 %1 = bitcast <4 x i64> %__B to <16 x i16>
1210 %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1211 %3 = bitcast i16 %__U to <16 x i1>
1212 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
1213 %5 = bitcast <16 x i16> %4 to <4 x i64>
1217 define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
1218 ; CHECK-LABEL: test_mm256_shrdi_epi16:
1219 ; CHECK: # %bb.0: # %entry
1220 ; CHECK-NEXT: vpshrdw $15, %ymm1, %ymm0, %ymm0
1221 ; CHECK-NEXT: ret{{[l|q]}}
1223 %0 = bitcast <4 x i64> %__A to <16 x i16>
1224 %1 = bitcast <4 x i64> %__B to <16 x i16>
1225 %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
1226 %3 = bitcast <16 x i16> %2 to <4 x i64>
1230 define <2 x i64> @test_mm_mask_shrdi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1231 ; X86-LABEL: test_mm_mask_shrdi_epi16:
1232 ; X86: # %bb.0: # %entry
1233 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1234 ; X86-NEXT: kmovd %eax, %k1
1235 ; X86-NEXT: vpshrdw $3, %xmm2, %xmm1, %xmm0 {%k1}
1238 ; X64-LABEL: test_mm_mask_shrdi_epi16:
1239 ; X64: # %bb.0: # %entry
1240 ; X64-NEXT: kmovd %edi, %k1
1241 ; X64-NEXT: vpshrdw $3, %xmm2, %xmm1, %xmm0 {%k1}
1244 %0 = bitcast <2 x i64> %__A to <8 x i16>
1245 %1 = bitcast <2 x i64> %__B to <8 x i16>
1246 %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
1247 %3 = bitcast <2 x i64> %__S to <8 x i16>
1248 %4 = bitcast i8 %__U to <8 x i1>
1249 %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
1250 %6 = bitcast <8 x i16> %5 to <2 x i64>
1254 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
1256 define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1257 ; X86-LABEL: test_mm_maskz_shrdi_epi16:
1258 ; X86: # %bb.0: # %entry
1259 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1260 ; X86-NEXT: kmovd %eax, %k1
1261 ; X86-NEXT: vpshrdw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
1264 ; X64-LABEL: test_mm_maskz_shrdi_epi16:
1265 ; X64: # %bb.0: # %entry
1266 ; X64-NEXT: kmovd %edi, %k1
1267 ; X64-NEXT: vpshrdw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
1270 %0 = bitcast <2 x i64> %__A to <8 x i16>
1271 %1 = bitcast <2 x i64> %__B to <8 x i16>
1272 %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1273 %3 = bitcast i8 %__U to <8 x i1>
1274 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
1275 %5 = bitcast <8 x i16> %4 to <2 x i64>
1279 define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
1280 ; CHECK-LABEL: test_mm_shrdi_epi16:
1281 ; CHECK: # %bb.0: # %entry
1282 ; CHECK-NEXT: vpshrdw $15, %xmm1, %xmm0, %xmm0
1283 ; CHECK-NEXT: ret{{[l|q]}}
1285 %0 = bitcast <2 x i64> %__A to <8 x i16>
1286 %1 = bitcast <2 x i64> %__B to <8 x i16>
1287 %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
1288 %3 = bitcast <8 x i16> %2 to <2 x i64>
1292 define <4 x i64> @test_mm256_mask_shldv_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1293 ; X86-LABEL: test_mm256_mask_shldv_epi64:
1294 ; X86: # %bb.0: # %entry
1295 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1296 ; X86-NEXT: kmovd %eax, %k1
1297 ; X86-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 {%k1}
1300 ; X64-LABEL: test_mm256_mask_shldv_epi64:
1301 ; X64: # %bb.0: # %entry
1302 ; X64-NEXT: kmovd %edi, %k1
1303 ; X64-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 {%k1}
1306 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
1307 %1 = bitcast i8 %__U to <8 x i1>
1308 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1309 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__S
1313 define <4 x i64> @test_mm256_maskz_shldv_epi64(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1314 ; X86-LABEL: test_mm256_maskz_shldv_epi64:
1315 ; X86: # %bb.0: # %entry
1316 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1317 ; X86-NEXT: kmovd %eax, %k1
1318 ; X86-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
1321 ; X64-LABEL: test_mm256_maskz_shldv_epi64:
1322 ; X64: # %bb.0: # %entry
1323 ; X64-NEXT: kmovd %edi, %k1
1324 ; X64-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
1327 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
1328 %1 = bitcast i8 %__U to <8 x i1>
1329 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1330 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
1334 define <4 x i64> @test_mm256_shldv_epi64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1335 ; CHECK-LABEL: test_mm256_shldv_epi64:
1336 ; CHECK: # %bb.0: # %entry
1337 ; CHECK-NEXT: vpshldvq %ymm2, %ymm1, %ymm0
1338 ; CHECK-NEXT: ret{{[l|q]}}
1340 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
1344 define <2 x i64> @test_mm_mask_shldv_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1345 ; X86-LABEL: test_mm_mask_shldv_epi64:
1346 ; X86: # %bb.0: # %entry
1347 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1348 ; X86-NEXT: kmovd %eax, %k1
1349 ; X86-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 {%k1}
1352 ; X64-LABEL: test_mm_mask_shldv_epi64:
1353 ; X64: # %bb.0: # %entry
1354 ; X64-NEXT: kmovd %edi, %k1
1355 ; X64-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 {%k1}
1358 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
1359 %1 = bitcast i8 %__U to <8 x i1>
1360 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1361 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__S
1365 define <2 x i64> @test_mm_maskz_shldv_epi64(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1366 ; X86-LABEL: test_mm_maskz_shldv_epi64:
1367 ; X86: # %bb.0: # %entry
1368 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1369 ; X86-NEXT: kmovd %eax, %k1
1370 ; X86-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
1373 ; X64-LABEL: test_mm_maskz_shldv_epi64:
1374 ; X64: # %bb.0: # %entry
1375 ; X64-NEXT: kmovd %edi, %k1
1376 ; X64-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
1379 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
1380 %1 = bitcast i8 %__U to <8 x i1>
1381 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1382 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
1386 define <2 x i64> @test_mm_shldv_epi64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1387 ; CHECK-LABEL: test_mm_shldv_epi64:
1388 ; CHECK: # %bb.0: # %entry
1389 ; CHECK-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
1390 ; CHECK-NEXT: ret{{[l|q]}}
1392 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
1396 define <4 x i64> @test_mm256_mask_shldv_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1397 ; X86-LABEL: test_mm256_mask_shldv_epi32:
1398 ; X86: # %bb.0: # %entry
1399 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1400 ; X86-NEXT: kmovd %eax, %k1
1401 ; X86-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 {%k1}
1404 ; X64-LABEL: test_mm256_mask_shldv_epi32:
1405 ; X64: # %bb.0: # %entry
1406 ; X64-NEXT: kmovd %edi, %k1
1407 ; X64-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 {%k1}
1410 %0 = bitcast <4 x i64> %__S to <8 x i32>
1411 %1 = bitcast <4 x i64> %__A to <8 x i32>
1412 %2 = bitcast <4 x i64> %__B to <8 x i32>
1413 %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
1414 %4 = bitcast i8 %__U to <8 x i1>
1415 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
1416 %6 = bitcast <8 x i32> %5 to <4 x i64>
1420 define <4 x i64> @test_mm256_maskz_shldv_epi32(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1421 ; X86-LABEL: test_mm256_maskz_shldv_epi32:
1422 ; X86: # %bb.0: # %entry
1423 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1424 ; X86-NEXT: kmovd %eax, %k1
1425 ; X86-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
1428 ; X64-LABEL: test_mm256_maskz_shldv_epi32:
1429 ; X64: # %bb.0: # %entry
1430 ; X64-NEXT: kmovd %edi, %k1
1431 ; X64-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
1434 %0 = bitcast <4 x i64> %__S to <8 x i32>
1435 %1 = bitcast <4 x i64> %__A to <8 x i32>
1436 %2 = bitcast <4 x i64> %__B to <8 x i32>
1437 %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
1438 %4 = bitcast i8 %__U to <8 x i1>
1439 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
1440 %6 = bitcast <8 x i32> %5 to <4 x i64>
1444 define <4 x i64> @test_mm256_shldv_epi32(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1445 ; CHECK-LABEL: test_mm256_shldv_epi32:
1446 ; CHECK: # %bb.0: # %entry
1447 ; CHECK-NEXT: vpshldvd %ymm2, %ymm1, %ymm0
1448 ; CHECK-NEXT: ret{{[l|q]}}
1450 %0 = bitcast <4 x i64> %__S to <8 x i32>
1451 %1 = bitcast <4 x i64> %__A to <8 x i32>
1452 %2 = bitcast <4 x i64> %__B to <8 x i32>
1453 %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
1454 %4 = bitcast <8 x i32> %3 to <4 x i64>
1458 define <2 x i64> @test_mm_mask_shldv_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1459 ; X86-LABEL: test_mm_mask_shldv_epi32:
1460 ; X86: # %bb.0: # %entry
1461 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1462 ; X86-NEXT: kmovd %eax, %k1
1463 ; X86-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 {%k1}
1466 ; X64-LABEL: test_mm_mask_shldv_epi32:
1467 ; X64: # %bb.0: # %entry
1468 ; X64-NEXT: kmovd %edi, %k1
1469 ; X64-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 {%k1}
1472 %0 = bitcast <2 x i64> %__S to <4 x i32>
1473 %1 = bitcast <2 x i64> %__A to <4 x i32>
1474 %2 = bitcast <2 x i64> %__B to <4 x i32>
1475 %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
1476 %4 = bitcast i8 %__U to <8 x i1>
1477 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1478 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
1479 %6 = bitcast <4 x i32> %5 to <2 x i64>
1483 define <2 x i64> @test_mm_maskz_shldv_epi32(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1484 ; X86-LABEL: test_mm_maskz_shldv_epi32:
1485 ; X86: # %bb.0: # %entry
1486 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1487 ; X86-NEXT: kmovd %eax, %k1
1488 ; X86-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
1491 ; X64-LABEL: test_mm_maskz_shldv_epi32:
1492 ; X64: # %bb.0: # %entry
1493 ; X64-NEXT: kmovd %edi, %k1
1494 ; X64-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
1497 %0 = bitcast <2 x i64> %__S to <4 x i32>
1498 %1 = bitcast <2 x i64> %__A to <4 x i32>
1499 %2 = bitcast <2 x i64> %__B to <4 x i32>
1500 %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
1501 %4 = bitcast i8 %__U to <8 x i1>
1502 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1503 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
1504 %6 = bitcast <4 x i32> %5 to <2 x i64>
1508 define <2 x i64> @test_mm_shldv_epi32(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1509 ; CHECK-LABEL: test_mm_shldv_epi32:
1510 ; CHECK: # %bb.0: # %entry
1511 ; CHECK-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
1512 ; CHECK-NEXT: ret{{[l|q]}}
1514 %0 = bitcast <2 x i64> %__S to <4 x i32>
1515 %1 = bitcast <2 x i64> %__A to <4 x i32>
1516 %2 = bitcast <2 x i64> %__B to <4 x i32>
1517 %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
1518 %4 = bitcast <4 x i32> %3 to <2 x i64>
1522 define <4 x i64> @test_mm256_mask_shldv_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1523 ; X86-LABEL: test_mm256_mask_shldv_epi16:
1524 ; X86: # %bb.0: # %entry
1525 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1526 ; X86-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 {%k1}
1529 ; X64-LABEL: test_mm256_mask_shldv_epi16:
1530 ; X64: # %bb.0: # %entry
1531 ; X64-NEXT: kmovd %edi, %k1
1532 ; X64-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 {%k1}
1535 %0 = bitcast <4 x i64> %__S to <16 x i16>
1536 %1 = bitcast <4 x i64> %__A to <16 x i16>
1537 %2 = bitcast <4 x i64> %__B to <16 x i16>
1538 %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
1539 %4 = bitcast i16 %__U to <16 x i1>
1540 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %0
1541 %6 = bitcast <16 x i16> %5 to <4 x i64>
1545 define <4 x i64> @test_mm256_maskz_shldv_epi16(i16 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1546 ; X86-LABEL: test_mm256_maskz_shldv_epi16:
1547 ; X86: # %bb.0: # %entry
1548 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1549 ; X86-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
1552 ; X64-LABEL: test_mm256_maskz_shldv_epi16:
1553 ; X64: # %bb.0: # %entry
1554 ; X64-NEXT: kmovd %edi, %k1
1555 ; X64-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
1558 %0 = bitcast <4 x i64> %__S to <16 x i16>
1559 %1 = bitcast <4 x i64> %__A to <16 x i16>
1560 %2 = bitcast <4 x i64> %__B to <16 x i16>
1561 %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
1562 %4 = bitcast i16 %__U to <16 x i1>
1563 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
1564 %6 = bitcast <16 x i16> %5 to <4 x i64>
1568 define <4 x i64> @test_mm256_shldv_epi16(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1569 ; CHECK-LABEL: test_mm256_shldv_epi16:
1570 ; CHECK: # %bb.0: # %entry
1571 ; CHECK-NEXT: vpshldvw %ymm2, %ymm1, %ymm0
1572 ; CHECK-NEXT: ret{{[l|q]}}
1574 %0 = bitcast <4 x i64> %__S to <16 x i16>
1575 %1 = bitcast <4 x i64> %__A to <16 x i16>
1576 %2 = bitcast <4 x i64> %__B to <16 x i16>
1577 %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
1578 %4 = bitcast <16 x i16> %3 to <4 x i64>
1582 define <2 x i64> @test_mm_mask_shldv_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1583 ; X86-LABEL: test_mm_mask_shldv_epi16:
1584 ; X86: # %bb.0: # %entry
1585 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1586 ; X86-NEXT: kmovd %eax, %k1
1587 ; X86-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 {%k1}
1590 ; X64-LABEL: test_mm_mask_shldv_epi16:
1591 ; X64: # %bb.0: # %entry
1592 ; X64-NEXT: kmovd %edi, %k1
1593 ; X64-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 {%k1}
1596 %0 = bitcast <2 x i64> %__S to <8 x i16>
1597 %1 = bitcast <2 x i64> %__A to <8 x i16>
1598 %2 = bitcast <2 x i64> %__B to <8 x i16>
1599 %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
1600 %4 = bitcast i8 %__U to <8 x i1>
1601 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %0
1602 %6 = bitcast <8 x i16> %5 to <2 x i64>
1606 define <2 x i64> @test_mm_maskz_shldv_epi16(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1607 ; X86-LABEL: test_mm_maskz_shldv_epi16:
1608 ; X86: # %bb.0: # %entry
1609 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1610 ; X86-NEXT: kmovd %eax, %k1
1611 ; X86-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
1614 ; X64-LABEL: test_mm_maskz_shldv_epi16:
1615 ; X64: # %bb.0: # %entry
1616 ; X64-NEXT: kmovd %edi, %k1
1617 ; X64-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
1620 %0 = bitcast <2 x i64> %__S to <8 x i16>
1621 %1 = bitcast <2 x i64> %__A to <8 x i16>
1622 %2 = bitcast <2 x i64> %__B to <8 x i16>
1623 %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
1624 %4 = bitcast i8 %__U to <8 x i1>
1625 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
1626 %6 = bitcast <8 x i16> %5 to <2 x i64>
1630 define <2 x i64> @test_mm_shldv_epi16(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1631 ; CHECK-LABEL: test_mm_shldv_epi16:
1632 ; CHECK: # %bb.0: # %entry
1633 ; CHECK-NEXT: vpshldvw %xmm2, %xmm1, %xmm0
1634 ; CHECK-NEXT: ret{{[l|q]}}
1636 %0 = bitcast <2 x i64> %__S to <8 x i16>
1637 %1 = bitcast <2 x i64> %__A to <8 x i16>
1638 %2 = bitcast <2 x i64> %__B to <8 x i16>
1639 %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
1640 %4 = bitcast <8 x i16> %3 to <2 x i64>
1644 define <4 x i64> @test_mm256_mask_shrdv_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1645 ; X86-LABEL: test_mm256_mask_shrdv_epi64:
1646 ; X86: # %bb.0: # %entry
1647 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1648 ; X86-NEXT: kmovd %eax, %k1
1649 ; X86-NEXT: vpshrdvq %ymm2, %ymm1, %ymm0 {%k1}
1652 ; X64-LABEL: test_mm256_mask_shrdv_epi64:
1653 ; X64: # %bb.0: # %entry
1654 ; X64-NEXT: kmovd %edi, %k1
1655 ; X64-NEXT: vpshrdvq %ymm2, %ymm1, %ymm0 {%k1}
1658 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
1659 %1 = bitcast i8 %__U to <8 x i1>
1660 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1661 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__S
1665 define <4 x i64> @test_mm256_maskz_shrdv_epi64(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1666 ; X86-LABEL: test_mm256_maskz_shrdv_epi64:
1667 ; X86: # %bb.0: # %entry
1668 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1669 ; X86-NEXT: kmovd %eax, %k1
1670 ; X86-NEXT: vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
1673 ; X64-LABEL: test_mm256_maskz_shrdv_epi64:
1674 ; X64: # %bb.0: # %entry
1675 ; X64-NEXT: kmovd %edi, %k1
1676 ; X64-NEXT: vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
1679 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
1680 %1 = bitcast i8 %__U to <8 x i1>
1681 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1682 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
1686 define <4 x i64> @test_mm256_shrdv_epi64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1687 ; CHECK-LABEL: test_mm256_shrdv_epi64:
1688 ; CHECK: # %bb.0: # %entry
1689 ; CHECK-NEXT: vpshrdvq %ymm2, %ymm1, %ymm0
1690 ; CHECK-NEXT: ret{{[l|q]}}
1692 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
1696 define <2 x i64> @test_mm_mask_shrdv_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1697 ; X86-LABEL: test_mm_mask_shrdv_epi64:
1698 ; X86: # %bb.0: # %entry
1699 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1700 ; X86-NEXT: kmovd %eax, %k1
1701 ; X86-NEXT: vpshrdvq %xmm2, %xmm1, %xmm0 {%k1}
1704 ; X64-LABEL: test_mm_mask_shrdv_epi64:
1705 ; X64: # %bb.0: # %entry
1706 ; X64-NEXT: kmovd %edi, %k1
1707 ; X64-NEXT: vpshrdvq %xmm2, %xmm1, %xmm0 {%k1}
1710 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
1711 %1 = bitcast i8 %__U to <8 x i1>
1712 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1713 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__S
1717 define <2 x i64> @test_mm_maskz_shrdv_epi64(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1718 ; X86-LABEL: test_mm_maskz_shrdv_epi64:
1719 ; X86: # %bb.0: # %entry
1720 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1721 ; X86-NEXT: kmovd %eax, %k1
1722 ; X86-NEXT: vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
1725 ; X64-LABEL: test_mm_maskz_shrdv_epi64:
1726 ; X64: # %bb.0: # %entry
1727 ; X64-NEXT: kmovd %edi, %k1
1728 ; X64-NEXT: vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
1731 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
1732 %1 = bitcast i8 %__U to <8 x i1>
1733 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1734 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
1738 define <2 x i64> @test_mm_shrdv_epi64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1739 ; CHECK-LABEL: test_mm_shrdv_epi64:
1740 ; CHECK: # %bb.0: # %entry
1741 ; CHECK-NEXT: vpshrdvq %xmm2, %xmm1, %xmm0
1742 ; CHECK-NEXT: ret{{[l|q]}}
1744 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
1748 define <4 x i64> @test_mm256_mask_shrdv_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1749 ; X86-LABEL: test_mm256_mask_shrdv_epi32:
1750 ; X86: # %bb.0: # %entry
1751 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1752 ; X86-NEXT: kmovd %eax, %k1
1753 ; X86-NEXT: vpshrdvd %ymm2, %ymm1, %ymm0 {%k1}
1756 ; X64-LABEL: test_mm256_mask_shrdv_epi32:
1757 ; X64: # %bb.0: # %entry
1758 ; X64-NEXT: kmovd %edi, %k1
1759 ; X64-NEXT: vpshrdvd %ymm2, %ymm1, %ymm0 {%k1}
1762 %0 = bitcast <4 x i64> %__S to <8 x i32>
1763 %1 = bitcast <4 x i64> %__A to <8 x i32>
1764 %2 = bitcast <4 x i64> %__B to <8 x i32>
1765 %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
1766 %4 = bitcast i8 %__U to <8 x i1>
1767 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
1768 %6 = bitcast <8 x i32> %5 to <4 x i64>
1772 define <4 x i64> @test_mm256_maskz_shrdv_epi32(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1773 ; X86-LABEL: test_mm256_maskz_shrdv_epi32:
1774 ; X86: # %bb.0: # %entry
1775 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1776 ; X86-NEXT: kmovd %eax, %k1
1777 ; X86-NEXT: vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
1780 ; X64-LABEL: test_mm256_maskz_shrdv_epi32:
1781 ; X64: # %bb.0: # %entry
1782 ; X64-NEXT: kmovd %edi, %k1
1783 ; X64-NEXT: vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
1786 %0 = bitcast <4 x i64> %__S to <8 x i32>
1787 %1 = bitcast <4 x i64> %__A to <8 x i32>
1788 %2 = bitcast <4 x i64> %__B to <8 x i32>
1789 %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
1790 %4 = bitcast i8 %__U to <8 x i1>
1791 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
1792 %6 = bitcast <8 x i32> %5 to <4 x i64>
1796 define <4 x i64> @test_mm256_shrdv_epi32(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1797 ; CHECK-LABEL: test_mm256_shrdv_epi32:
1798 ; CHECK: # %bb.0: # %entry
1799 ; CHECK-NEXT: vpshrdvd %ymm2, %ymm1, %ymm0
1800 ; CHECK-NEXT: ret{{[l|q]}}
1802 %0 = bitcast <4 x i64> %__S to <8 x i32>
1803 %1 = bitcast <4 x i64> %__A to <8 x i32>
1804 %2 = bitcast <4 x i64> %__B to <8 x i32>
1805 %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
1806 %4 = bitcast <8 x i32> %3 to <4 x i64>
1810 define <2 x i64> @test_mm_mask_shrdv_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1811 ; X86-LABEL: test_mm_mask_shrdv_epi32:
1812 ; X86: # %bb.0: # %entry
1813 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1814 ; X86-NEXT: kmovd %eax, %k1
1815 ; X86-NEXT: vpshrdvd %xmm2, %xmm1, %xmm0 {%k1}
1818 ; X64-LABEL: test_mm_mask_shrdv_epi32:
1819 ; X64: # %bb.0: # %entry
1820 ; X64-NEXT: kmovd %edi, %k1
1821 ; X64-NEXT: vpshrdvd %xmm2, %xmm1, %xmm0 {%k1}
1824 %0 = bitcast <2 x i64> %__S to <4 x i32>
1825 %1 = bitcast <2 x i64> %__A to <4 x i32>
1826 %2 = bitcast <2 x i64> %__B to <4 x i32>
1827 %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
1828 %4 = bitcast i8 %__U to <8 x i1>
1829 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1830 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
1831 %6 = bitcast <4 x i32> %5 to <2 x i64>
1835 define <2 x i64> @test_mm_maskz_shrdv_epi32(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1836 ; X86-LABEL: test_mm_maskz_shrdv_epi32:
1837 ; X86: # %bb.0: # %entry
1838 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1839 ; X86-NEXT: kmovd %eax, %k1
1840 ; X86-NEXT: vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
1843 ; X64-LABEL: test_mm_maskz_shrdv_epi32:
1844 ; X64: # %bb.0: # %entry
1845 ; X64-NEXT: kmovd %edi, %k1
1846 ; X64-NEXT: vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
1849 %0 = bitcast <2 x i64> %__S to <4 x i32>
1850 %1 = bitcast <2 x i64> %__A to <4 x i32>
1851 %2 = bitcast <2 x i64> %__B to <4 x i32>
1852 %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
1853 %4 = bitcast i8 %__U to <8 x i1>
1854 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1855 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
1856 %6 = bitcast <4 x i32> %5 to <2 x i64>
1860 define <2 x i64> @test_mm_shrdv_epi32(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1861 ; CHECK-LABEL: test_mm_shrdv_epi32:
1862 ; CHECK: # %bb.0: # %entry
1863 ; CHECK-NEXT: vpshrdvd %xmm2, %xmm1, %xmm0
1864 ; CHECK-NEXT: ret{{[l|q]}}
1866 %0 = bitcast <2 x i64> %__S to <4 x i32>
1867 %1 = bitcast <2 x i64> %__A to <4 x i32>
1868 %2 = bitcast <2 x i64> %__B to <4 x i32>
1869 %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
1870 %4 = bitcast <4 x i32> %3 to <2 x i64>
1874 define <4 x i64> @test_mm256_mask_shrdv_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1875 ; X86-LABEL: test_mm256_mask_shrdv_epi16:
1876 ; X86: # %bb.0: # %entry
1877 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1878 ; X86-NEXT: vpshrdvw %ymm2, %ymm1, %ymm0 {%k1}
1881 ; X64-LABEL: test_mm256_mask_shrdv_epi16:
1882 ; X64: # %bb.0: # %entry
1883 ; X64-NEXT: kmovd %edi, %k1
1884 ; X64-NEXT: vpshrdvw %ymm2, %ymm1, %ymm0 {%k1}
1887 %0 = bitcast <4 x i64> %__S to <16 x i16>
1888 %1 = bitcast <4 x i64> %__A to <16 x i16>
1889 %2 = bitcast <4 x i64> %__B to <16 x i16>
1890 %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
1891 %4 = bitcast i16 %__U to <16 x i1>
1892 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %0
1893 %6 = bitcast <16 x i16> %5 to <4 x i64>
1897 define <4 x i64> @test_mm256_maskz_shrdv_epi16(i16 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1898 ; X86-LABEL: test_mm256_maskz_shrdv_epi16:
1899 ; X86: # %bb.0: # %entry
1900 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1901 ; X86-NEXT: vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
1904 ; X64-LABEL: test_mm256_maskz_shrdv_epi16:
1905 ; X64: # %bb.0: # %entry
1906 ; X64-NEXT: kmovd %edi, %k1
1907 ; X64-NEXT: vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
1910 %0 = bitcast <4 x i64> %__S to <16 x i16>
1911 %1 = bitcast <4 x i64> %__A to <16 x i16>
1912 %2 = bitcast <4 x i64> %__B to <16 x i16>
1913 %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
1914 %4 = bitcast i16 %__U to <16 x i1>
1915 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
1916 %6 = bitcast <16 x i16> %5 to <4 x i64>
1920 define <4 x i64> @test_mm256_shrdv_epi16(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
1921 ; CHECK-LABEL: test_mm256_shrdv_epi16:
1922 ; CHECK: # %bb.0: # %entry
1923 ; CHECK-NEXT: vpshrdvw %ymm2, %ymm1, %ymm0
1924 ; CHECK-NEXT: ret{{[l|q]}}
1926 %0 = bitcast <4 x i64> %__S to <16 x i16>
1927 %1 = bitcast <4 x i64> %__A to <16 x i16>
1928 %2 = bitcast <4 x i64> %__B to <16 x i16>
1929 %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
1930 %4 = bitcast <16 x i16> %3 to <4 x i64>
1934 define <2 x i64> @test_mm_mask_shrdv_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1935 ; X86-LABEL: test_mm_mask_shrdv_epi16:
1936 ; X86: # %bb.0: # %entry
1937 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1938 ; X86-NEXT: kmovd %eax, %k1
1939 ; X86-NEXT: vpshrdvw %xmm2, %xmm1, %xmm0 {%k1}
1942 ; X64-LABEL: test_mm_mask_shrdv_epi16:
1943 ; X64: # %bb.0: # %entry
1944 ; X64-NEXT: kmovd %edi, %k1
1945 ; X64-NEXT: vpshrdvw %xmm2, %xmm1, %xmm0 {%k1}
1948 %0 = bitcast <2 x i64> %__S to <8 x i16>
1949 %1 = bitcast <2 x i64> %__A to <8 x i16>
1950 %2 = bitcast <2 x i64> %__B to <8 x i16>
1951 %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
1952 %4 = bitcast i8 %__U to <8 x i1>
1953 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %0
1954 %6 = bitcast <8 x i16> %5 to <2 x i64>
1958 define <2 x i64> @test_mm_maskz_shrdv_epi16(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1959 ; X86-LABEL: test_mm_maskz_shrdv_epi16:
1960 ; X86: # %bb.0: # %entry
1961 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1962 ; X86-NEXT: kmovd %eax, %k1
1963 ; X86-NEXT: vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
1966 ; X64-LABEL: test_mm_maskz_shrdv_epi16:
1967 ; X64: # %bb.0: # %entry
1968 ; X64-NEXT: kmovd %edi, %k1
1969 ; X64-NEXT: vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
1972 %0 = bitcast <2 x i64> %__S to <8 x i16>
1973 %1 = bitcast <2 x i64> %__A to <8 x i16>
1974 %2 = bitcast <2 x i64> %__B to <8 x i16>
1975 %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
1976 %4 = bitcast i8 %__U to <8 x i1>
1977 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
1978 %6 = bitcast <8 x i16> %5 to <2 x i64>
1982 define <2 x i64> @test_mm_shrdv_epi16(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
1983 ; CHECK-LABEL: test_mm_shrdv_epi16:
1984 ; CHECK: # %bb.0: # %entry
1985 ; CHECK-NEXT: vpshrdvw %xmm2, %xmm1, %xmm0
1986 ; CHECK-NEXT: ret{{[l|q]}}
1988 %0 = bitcast <2 x i64> %__S to <8 x i16>
1989 %1 = bitcast <2 x i64> %__A to <8 x i16>
1990 %2 = bitcast <2 x i64> %__B to <8 x i16>
1991 %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
1992 %4 = bitcast <8 x i16> %3 to <2 x i64>
1996 declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
1997 declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
1998 declare void @llvm.masked.compressstore.v8i16(<8 x i16>, i16*, <8 x i1>)
1999 declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
2000 declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
2001 declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
2002 declare <8 x i16> @llvm.masked.expandload.v8i16(i16*, <8 x i1>, <8 x i16>)
2003 declare <16 x i8> @llvm.masked.expandload.v16i8(i8*, <16 x i1>, <16 x i8>)
2004 declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
2005 declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
2006 declare void @llvm.masked.compressstore.v16i16(<16 x i16>, i16*, <16 x i1>)
2007 declare void @llvm.masked.compressstore.v32i8(<32 x i8>, i8*, <32 x i1>)
2008 declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
2009 declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
2010 declare <16 x i16> @llvm.masked.expandload.v16i16(i16*, <16 x i1>, <16 x i16>)
2011 declare <32 x i8> @llvm.masked.expandload.v32i8(i8*, <32 x i1>, <32 x i8>)