test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
   4
   5 define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 256-bit dword form with register operands: both <4 x i64> arguments are
     ; bitcast to <8 x i32> and passed to llvm.x86.avx512.vp2intersect.d.256.
     ; Result 0 is stored through %m0 and result 1 through %m1, each as <8 x i1>.
     ; The expected code names only %k0 on vp2intersectd and then reads both
     ; %k0 and %k1, so the two masks come back in that mask-register pair.
   6 ; X86-LABEL: test_mm256_2intersect_epi32:
   7 ; X86:       # %bb.0: # %entry
   8 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   9 ; X86-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
  10 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
  11 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
  12 ; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
  13 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
  14 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
  15 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  16 ; X86-NEXT:    retl # encoding: [0xc3]
  17 ;
  18 ; X64-LABEL: test_mm256_2intersect_epi32:
  19 ; X64:       # %bb.0: # %entry
  20 ; X64-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
  21 ; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
  22 ; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
  23 ; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
  24 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
  25 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  26 ; X64-NEXT:    retq # encoding: [0xc3]
  27 entry:
  28   %0 = bitcast <4 x i64> %a to <8 x i32>
  29   %1 = bitcast <4 x i64> %b to <8 x i32>
  30   %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  31   %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  32   %4 = bitcast i8* %m0 to <8 x i1>*
  33   store <8 x i1> %3, <8 x i1>* %4, align 8
  34   %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  35   %6 = bitcast i8* %m1 to <8 x i1>*
  36   store <8 x i1> %5, <8 x i1>* %6, align 8
  37   ret void
  38 }
  39
  40 define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 256-bit qword form with register operands: %a and %b go straight to
     ; llvm.x86.avx512.vp2intersect.q.256, which yields two <4 x i1> masks.
     ; Each mask is widened to <8 x i1> by a shufflevector whose indices 4-7
     ; select from zeroinitializer, then bitcast to i8 and stored (result 0
     ; through %m0, result 1 through %m1).
     ; The expected kshiftlw/kshiftrw $12 pairs keep only the low four bits of
     ; each mask register before it is moved to a general-purpose register.
  41 ; X86-LABEL: test_mm256_2intersect_epi64:
  42 ; X86:       # %bb.0: # %entry
  43 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
  44 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
  45 ; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
  46 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
  47 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
  48 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
  49 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
  50 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
  51 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
  52 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
  53 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
  54 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  55 ; X86-NEXT:    retl # encoding: [0xc3]
  56 ;
  57 ; X64-LABEL: test_mm256_2intersect_epi64:
  58 ; X64:       # %bb.0: # %entry
  59 ; X64-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
  60 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
  61 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
  62 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
  63 ; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
  64 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
  65 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
  66 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
  67 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
  68 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  69 ; X64-NEXT:    retq # encoding: [0xc3]
  70 entry:
  71   %0 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  72   %1 = extractvalue { <4 x i1>, <4 x i1> } %0, 0
  73   %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  74   %3 = bitcast <8 x i1> %2 to i8
  75   store i8 %3, i8* %m0, align 1
  76   %4 = extractvalue { <4 x i1>, <4 x i1> } %0, 1
  77   %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  78   %6 = bitcast <8 x i1> %5 to i8
  79   store i8 %6, i8* %m1, align 1
  80   ret void
  81 }
  82
  83 define void @test_mm256_2intersect_epi32_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 256-bit dword form with memory operands: %a and %b are reinterpreted as
     ; <8 x i32>* and loaded with 32-byte alignment before the call to
     ; llvm.x86.avx512.vp2intersect.d.256. Result 0 is stored through %m0 and
     ; result 1 through %m1, each as <8 x i1>.
     ; The expected code loads the first operand with vmovaps and uses the
     ; second one directly as the memory operand of vp2intersectd.
  84 ; X86-LABEL: test_mm256_2intersect_epi32_p:
  85 ; X86:       # %bb.0: # %entry
  86 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
  87 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
  88 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
  89 ; X86-NEXT:    vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02]
  90 ; X86-NEXT:    vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
  91 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
  92 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
  93 ; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
  94 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
  95 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
  96 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  97 ; X86-NEXT:    retl # encoding: [0xc3]
  98 ;
  99 ; X64-LABEL: test_mm256_2intersect_epi32_p:
 100 ; X64:       # %bb.0: # %entry
 101 ; X64-NEXT:    vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
 102 ; X64-NEXT:    vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
 103 ; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 104 ; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
 105 ; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
 106 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 107 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 108 ; X64-NEXT:    retq # encoding: [0xc3]
 109 entry:
 110   %0 = bitcast <4 x i64>* %a to <8 x i32>*
 111   %1 = load <8 x i32>, <8 x i32>* %0, align 32
 112   %2 = bitcast <4 x i64>* %b to <8 x i32>*
 113   %3 = load <8 x i32>, <8 x i32>* %2, align 32
 114   %4 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %1, <8 x i32> %3)
 115   %5 = extractvalue { <8 x i1>, <8 x i1> } %4, 0
 116   %6 = bitcast i8* %m0 to <8 x i1>*
 117   store <8 x i1> %5, <8 x i1>* %6, align 8
 118   %7 = extractvalue { <8 x i1>, <8 x i1> } %4, 1
 119   %8 = bitcast i8* %m1 to <8 x i1>*
 120   store <8 x i1> %7, <8 x i1>* %8, align 8
 121   ret void
 122 }
 123
 124 define void @test_mm256_2intersect_epi64_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 256-bit qword form with memory operands: both <4 x i64> inputs are
     ; loaded (align 32) and passed to llvm.x86.avx512.vp2intersect.q.256.
     ; Each <4 x i1> result is widened to <8 x i1> with lanes 4-7 taken from
     ; zeroinitializer, bitcast to i8 and stored (result 0 through %m0,
     ; result 1 through %m1).
     ; The expected code loads the first operand with vmovaps, uses the second
     ; as the memory operand of vp2intersectq, and keeps only the low four mask
     ; bits via kshiftlw/kshiftrw $12. The 32-bit expectations also save and
     ; restore %esi, which holds one of the four pointer arguments.
 125 ; X86-LABEL: test_mm256_2intersect_epi64_p:
 126 ; X86:       # %bb.0: # %entry
 127 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 128 ; X86-NEXT:    .cfi_def_cfa_offset 8
 129 ; X86-NEXT:    .cfi_offset %esi, -8
 130 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 131 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 132 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 133 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 134 ; X86-NEXT:    vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06]
 135 ; X86-NEXT:    vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
 136 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 137 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 138 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 139 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 140 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 141 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 142 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 143 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 144 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 145 ; X86-NEXT:    .cfi_def_cfa_offset 4
 146 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 147 ; X86-NEXT:    retl # encoding: [0xc3]
 148 ;
 149 ; X64-LABEL: test_mm256_2intersect_epi64_p:
 150 ; X64:       # %bb.0: # %entry
 151 ; X64-NEXT:    vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
 152 ; X64-NEXT:    vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
 153 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 154 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 155 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 156 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 157 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 158 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 159 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 160 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 161 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 162 ; X64-NEXT:    retq # encoding: [0xc3]
 163 entry:
 164   %0 = load <4 x i64>, <4 x i64>* %a, align 32
 165   %1 = load <4 x i64>, <4 x i64>* %b, align 32
 166   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %0, <4 x i64> %1)
 167   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
 168   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 169   %5 = bitcast <8 x i1> %4 to i8
 170   store i8 %5, i8* %m0, align 1
 171   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 172   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 173   %8 = bitcast <8 x i1> %7 to i8
 174   store i8 %8, i8* %m1, align 1
 175   ret void
 176 }
 177
 178 define void @test_mm256_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 256-bit dword form with broadcast operands: one i32 is loaded from each
     ; of %a and %b and splatted across <8 x i32> (insertelement into lane 0,
     ; then shufflevector with an all-zero mask) before the call to
     ; llvm.x86.avx512.vp2intersect.d.256. Result 0 is stored through %m0 and
     ; result 1 through %m1, each as <8 x i1>.
     ; The expected code materializes the first splat with vbroadcastss and
     ; folds the second into vp2intersectd as a {1to8} broadcast memory operand.
 179 ; X86-LABEL: test_mm256_2intersect_epi32_b:
 180 ; X86:       # %bb.0: # %entry
 181 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
 182 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 183 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
 184 ; X86-NEXT:    vbroadcastss (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x02]
 185 ; X86-NEXT:    vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
 186 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 187 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
 188 ; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
 189 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
 190 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 191 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 192 ; X86-NEXT:    retl # encoding: [0xc3]
 193 ;
 194 ; X64-LABEL: test_mm256_2intersect_epi32_b:
 195 ; X64:       # %bb.0: # %entry
 196 ; X64-NEXT:    vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07]
 197 ; X64-NEXT:    vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
 198 ; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 199 ; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
 200 ; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
 201 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 202 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 203 ; X64-NEXT:    retq # encoding: [0xc3]
 204 entry:
 205   %0 = load i32, i32* %a, align 4
 206   %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i32 0
 207   %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
 208   %1 = load i32, i32* %b, align 4
 209   %vecinit.i.i2 = insertelement <8 x i32> undef, i32 %1, i32 0
 210   %vecinit7.i.i3 = shufflevector <8 x i32> %vecinit.i.i2, <8 x i32> undef, <8 x i32> zeroinitializer
 211   %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %vecinit7.i.i, <8 x i32> %vecinit7.i.i3)
 212   %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
 213   %4 = bitcast i8* %m0 to <8 x i1>*
 214   store <8 x i1> %3, <8 x i1>* %4, align 8
 215   %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
 216   %6 = bitcast i8* %m1 to <8 x i1>*
 217   store <8 x i1> %5, <8 x i1>* %6, align 8
 218   ret void
 219 }
 220
 221 define void @test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 256-bit qword form with broadcast operands: one i64 is loaded from each
     ; of %a and %b and splatted across <4 x i64> (insertelement into lane 0,
     ; then shufflevector with an all-zero mask) before the call to
     ; llvm.x86.avx512.vp2intersect.q.256. Each <4 x i1> result is widened to
     ; <8 x i1> with lanes 4-7 taken from zeroinitializer, bitcast to i8 and
     ; stored (result 0 through %m0, result 1 through %m1).
     ; The expected code materializes the first splat with vbroadcastsd, folds
     ; the second into vp2intersectq as a {1to4} broadcast memory operand, and
     ; keeps only the low four mask bits via kshiftlw/kshiftrw $12.
 222 ; X86-LABEL: test_mm256_2intersect_epi64_b:
 223 ; X86:       # %bb.0: # %entry
 224 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 225 ; X86-NEXT:    .cfi_def_cfa_offset 8
 226 ; X86-NEXT:    .cfi_offset %esi, -8
 227 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 228 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 229 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 230 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 231 ; X86-NEXT:    vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06]
 232 ; X86-NEXT:    vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
 233 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 234 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 235 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 236 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 237 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 238 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 239 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 240 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 241 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 242 ; X86-NEXT:    .cfi_def_cfa_offset 4
 243 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 244 ; X86-NEXT:    retl # encoding: [0xc3]
 245 ;
 246 ; X64-LABEL: test_mm256_2intersect_epi64_b:
 247 ; X64:       # %bb.0: # %entry
 248 ; X64-NEXT:    vbroadcastsd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x07]
 249 ; X64-NEXT:    vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
 250 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 251 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 252 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 253 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 254 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 255 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 256 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 257 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 258 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 259 ; X64-NEXT:    retq # encoding: [0xc3]
 260 entry:
 261   %0 = load i64, i64* %a, align 8
 262   %vecinit.i.i = insertelement <4 x i64> undef, i64 %0, i32 0
 263   %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
 264   %1 = load i64, i64* %b, align 8
 265   %vecinit.i.i2 = insertelement <4 x i64> undef, i64 %1, i32 0
 266   %vecinit3.i.i3 = shufflevector <4 x i64> %vecinit.i.i2, <4 x i64> undef, <4 x i32> zeroinitializer
 267   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %vecinit3.i.i, <4 x i64> %vecinit3.i.i3)
 268   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
 269   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 270   %5 = bitcast <8 x i1> %4 to i8
 271   store i8 %5, i8* %m0, align 1
 272   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 273   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 274   %8 = bitcast <8 x i1> %7 to i8
 275   store i8 %8, i8* %m1, align 1
 276   ret void
 277 }
 278
 279 define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 128-bit dword form with register operands: both <2 x i64> arguments are
     ; bitcast to <4 x i32> and passed to llvm.x86.avx512.vp2intersect.d.128,
     ; which yields two <4 x i1> masks. Each mask is widened to <8 x i1> with
     ; lanes 4-7 taken from zeroinitializer, bitcast to i8 and stored (result 0
     ; through %m0, result 1 through %m1).
     ; The expected code uses vp2intersectd on %xmm registers and keeps only
     ; the low four mask bits via kshiftlw/kshiftrw $12.
 280 ; X86-LABEL: test_mm_2intersect_epi32:
 281 ; X86:       # %bb.0: # %entry
 282 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 283 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 284 ; X86-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
 285 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 286 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 287 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 288 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 289 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 290 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 291 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 292 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 293 ; X86-NEXT:    retl # encoding: [0xc3]
 294 ;
 295 ; X64-LABEL: test_mm_2intersect_epi32:
 296 ; X64:       # %bb.0: # %entry
 297 ; X64-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
 298 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 299 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 300 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 301 ; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
 302 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 303 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 304 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 305 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
 306 ; X64-NEXT:    retq # encoding: [0xc3]
 307 entry:
 308   %0 = bitcast <2 x i64> %a to <4 x i32>
 309   %1 = bitcast <2 x i64> %b to <4 x i32>
 310   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
 311   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
 312   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 313   %5 = bitcast <8 x i1> %4 to i8
 314   store i8 %5, i8* %m0, align 1
 315   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 316   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 317   %8 = bitcast <8 x i1> %7 to i8
 318   store i8 %8, i8* %m1, align 1
 319   ret void
 320 }
 321
 322 define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 128-bit qword form with register operands: %a and %b go straight to
     ; llvm.x86.avx512.vp2intersect.q.128, which yields two <2 x i1> masks.
     ; Each mask is widened to <8 x i1>: shuffle indices 0-1 select the mask
     ; lanes, and every later index (2 or 3) selects a zeroinitializer lane.
     ; The widened value is bitcast to i8 and stored (result 0 through %m0,
     ; result 1 through %m1).
     ; The expected kshiftlw/kshiftrw $14 pairs keep only the low two bits of
     ; each mask register before it is moved to a general-purpose register.
 323 ; X86-LABEL: test_mm_2intersect_epi64:
 324 ; X86:       # %bb.0: # %entry
 325 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 326 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 327 ; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
 328 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 329 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 330 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 331 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 332 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 333 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 334 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 335 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 336 ; X86-NEXT:    retl # encoding: [0xc3]
 337 ;
 338 ; X64-LABEL: test_mm_2intersect_epi64:
 339 ; X64:       # %bb.0: # %entry
 340 ; X64-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
 341 ; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 342 ; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 343 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 344 ; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
 345 ; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 346 ; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 347 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 348 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
 349 ; X64-NEXT:    retq # encoding: [0xc3]
 350 entry:
 351   %0 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
 352   %1 = extractvalue { <2 x i1>, <2 x i1> } %0, 0
 353   %2 = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 354   %3 = bitcast <8 x i1> %2 to i8
 355   store i8 %3, i8* %m0, align 1
 356   %4 = extractvalue { <2 x i1>, <2 x i1> } %0, 1
 357   %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 358   %6 = bitcast <8 x i1> %5 to i8
 359   store i8 %6, i8* %m1, align 1
 360   ret void
 361 }
 362
 363 define void @test_mm_2intersect_epi32_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 128-bit dword form with memory operands: %a and %b are reinterpreted as
     ; <4 x i32>* and loaded with 16-byte alignment before the call to
     ; llvm.x86.avx512.vp2intersect.d.128. Each <4 x i1> result is widened to
     ; <8 x i1> with lanes 4-7 taken from zeroinitializer, bitcast to i8 and
     ; stored (result 0 through %m0, result 1 through %m1).
     ; The expected code loads the first operand with vmovaps, uses the second
     ; as the memory operand of vp2intersectd, and keeps only the low four mask
     ; bits via kshiftlw/kshiftrw $12.
 364 ; X86-LABEL: test_mm_2intersect_epi32_p:
 365 ; X86:       # %bb.0: # %entry
 366 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 367 ; X86-NEXT:    .cfi_def_cfa_offset 8
 368 ; X86-NEXT:    .cfi_offset %esi, -8
 369 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 370 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 371 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 372 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 373 ; X86-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
 374 ; X86-NEXT:    vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
 375 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 376 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 377 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 378 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 379 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 380 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 381 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 382 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 383 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 384 ; X86-NEXT:    .cfi_def_cfa_offset 4
 385 ; X86-NEXT:    retl # encoding: [0xc3]
 386 ;
 387 ; X64-LABEL: test_mm_2intersect_epi32_p:
 388 ; X64:       # %bb.0: # %entry
 389 ; X64-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
 390 ; X64-NEXT:    vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
 391 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 392 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 393 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 394 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 395 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 396 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 397 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 398 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 399 ; X64-NEXT:    retq # encoding: [0xc3]
 400 entry:
 401   %0 = bitcast <2 x i64>* %a to <4 x i32>*
 402   %1 = load <4 x i32>, <4 x i32>* %0, align 16
 403   %2 = bitcast <2 x i64>* %b to <4 x i32>*
 404   %3 = load <4 x i32>, <4 x i32>* %2, align 16
 405   %4 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %1, <4 x i32> %3)
 406   %5 = extractvalue { <4 x i1>, <4 x i1> } %4, 0
 407   %6 = shufflevector <4 x i1> %5, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 408   %7 = bitcast <8 x i1> %6 to i8
 409   store i8 %7, i8* %m0, align 1
 410   %8 = extractvalue { <4 x i1>, <4 x i1> } %4, 1
 411   %9 = shufflevector <4 x i1> %8, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 412   %10 = bitcast <8 x i1> %9 to i8
 413   store i8 %10, i8* %m1, align 1
 414   ret void
 415 }
 416
 417 define void @test_mm_2intersect_epi64_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 128-bit qword form with memory operands: both <2 x i64> inputs are
     ; loaded (align 16) and passed to llvm.x86.avx512.vp2intersect.q.128.
     ; Each <2 x i1> result is widened to <8 x i1>: shuffle indices 0-1 select
     ; the mask lanes, and every later index (2 or 3) selects a zeroinitializer
     ; lane. The widened value is bitcast to i8 and stored (result 0 through
     ; %m0, result 1 through %m1).
     ; The expected code loads the first operand with vmovaps, uses the second
     ; as the memory operand of vp2intersectq, and keeps only the low two mask
     ; bits via kshiftlw/kshiftrw $14.
 418 ; X86-LABEL: test_mm_2intersect_epi64_p:
 419 ; X86:       # %bb.0: # %entry
 420 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 421 ; X86-NEXT:    .cfi_def_cfa_offset 8
 422 ; X86-NEXT:    .cfi_offset %esi, -8
 423 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 424 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 425 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 426 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 427 ; X86-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
 428 ; X86-NEXT:    vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
 429 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 430 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 431 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 432 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 433 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 434 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 435 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 436 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 437 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 438 ; X86-NEXT:    .cfi_def_cfa_offset 4
 439 ; X86-NEXT:    retl # encoding: [0xc3]
 440 ;
 441 ; X64-LABEL: test_mm_2intersect_epi64_p:
 442 ; X64:       # %bb.0: # %entry
 443 ; X64-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
 444 ; X64-NEXT:    vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
 445 ; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 446 ; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 447 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 448 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 449 ; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 450 ; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 451 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 452 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 453 ; X64-NEXT:    retq # encoding: [0xc3]
 454 entry:
 455   %0 = load <2 x i64>, <2 x i64>* %a, align 16
 456   %1 = load <2 x i64>, <2 x i64>* %b, align 16
 457   %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %0, <2 x i64> %1)
 458   %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
 459   %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 460   %5 = bitcast <8 x i1> %4 to i8
 461   store i8 %5, i8* %m0, align 1
 462   %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
 463   %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 464   %8 = bitcast <8 x i1> %7 to i8
 465   store i8 %8, i8* %m1, align 1
 466   ret void
 467 }
 468
 469 define void @test_mm_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; 128-bit dword form with broadcast operands: one i32 is loaded from each
     ; of %a and %b and splatted across <4 x i32> (insertelement into lane 0,
     ; then shufflevector with an all-zero mask) before the call to
     ; llvm.x86.avx512.vp2intersect.d.128. Each <4 x i1> result is widened to
     ; <8 x i1> with lanes 4-7 taken from zeroinitializer, bitcast to i8 and
     ; stored (result 0 through %m0, result 1 through %m1).
     ; The expected code materializes the first splat with vbroadcastss, folds
     ; the second into vp2intersectd as a {1to4} broadcast memory operand, and
     ; keeps only the low four mask bits via kshiftlw/kshiftrw $12.
 470 ; X86-LABEL: test_mm_2intersect_epi32_b:
 471 ; X86:       # %bb.0: # %entry
 472 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 473 ; X86-NEXT:    .cfi_def_cfa_offset 8
 474 ; X86-NEXT:    .cfi_offset %esi, -8
 475 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 476 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 477 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 478 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 479 ; X86-NEXT:    vbroadcastss (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x06]
 480 ; X86-NEXT:    vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
 481 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 482 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 483 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 484 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 485 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 486 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 487 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 488 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 489 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 490 ; X86-NEXT:    .cfi_def_cfa_offset 4
 491 ; X86-NEXT:    retl # encoding: [0xc3]
 492 ;
 493 ; X64-LABEL: test_mm_2intersect_epi32_b:
 494 ; X64:       # %bb.0: # %entry
 495 ; X64-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
 496 ; X64-NEXT:    vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
 497 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 498 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 499 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 500 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 501 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 502 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 503 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 504 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 505 ; X64-NEXT:    retq # encoding: [0xc3]
 506 entry:
 507   %0 = load i32, i32* %a, align 4
 508   %vecinit.i.i = insertelement <4 x i32> undef, i32 %0, i32 0
 509   %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
 510   %1 = load i32, i32* %b, align 4
 511   %vecinit.i.i2 = insertelement <4 x i32> undef, i32 %1, i32 0
 512   %vecinit3.i.i3 = shufflevector <4 x i32> %vecinit.i.i2, <4 x i32> undef, <4 x i32> zeroinitializer
 513   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %vecinit3.i.i, <4 x i32> %vecinit3.i.i3)
 514   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
 515   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 516   %5 = bitcast <8 x i1> %4 to i8
 517   store i8 %5, i8* %m0, align 1
 518   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 519   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 520   %8 = bitcast <8 x i1> %7 to i8
 521   store i8 %8, i8* %m1, align 1
 522   ret void
 523 }
 524
 525 define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
     ; Codegen test for @llvm.x86.avx512.vp2intersect.q.128 where both operands are
     ; splats of an i64 loaded from memory (%a and %b). The expected output below
     ; shows the first splat loaded with vmovddup and the second folded into
     ; vp2intersectq as a {1to2} memory operand. The two results are read from
     ; %k0 and %k1; each kshiftlw/kshiftrw by 14 pair leaves only the low two bits
     ; of the 16-bit mask before the byte is stored through %m0 / %m1.
     ; On i686 the four pointer arguments are loaded from the stack and %esi is
     ; saved and restored around the body.
 526 ; X86-LABEL: test_mm_2intersect_epi64_b:
 527 ; X86:       # %bb.0: # %entry
 528 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 529 ; X86-NEXT:    .cfi_def_cfa_offset 8
 530 ; X86-NEXT:    .cfi_offset %esi, -8
 531 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 532 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 533 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 534 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 535 ; X86-NEXT:    vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06]
 536 ; X86-NEXT:    # xmm0 = mem[0,0]
 537 ; X86-NEXT:    vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
 538 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 539 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 540 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 541 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 542 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 543 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 544 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 545 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 546 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 547 ; X86-NEXT:    .cfi_def_cfa_offset 4
 548 ; X86-NEXT:    retl # encoding: [0xc3]
 549 ;
 550 ; X64-LABEL: test_mm_2intersect_epi64_b:
 551 ; X64:       # %bb.0: # %entry
 552 ; X64-NEXT:    vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07]
 553 ; X64-NEXT:    # xmm0 = mem[0,0]
 554 ; X64-NEXT:    vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
 555 ; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 556 ; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 557 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 558 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 559 ; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 560 ; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 561 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 562 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 563 ; X64-NEXT:    retq # encoding: [0xc3]
 564 entry:
       ; Splat the i64 loaded from %a across both lanes of a <2 x i64>.
 565   %0 = load i64, i64* %a, align 8
 566   %vecinit.i.i = insertelement <2 x i64> undef, i64 %0, i32 0
 567   %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
       ; Same splat for the i64 loaded from %b.
 568   %1 = load i64, i64* %b, align 8
 569   %vecinit.i.i2 = insertelement <2 x i64> undef, i64 %1, i32 0
 570   %vecinit1.i.i3 = shufflevector <2 x i64> %vecinit.i.i2, <2 x i64> undef, <2 x i32> zeroinitializer
       ; Intersect the two splats; the intrinsic returns a pair of <2 x i1> results.
 571   %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %vecinit1.i.i, <2 x i64> %vecinit1.i.i3)
       ; First result: widen to <8 x i1>. Shuffle indices 2 and 3 select elements of
       ; the zeroinitializer operand, so lanes 2-7 are zero. The widened vector is
       ; bitcast to i8 and stored through %m0.
 572   %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
 573   %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 574   %5 = bitcast <8 x i1> %4 to i8
 575   store i8 %5, i8* %m0, align 1
       ; Second result: same zero-extending widen, stored through %m1.
 576   %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
 577   %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 578   %8 = bitcast <8 x i1> %7 to i8
 579   store i8 %8, i8* %m1, align 1
 580   ret void
 581 }
 582
 583 declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>) ; two <8 x i32> inputs -> pair of <8 x i1>
 584 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>) ; two <4 x i64> inputs -> pair of <4 x i1>
 585 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>) ; two <4 x i32> inputs -> pair of <4 x i1>
 586 declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>) ; two <2 x i64> inputs -> pair of <2 x i1>