test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
   4
   ; 256-bit dword form with register operands: calls
   ; @llvm.x86.avx512.vp2intersect.d.256 on %a and %b (bitcast to <8 x i32>)
   ; and stores the two <8 x i1> results through %m0 and %m1.
   ; The expected code names only %k0 on vp2intersectd but reads the two
   ; results back from %k0 (stored via %m0) and %k1 (stored via %m1).
   5 define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
   6 ; X86-LABEL: test_mm256_2intersect_epi32:
   7 ; X86:       # %bb.0: # %entry
   8 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   9 ; X86-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
  10 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
  11 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
  12 ; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
  13 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
  14 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
  15 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  16 ; X86-NEXT:    retl # encoding: [0xc3]
  17 ;
  18 ; X64-LABEL: test_mm256_2intersect_epi32:
  19 ; X64:       # %bb.0: # %entry
  20 ; X64-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
  21 ; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
  22 ; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
  23 ; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
  24 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
  25 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  26 ; X64-NEXT:    retq # encoding: [0xc3]
  27 entry:
  28   %0 = bitcast <4 x i64> %a to <8 x i32>
  29   %1 = bitcast <4 x i64> %b to <8 x i32>
  30   %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
       ; Element 0 of the returned pair is stored via %m0, element 1 via %m1.
       ; Each <8 x i1> is stored directly through a pointer cast of the i8*.
  31   %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  32   %4 = bitcast i8* %m0 to <8 x i1>*
  33   store <8 x i1> %3, <8 x i1>* %4, align 8
  34   %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  35   %6 = bitcast i8* %m1 to <8 x i1>*
  36   store <8 x i1> %5, <8 x i1>* %6, align 8
  37   ret void
  38 }
  39
  ; 256-bit qword form with register operands: calls
  ; @llvm.x86.avx512.vp2intersect.q.256 on %a and %b, widens each <4 x i1>
  ; result to <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
  ; The expected code applies a kshiftlw $12 / kshiftrw $12 pair to each mask
  ; before the byte store, which leaves only the low 4 mask bits set.
  40 define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
  41 ; X86-LABEL: test_mm256_2intersect_epi64:
  42 ; X86:       # %bb.0: # %entry
  43 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
  44 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
  45 ; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
  46 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
  47 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
  48 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
  49 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
  50 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
  51 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
  52 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
  53 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
  54 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  55 ; X86-NEXT:    retl # encoding: [0xc3]
  56 ;
  57 ; X64-LABEL: test_mm256_2intersect_epi64:
  58 ; X64:       # %bb.0: # %entry
  59 ; X64-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
  60 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
  61 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
  62 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
  63 ; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
  64 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
  65 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
  66 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
  67 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
  68 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  69 ; X64-NEXT:    retq # encoding: [0xc3]
  70 entry:
  71   %0 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  72   %1 = extractvalue { <4 x i1>, <4 x i1> } %0, 0
       ; Shuffle indices 4-7 select from the zeroinitializer operand, so the
       ; upper four lanes of the widened mask are zero.
  73   %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  74   %3 = bitcast <8 x i1> %2 to i8
  75   store i8 %3, i8* %m0, align 1
  76   %4 = extractvalue { <4 x i1>, <4 x i1> } %0, 1
       ; Same zero-extending widen for the second result.
  77   %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  78   %6 = bitcast <8 x i1> %5 to i8
  79   store i8 %6, i8* %m1, align 1
  80   ret void
  81 }
  82
  ; 256-bit dword form with both operands loaded from memory: loads
  ; <8 x i32> from %a and %b (align 32), calls
  ; @llvm.x86.avx512.vp2intersect.d.256 and stores the two <8 x i1> results
  ; through %m0 and %m1.
  ; The expected code loads %a with vmovaps and uses %b directly as the
  ; memory operand of vp2intersectd.
  83 define void @test_mm256_2intersect_epi32_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
  84 ; X86-LABEL: test_mm256_2intersect_epi32_p:
  85 ; X86:       # %bb.0: # %entry
  86 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
  87 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
  88 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
  89 ; X86-NEXT:    vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02]
  90 ; X86-NEXT:    vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
  91 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
  92 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
  93 ; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
  94 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
  95 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
  96 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
  97 ; X86-NEXT:    retl # encoding: [0xc3]
  98 ;
  99 ; X64-LABEL: test_mm256_2intersect_epi32_p:
 100 ; X64:       # %bb.0: # %entry
 101 ; X64-NEXT:    vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
 102 ; X64-NEXT:    vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
 103 ; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 104 ; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
 105 ; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
 106 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 107 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 108 ; X64-NEXT:    retq # encoding: [0xc3]
 109 entry:
       ; Both vector operands are read through pointer casts with 32-byte
       ; alignment.
 110   %0 = bitcast <4 x i64>* %a to <8 x i32>*
 111   %1 = load <8 x i32>, <8 x i32>* %0, align 32
 112   %2 = bitcast <4 x i64>* %b to <8 x i32>*
 113   %3 = load <8 x i32>, <8 x i32>* %2, align 32
 114   %4 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %1, <8 x i32> %3)
 115   %5 = extractvalue { <8 x i1>, <8 x i1> } %4, 0
 116   %6 = bitcast i8* %m0 to <8 x i1>*
 117   store <8 x i1> %5, <8 x i1>* %6, align 8
 118   %7 = extractvalue { <8 x i1>, <8 x i1> } %4, 1
 119   %8 = bitcast i8* %m1 to <8 x i1>*
 120   store <8 x i1> %7, <8 x i1>* %8, align 8
 121   ret void
 122 }
 123
 ; 256-bit qword form with both operands loaded from memory: loads
 ; <4 x i64> from %a and %b (align 32), calls
 ; @llvm.x86.avx512.vp2intersect.q.256, widens each <4 x i1> result to
 ; <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
 ; The expected code loads %a with vmovaps, uses %b as the memory operand of
 ; vp2intersectq, and applies kshiftlw/kshiftrw $12 to each mask before the
 ; byte store. The X86 checks also expect %esi to be pushed and popped.
 124 define void @test_mm256_2intersect_epi64_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
 125 ; X86-LABEL: test_mm256_2intersect_epi64_p:
 126 ; X86:       # %bb.0: # %entry
 127 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 128 ; X86-NEXT:    .cfi_def_cfa_offset 8
 129 ; X86-NEXT:    .cfi_offset %esi, -8
 130 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 131 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 132 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 133 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 134 ; X86-NEXT:    vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06]
 135 ; X86-NEXT:    vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
 136 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 137 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 138 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 139 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 140 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 141 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 142 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 143 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 144 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 145 ; X86-NEXT:    .cfi_def_cfa_offset 4
 146 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 147 ; X86-NEXT:    retl # encoding: [0xc3]
 148 ;
 149 ; X64-LABEL: test_mm256_2intersect_epi64_p:
 150 ; X64:       # %bb.0: # %entry
 151 ; X64-NEXT:    vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
 152 ; X64-NEXT:    vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
 153 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 154 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 155 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 156 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 157 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 158 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 159 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 160 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 161 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 162 ; X64-NEXT:    retq # encoding: [0xc3]
 163 entry:
 164   %0 = load <4 x i64>, <4 x i64>* %a, align 32
 165   %1 = load <4 x i64>, <4 x i64>* %b, align 32
 166   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %0, <4 x i64> %1)
 167   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
       ; Shuffle indices 4-7 select from the zeroinitializer operand, so the
       ; upper four lanes of the widened mask are zero.
 168   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 169   %5 = bitcast <8 x i1> %4 to i8
 170   store i8 %5, i8* %m0, align 1
 171   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
       ; Same zero-extending widen for the second result.
 172   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 173   %8 = bitcast <8 x i1> %7 to i8
 174   store i8 %8, i8* %m1, align 1
 175   ret void
 176 }
 177
 ; 256-bit dword form with broadcast operands: loads one i32 from each of %a
 ; and %b, splats each across an <8 x i32>, calls
 ; @llvm.x86.avx512.vp2intersect.d.256 and stores the two <8 x i1> results
 ; through %m0 and %m1.
 ; The expected code materializes the %a splat with vbroadcastss and passes
 ; %b to vp2intersectd as a {1to8} broadcast memory operand.
 178 define void @test_mm256_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
 179 ; X86-LABEL: test_mm256_2intersect_epi32_b:
 180 ; X86:       # %bb.0: # %entry
 181 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
 182 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 183 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
 184 ; X86-NEXT:    vbroadcastss (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x02]
 185 ; X86-NEXT:    vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
 186 ; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 187 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
 188 ; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
 189 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
 190 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 191 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 192 ; X86-NEXT:    retl # encoding: [0xc3]
 193 ;
 194 ; X64-LABEL: test_mm256_2intersect_epi32_b:
 195 ; X64:       # %bb.0: # %entry
 196 ; X64-NEXT:    vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07]
 197 ; X64-NEXT:    vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
 198 ; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 199 ; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
 200 ; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
 201 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 202 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 203 ; X64-NEXT:    retq # encoding: [0xc3]
 204 entry:
       ; Splat idiom: insert the scalar into lane 0, then shuffle with an
       ; all-zero mask so every lane reads lane 0.
 205   %0 = load i32, i32* %a, align 4
 206   %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i32 0
 207   %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
 208   %1 = load i32, i32* %b, align 4
 209   %vecinit.i.i2 = insertelement <8 x i32> undef, i32 %1, i32 0
 210   %vecinit7.i.i3 = shufflevector <8 x i32> %vecinit.i.i2, <8 x i32> undef, <8 x i32> zeroinitializer
 211   %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %vecinit7.i.i, <8 x i32> %vecinit7.i.i3)
 212   %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
 213   %4 = bitcast i8* %m0 to <8 x i1>*
 214   store <8 x i1> %3, <8 x i1>* %4, align 8
 215   %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
 216   %6 = bitcast i8* %m1 to <8 x i1>*
 217   store <8 x i1> %5, <8 x i1>* %6, align 8
 218   ret void
 219 }
 220
 ; 256-bit qword form with broadcast operands: loads one i64 from each of %a
 ; and %b, splats each across a <4 x i64>, calls
 ; @llvm.x86.avx512.vp2intersect.q.256, widens each <4 x i1> result to
 ; <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
 ; The X64 checks expect %b as a {1to4} broadcast memory operand of
 ; vp2intersectq; the X86 checks instead expect two vbroadcastsd loads and
 ; the register form of vp2intersectq.
 ; NOTE(review): the reason the X86 output does not use the {1to4} form is
 ; not visible in this file - confirm whether that is intended.
 221 define void @test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
 222 ; X86-LABEL: test_mm256_2intersect_epi64_b:
 223 ; X86:       # %bb.0: # %entry
 224 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 225 ; X86-NEXT:    .cfi_def_cfa_offset 8
 226 ; X86-NEXT:    .cfi_offset %esi, -8
 227 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 228 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 229 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 230 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 231 ; X86-NEXT:    vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06]
 232 ; X86-NEXT:    vbroadcastsd (%edx), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x0a]
 233 ; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
 234 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 235 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 236 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 237 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 238 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 239 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 240 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 241 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 242 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 243 ; X86-NEXT:    .cfi_def_cfa_offset 4
 244 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 245 ; X86-NEXT:    retl # encoding: [0xc3]
 246 ;
 247 ; X64-LABEL: test_mm256_2intersect_epi64_b:
 248 ; X64:       # %bb.0: # %entry
 249 ; X64-NEXT:    vbroadcastsd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x07]
 250 ; X64-NEXT:    vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
 251 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 252 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 253 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 254 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 255 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 256 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 257 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 258 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 259 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 260 ; X64-NEXT:    retq # encoding: [0xc3]
 261 entry:
       ; Splat idiom: insert the scalar into lane 0, then shuffle with an
       ; all-zero mask so every lane reads lane 0.
 262   %0 = load i64, i64* %a, align 8
 263   %vecinit.i.i = insertelement <4 x i64> undef, i64 %0, i32 0
 264   %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
 265   %1 = load i64, i64* %b, align 8
 266   %vecinit.i.i2 = insertelement <4 x i64> undef, i64 %1, i32 0
 267   %vecinit3.i.i3 = shufflevector <4 x i64> %vecinit.i.i2, <4 x i64> undef, <4 x i32> zeroinitializer
 268   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %vecinit3.i.i, <4 x i64> %vecinit3.i.i3)
 269   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
       ; Shuffle indices 4-7 select from the zeroinitializer operand, so the
       ; upper four lanes of the widened mask are zero.
 270   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 271   %5 = bitcast <8 x i1> %4 to i8
 272   store i8 %5, i8* %m0, align 1
 273   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 274   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 275   %8 = bitcast <8 x i1> %7 to i8
 276   store i8 %8, i8* %m1, align 1
 277   ret void
 278 }
 279
 ; 128-bit dword form with register operands: calls
 ; @llvm.x86.avx512.vp2intersect.d.128 on %a and %b (bitcast to <4 x i32>),
 ; widens each <4 x i1> result to <8 x i1>, bitcasts it to i8 and stores it
 ; via %m0 / %m1.
 ; The expected code uses the xmm form of vp2intersectd and applies
 ; kshiftlw/kshiftrw $12 to each mask, leaving only the low 4 mask bits.
 280 define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
 281 ; X86-LABEL: test_mm_2intersect_epi32:
 282 ; X86:       # %bb.0: # %entry
 283 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 284 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 285 ; X86-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
 286 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 287 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 288 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 289 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 290 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 291 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 292 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 293 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 294 ; X86-NEXT:    retl # encoding: [0xc3]
 295 ;
 296 ; X64-LABEL: test_mm_2intersect_epi32:
 297 ; X64:       # %bb.0: # %entry
 298 ; X64-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
 299 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 300 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 301 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 302 ; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
 303 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 304 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 305 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 306 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
 307 ; X64-NEXT:    retq # encoding: [0xc3]
 308 entry:
 309   %0 = bitcast <2 x i64> %a to <4 x i32>
 310   %1 = bitcast <2 x i64> %b to <4 x i32>
 311   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
 312   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
       ; Shuffle indices 4-7 select from the zeroinitializer operand, so the
       ; upper four lanes of the widened mask are zero.
 313   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 314   %5 = bitcast <8 x i1> %4 to i8
 315   store i8 %5, i8* %m0, align 1
 316   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 317   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 318   %8 = bitcast <8 x i1> %7 to i8
 319   store i8 %8, i8* %m1, align 1
 320   ret void
 321 }
 322
 ; 128-bit qword form with register operands: calls
 ; @llvm.x86.avx512.vp2intersect.q.128 on %a and %b, widens each <2 x i1>
 ; result to <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
 ; The expected code uses the xmm form of vp2intersectq and applies
 ; kshiftlw/kshiftrw $14 to each mask, leaving only the low 2 mask bits.
 323 define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
 324 ; X86-LABEL: test_mm_2intersect_epi64:
 325 ; X86:       # %bb.0: # %entry
 326 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 327 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 328 ; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
 329 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 330 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 331 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 332 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 333 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 334 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 335 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 336 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 337 ; X86-NEXT:    retl # encoding: [0xc3]
 338 ;
 339 ; X64-LABEL: test_mm_2intersect_epi64:
 340 ; X64:       # %bb.0: # %entry
 341 ; X64-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
 342 ; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 343 ; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 344 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 345 ; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
 346 ; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 347 ; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 348 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 349 ; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
 350 ; X64-NEXT:    retq # encoding: [0xc3]
 351 entry:
 352   %0 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
 353   %1 = extractvalue { <2 x i1>, <2 x i1> } %0, 0
       ; With 2-element inputs, shuffle indices 2 and 3 refer to the
       ; zeroinitializer operand, so lanes 2-7 of the widened mask are zero.
 354   %2 = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 355   %3 = bitcast <8 x i1> %2 to i8
 356   store i8 %3, i8* %m0, align 1
 357   %4 = extractvalue { <2 x i1>, <2 x i1> } %0, 1
 358   %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 359   %6 = bitcast <8 x i1> %5 to i8
 360   store i8 %6, i8* %m1, align 1
 361   ret void
 362 }
 363
 ; 128-bit dword form with both operands loaded from memory: loads
 ; <4 x i32> from %a and %b (align 16), calls
 ; @llvm.x86.avx512.vp2intersect.d.128, widens each <4 x i1> result to
 ; <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
 ; The expected code loads %a with vmovaps, uses %b as the memory operand of
 ; vp2intersectd, and applies kshiftlw/kshiftrw $12 to each mask before the
 ; byte store. The X86 checks also expect %esi to be pushed and popped.
 364 define void @test_mm_2intersect_epi32_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
 365 ; X86-LABEL: test_mm_2intersect_epi32_p:
 366 ; X86:       # %bb.0: # %entry
 367 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 368 ; X86-NEXT:    .cfi_def_cfa_offset 8
 369 ; X86-NEXT:    .cfi_offset %esi, -8
 370 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 371 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 372 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 373 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 374 ; X86-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
 375 ; X86-NEXT:    vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
 376 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 377 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 378 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 379 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 380 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 381 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 382 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 383 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 384 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 385 ; X86-NEXT:    .cfi_def_cfa_offset 4
 386 ; X86-NEXT:    retl # encoding: [0xc3]
 387 ;
 388 ; X64-LABEL: test_mm_2intersect_epi32_p:
 389 ; X64:       # %bb.0: # %entry
 390 ; X64-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
 391 ; X64-NEXT:    vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
 392 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 393 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 394 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 395 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 396 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 397 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 398 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 399 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 400 ; X64-NEXT:    retq # encoding: [0xc3]
 401 entry:
       ; Both vector operands are read through pointer casts with 16-byte
       ; alignment.
 402   %0 = bitcast <2 x i64>* %a to <4 x i32>*
 403   %1 = load <4 x i32>, <4 x i32>* %0, align 16
 404   %2 = bitcast <2 x i64>* %b to <4 x i32>*
 405   %3 = load <4 x i32>, <4 x i32>* %2, align 16
 406   %4 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %1, <4 x i32> %3)
 407   %5 = extractvalue { <4 x i1>, <4 x i1> } %4, 0
       ; Shuffle indices 4-7 select from the zeroinitializer operand, so the
       ; upper four lanes of the widened mask are zero.
 408   %6 = shufflevector <4 x i1> %5, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 409   %7 = bitcast <8 x i1> %6 to i8
 410   store i8 %7, i8* %m0, align 1
 411   %8 = extractvalue { <4 x i1>, <4 x i1> } %4, 1
 412   %9 = shufflevector <4 x i1> %8, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 413   %10 = bitcast <8 x i1> %9 to i8
 414   store i8 %10, i8* %m1, align 1
 415   ret void
 416 }
 417
 ; 128-bit qword form with both operands loaded from memory: loads
 ; <2 x i64> from %a and %b (align 16), calls
 ; @llvm.x86.avx512.vp2intersect.q.128, widens each <2 x i1> result to
 ; <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
 ; The expected code loads %a with vmovaps, uses %b as the memory operand of
 ; vp2intersectq, and applies kshiftlw/kshiftrw $14 to each mask before the
 ; byte store. The X86 checks also expect %esi to be pushed and popped.
 418 define void @test_mm_2intersect_epi64_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
 419 ; X86-LABEL: test_mm_2intersect_epi64_p:
 420 ; X86:       # %bb.0: # %entry
 421 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 422 ; X86-NEXT:    .cfi_def_cfa_offset 8
 423 ; X86-NEXT:    .cfi_offset %esi, -8
 424 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 425 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 426 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 427 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 428 ; X86-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
 429 ; X86-NEXT:    vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
 430 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 431 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 432 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 433 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 434 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 435 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 436 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 437 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 438 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 439 ; X86-NEXT:    .cfi_def_cfa_offset 4
 440 ; X86-NEXT:    retl # encoding: [0xc3]
 441 ;
 442 ; X64-LABEL: test_mm_2intersect_epi64_p:
 443 ; X64:       # %bb.0: # %entry
 444 ; X64-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
 445 ; X64-NEXT:    vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
 446 ; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 447 ; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 448 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 449 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 450 ; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 451 ; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 452 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 453 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 454 ; X64-NEXT:    retq # encoding: [0xc3]
 455 entry:
 456   %0 = load <2 x i64>, <2 x i64>* %a, align 16
 457   %1 = load <2 x i64>, <2 x i64>* %b, align 16
 458   %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %0, <2 x i64> %1)
 459   %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
       ; With 2-element inputs, shuffle indices 2 and 3 refer to the
       ; zeroinitializer operand, so lanes 2-7 of the widened mask are zero.
 460   %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 461   %5 = bitcast <8 x i1> %4 to i8
 462   store i8 %5, i8* %m0, align 1
 463   %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
 464   %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
 465   %8 = bitcast <8 x i1> %7 to i8
 466   store i8 %8, i8* %m1, align 1
 467   ret void
 468 }
 469
 ; 128-bit dword form with broadcast operands: loads one i32 from each of %a
 ; and %b, splats each across a <4 x i32>, calls
 ; @llvm.x86.avx512.vp2intersect.d.128, widens each <4 x i1> result to
 ; <8 x i1>, bitcasts it to i8 and stores it via %m0 / %m1.
 ; The expected code materializes the %a splat with vbroadcastss, passes %b
 ; to vp2intersectd as a {1to4} broadcast memory operand, and applies
 ; kshiftlw/kshiftrw $12 to each mask before the byte store.
 470 define void @test_mm_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
 471 ; X86-LABEL: test_mm_2intersect_epi32_b:
 472 ; X86:       # %bb.0: # %entry
 473 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 474 ; X86-NEXT:    .cfi_def_cfa_offset 8
 475 ; X86-NEXT:    .cfi_offset %esi, -8
 476 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 477 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 478 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 479 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 480 ; X86-NEXT:    vbroadcastss (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x06]
 481 ; X86-NEXT:    vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
 482 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 483 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 484 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 485 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 486 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 487 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 488 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 489 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 490 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 491 ; X86-NEXT:    .cfi_def_cfa_offset 4
 492 ; X86-NEXT:    retl # encoding: [0xc3]
 493 ;
 494 ; X64-LABEL: test_mm_2intersect_epi32_b:
 495 ; X64:       # %bb.0: # %entry
 496 ; X64-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
 497 ; X64-NEXT:    vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
 498 ; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 499 ; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
 500 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 501 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 502 ; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 503 ; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 504 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 505 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 506 ; X64-NEXT:    retq # encoding: [0xc3]
 507 entry:
       ; Splat idiom: insert the scalar into lane 0, then shuffle with an
       ; all-zero mask so every lane reads lane 0.
 508   %0 = load i32, i32* %a, align 4
 509   %vecinit.i.i = insertelement <4 x i32> undef, i32 %0, i32 0
 510   %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
 511   %1 = load i32, i32* %b, align 4
 512   %vecinit.i.i2 = insertelement <4 x i32> undef, i32 %1, i32 0
 513   %vecinit3.i.i3 = shufflevector <4 x i32> %vecinit.i.i2, <4 x i32> undef, <4 x i32> zeroinitializer
 514   %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %vecinit3.i.i, <4 x i32> %vecinit3.i.i3)
 515   %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
       ; Shuffle indices 4-7 select from the zeroinitializer operand, so the
       ; upper four lanes of the widened mask are zero.
 516   %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 517   %5 = bitcast <8 x i1> %4 to i8
 518   store i8 %5, i8* %m0, align 1
 519   %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
 520   %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 521   %8 = bitcast <8 x i1> %7 to i8
 522   store i8 %8, i8* %m1, align 1
 523   ret void
 524 }
 525
 526 define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) { ; codegen test: both vp2intersect.q.128 operands are splats of an i64 loaded from memory; each result mask is widened to 8 bits and stored as one byte
 527 ; X86-LABEL: test_mm_2intersect_epi64_b:
 528 ; X86:       # %bb.0: # %entry
 529 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 530 ; X86-NEXT:    .cfi_def_cfa_offset 8
 531 ; X86-NEXT:    .cfi_offset %esi, -8
 532 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
 533 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 534 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 535 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
 536 ; X86-NEXT:    vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06]
 537 ; X86-NEXT:    # xmm0 = mem[0,0]
 538 ; X86-NEXT:    vmovddup (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x0a]
 539 ; X86-NEXT:    # xmm1 = mem[0,0]
 540 ; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
 541 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 542 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 543 ; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
 544 ; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
 545 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 546 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 547 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 548 ; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
 549 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 550 ; X86-NEXT:    .cfi_def_cfa_offset 4
 551 ; X86-NEXT:    retl # encoding: [0xc3]
 552 ;
 553 ; X64-LABEL: test_mm_2intersect_epi64_b:
 554 ; X64:       # %bb.0: # %entry
 555 ; X64-NEXT:    vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07]
 556 ; X64-NEXT:    # xmm0 = mem[0,0]
 557 ; X64-NEXT:    vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
 558 ; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 559 ; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
 560 ; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 561 ; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
 562 ; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 563 ; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
 564 ; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 565 ; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
 566 ; X64-NEXT:    retq # encoding: [0xc3]
 567 entry:
 568   %0 = load i64, i64* %a, align 8 ; scalar i64 read through %a
 569   %vecinit.i.i = insertelement <2 x i64> undef, i64 %0, i32 0 ; put the scalar in lane 0, lane 1 left undef
 570   %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer ; all-zero mask copies lane 0 into both lanes (splat of *%a)
 571   %1 = load i64, i64* %b, align 8 ; scalar i64 read through %b; the 64-bit run expects it folded into the {1to2} operand, the 32-bit run expects a second vmovddup
 572   %vecinit.i.i2 = insertelement <2 x i64> undef, i64 %1, i32 0 ; put the scalar in lane 0, lane 1 left undef
 573   %vecinit1.i.i3 = shufflevector <2 x i64> %vecinit.i.i2, <2 x i64> undef, <2 x i32> zeroinitializer ; all-zero mask copies lane 0 into both lanes (splat of *%b)
 574   %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %vecinit1.i.i, <2 x i64> %vecinit1.i.i3) ; intrinsic under test; yields a pair of 2-lane i1 masks
 575   %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0 ; first mask of the pair
 576   %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> ; widen to 8 lanes: indices 0-1 take the mask, indices 2-3 take the zero vector, so lanes 2-7 are zero
 577   %5 = bitcast <8 x i1> %4 to i8 ; pack the 8 i1 lanes into a single byte
 578   store i8 %5, i8* %m0, align 1 ; first mask byte goes to %m0
 579   %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1 ; second mask of the pair
 580   %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> ; same widening as for the first mask: lanes 2-7 come from the zero vector
 581   %8 = bitcast <8 x i1> %7 to i8 ; pack the 8 i1 lanes into a single byte
 582   store i8 %8, i8* %m1, align 1 ; second mask byte goes to %m1
 583   ret void
 584 }
 585
 586 declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>) ; two 8 x i32 operands -> pair of 8-lane i1 masks
 587 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>) ; two 4 x i64 operands -> pair of 4-lane i1 masks
 588 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>) ; two 4 x i32 operands -> pair of 4-lane i1 masks
 589 declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>) ; two 2 x i64 operands -> pair of 2-lane i1 masks