; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64
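
; Each test below calls an llvm.x86.avx512.vp2intersect.{d,q}.{128,256} intrinsic
; and stores both result masks through %m0 and %m1, checking selection of the
; vp2intersectd/vp2intersectq instructions (register, memory, and broadcast forms)
; and the handling of the resulting k-register pair.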
define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT: movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = bitcast <4 x i64> %a to <8 x i32>
  %1 = bitcast <4 x i64> %b to <8 x i32>
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  %1 = extractvalue { <4 x i1>, <4 x i1> } %0, 0
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, ptr %m0, align 1
  %4 = extractvalue { <4 x i1>, <4 x i1> } %0, 1
  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, ptr %m1, align 1
  ret void
}

define void @test_mm256_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_p:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT: vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02]
; X86-NEXT: vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; X64-NEXT: vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT: movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load <8 x i32>, ptr %a, align 32
  %1 = load <8 x i32>, ptr %b, align 32
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

define void @test_mm256_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_p:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06]
; X86-NEXT: vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; X64-NEXT: vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load <4 x i64>, ptr %a, align 32
  %1 = load <4 x i64>, ptr %b, align 32
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %0, <4 x i64> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

define void @test_mm256_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_b:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT: vbroadcastss (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x02]
; X86-NEXT: vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07]
; X64-NEXT: vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT: movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i32, ptr %a, align 4
  %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i32 0
  %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %1 = load i32, ptr %b, align 4
  %vecinit.i.i2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %vecinit7.i.i3 = shufflevector <8 x i32> %vecinit.i.i2, <8 x i32> undef, <8 x i32> zeroinitializer
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %vecinit7.i.i, <8 x i32> %vecinit7.i.i3)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  store <8 x i1> %3, ptr %m0, align 8
  %4 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  store <8 x i1> %4, ptr %m1, align 8
  ret void
}

define void @test_mm256_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_b:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06]
; X86-NEXT: vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x07]
; X64-NEXT: vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i64, ptr %a, align 8
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %1 = load i64, ptr %b, align 8
  %vecinit.i.i2 = insertelement <4 x i64> undef, i64 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i64> %vecinit.i.i2, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %vecinit3.i.i, <4 x i64> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x i64> %a to <4 x i32>
  %1 = bitcast <2 x i64> %b to <4 x i32>
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
  %1 = extractvalue { <2 x i1>, <2 x i1> } %0, 0
  %2 = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, ptr %m0, align 1
  %4 = extractvalue { <2 x i1>, <2 x i1> } %0, 1
  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, ptr %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_p:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-NEXT: vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-NEXT: vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load <4 x i32>, ptr %a, align 16
  %1 = load <4 x i32>, ptr %b, align 16
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_p:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-NEXT: vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_p:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-NEXT: vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load <2 x i64>, ptr %a, align 16
  %1 = load <2 x i64>, ptr %b, align 16
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %0, <2 x i64> %1)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_b:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vbroadcastss (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x06]
; X86-NEXT: vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-NEXT: vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT: kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i32, ptr %a, align 4
  %vecinit.i.i = insertelement <4 x i32> undef, i32 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = load i32, ptr %b, align 4
  %vecinit.i.i2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i32> %vecinit.i.i2, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %vecinit3.i.i, <4 x i32> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

define void @test_mm_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_b:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi # encoding: [0x56]
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT: vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06]
; X86-NEXT: # xmm0 = mem[0,0]
; X86-NEXT: vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT: movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_b:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07]
; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT: movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT: kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT: kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = load i64, ptr %a, align 8
  %vecinit.i.i = insertelement <2 x i64> undef, i64 %0, i32 0
  %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %1 = load i64, ptr %b, align 8
  %vecinit.i.i2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %vecinit1.i.i3 = shufflevector <2 x i64> %vecinit.i.i2, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %vecinit1.i.i, <2 x i64> %vecinit1.i.i3)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, ptr %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, ptr %m1, align 1
  ret void
}

declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)