; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64
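
; This file tests the AVX-VNNI-INT8 dot-product intrinsics in their 128-bit
; and 256-bit forms, checking both instruction selection and the VEX
; encodings on 32-bit and 64-bit targets. Each test calls its intrinsic
; twice, once with a memory operand loaded from %x2p and once with a
; register operand, so both addressing forms of the instruction are covered.
;
; VPDPBSSD multiplies groups of four adjacent pairs of signed bytes in the
; first source with the corresponding signed bytes in the second source and
; accumulates the dword sums into the destination; VPDPBSSDS is the same
; operation with signed saturation on the accumulation.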
declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x18]
; X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X64-NEXT:    vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x1f]
; X64-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <4 x i32>, ptr %x2p
  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
  %res = add <4 x i32> %1, %2
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbssds_128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X86-NEXT:    vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x18]
; X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbssds_128:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X64-NEXT:    vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x1f]
; X64-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <4 x i32>, ptr %x2p
  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
  %res = add <4 x i32> %1, %2
  ret <4 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbssd_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X86-NEXT:    vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x18]
; X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbssd_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X64-NEXT:    vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x1f]
; X64-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <8 x i32>, ptr %x2p
  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
  %res = add <8 x i32> %1, %2
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x18]
; X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X64-NEXT:    vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x1f]
; X64-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <8 x i32>, ptr %x2p
  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
  %res = add <8 x i32> %1, %2
  ret <8 x i32> %res
}

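; VPDPBSUD/VPDPBSUDS perform the same dot-product accumulation but multiply
; signed bytes from the first source with unsigned bytes from the second
; source, the "s" suffix again selecting signed saturation.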
declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x18]
; X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X64-NEXT:    vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x1f]
; X64-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <4 x i32>, ptr %x2p
  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
  %res = add <4 x i32> %1, %2
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X86-NEXT:    vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x18]
; X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X64-NEXT:    vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x1f]
; X64-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <4 x i32>, ptr %x2p
  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
  %res = add <4 x i32> %1, %2
  ret <4 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbsud_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X86-NEXT:    vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x18]
; X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X64-NEXT:    vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x1f]
; X64-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <8 x i32>, ptr %x2p
  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
  %res = add <8 x i32> %1, %2
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x18]
; X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X64-NEXT:    vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x1f]
; X64-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <8 x i32>, ptr %x2p
  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
  %res = add <8 x i32> %1, %2
  ret <8 x i32> %res
}

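; VPDPBUUD/VPDPBUUDS multiply unsigned bytes from both sources, without and
; with unsigned saturation on the dword accumulation.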
declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x18]
; X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X64-NEXT:    vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x1f]
; X64-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <4 x i32>, ptr %x2p
  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
  %res = add <4 x i32> %1, %2
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X86-NEXT:    vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x18]
; X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
; X64-NEXT:    vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x1f]
; X64-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <4 x i32>, ptr %x2p
  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
  %res = add <4 x i32> %1, %2
  ret <4 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X86-NEXT:    vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x18]
; X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X64-NEXT:    vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x1f]
; X64-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <8 x i32>, ptr %x2p
  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
  %res = add <8 x i32> %1, %2
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)

define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) {
; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x18]
; X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
; X64-NEXT:    vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x1f]
; X64-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
  %x2 = load <8 x i32>, ptr %x2p
  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
  %res = add <8 x i32> %1, %2
  ret <8 x i32> %res
}