test/CodeGen/X86/avx2-intrinsics-fast-isel.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
   3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
   4
   5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
   6
   7 define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) { ; per-i8 absolute value spelled as select(x > 0, x, 0 - x)
   8 ; CHECK-LABEL: test_mm256_abs_epi8:
   9 ; CHECK:       # %bb.0:
  10 ; CHECK-NEXT:    vpabsb %ymm0, %ymm0
  11 ; CHECK-NEXT:    ret{{[l|q]}}
  12   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
  13   %neg = sub <32 x i8> zeroinitializer, %bytes
  14   %is_pos = icmp sgt <32 x i8> %bytes, zeroinitializer
  15   %abs = select <32 x i1> %is_pos, <32 x i8> %bytes, <32 x i8> %neg
  16   %out = bitcast <32 x i8> %abs to <4 x i64>
  17   ret <4 x i64> %out
  18 }
  19 declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone ; not called by the function above
  20
  21 define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) { ; per-i16 absolute value spelled as select(x > 0, x, 0 - x)
  22 ; CHECK-LABEL: test_mm256_abs_epi16:
  23 ; CHECK:       # %bb.0:
  24 ; CHECK-NEXT:    vpabsw %ymm0, %ymm0
  25 ; CHECK-NEXT:    ret{{[l|q]}}
  26   %words = bitcast <4 x i64> %a0 to <16 x i16>
  27   %neg = sub <16 x i16> zeroinitializer, %words
  28   %is_pos = icmp sgt <16 x i16> %words, zeroinitializer
  29   %abs = select <16 x i1> %is_pos, <16 x i16> %words, <16 x i16> %neg
  30   %out = bitcast <16 x i16> %abs to <4 x i64>
  31   ret <4 x i64> %out
  32 }
  33 declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone ; not called by the function above
  34
  35 define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) { ; per-i32 absolute value spelled as select(x > 0, x, 0 - x)
  36 ; CHECK-LABEL: test_mm256_abs_epi32:
  37 ; CHECK:       # %bb.0:
  38 ; CHECK-NEXT:    vpabsd %ymm0, %ymm0
  39 ; CHECK-NEXT:    ret{{[l|q]}}
  40   %dwords = bitcast <4 x i64> %a0 to <8 x i32>
  41   %neg = sub <8 x i32> zeroinitializer, %dwords
  42   %is_pos = icmp sgt <8 x i32> %dwords, zeroinitializer
  43   %abs = select <8 x i1> %is_pos, <8 x i32> %dwords, <8 x i32> %neg
  44   %out = bitcast <8 x i32> %abs to <4 x i64>
  45   ret <4 x i64> %out
  46 }
  47 declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone ; not called by the function above
  48
  49 define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; wrapping add on 32 x i8 lanes
  50 ; CHECK-LABEL: test_mm256_add_epi8:
  51 ; CHECK:       # %bb.0:
  52 ; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
  53 ; CHECK-NEXT:    ret{{[l|q]}}
  54   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  55   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  56   %sum = add <32 x i8> %lhs, %rhs
  57   %out = bitcast <32 x i8> %sum to <4 x i64>
  58   ret <4 x i64> %out
  59 }
  60
  61 define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; wrapping add on 16 x i16 lanes
  62 ; CHECK-LABEL: test_mm256_add_epi16:
  63 ; CHECK:       # %bb.0:
  64 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
  65 ; CHECK-NEXT:    ret{{[l|q]}}
  66   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  67   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  68   %sum = add <16 x i16> %lhs, %rhs
  69   %out = bitcast <16 x i16> %sum to <4 x i64>
  70   ret <4 x i64> %out
  71 }
  72
  73 define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; wrapping add on 8 x i32 lanes
  74 ; CHECK-LABEL: test_mm256_add_epi32:
  75 ; CHECK:       # %bb.0:
  76 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
  77 ; CHECK-NEXT:    ret{{[l|q]}}
  78   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  79   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  80   %sum = add <8 x i32> %lhs, %rhs
  81   %out = bitcast <8 x i32> %sum to <4 x i64>
  82   ret <4 x i64> %out
  83 }
  84
  85 define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; wrapping add directly on the 4 x i64 arguments
  86 ; CHECK-LABEL: test_mm256_add_epi64:
  87 ; CHECK:       # %bb.0:
  88 ; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
  89 ; CHECK-NEXT:    ret{{[l|q]}}
  90   %sum = add <4 x i64> %a0, %a1
  91   ret <4 x i64> %sum
  92 }
  93
  94 define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; signed saturating add on i8 lanes via llvm.sadd.sat
  95 ; CHECK-LABEL: test_mm256_adds_epi8:
  96 ; CHECK:       # %bb.0:
  97 ; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
  98 ; CHECK-NEXT:    ret{{[l|q]}}
  99   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 100   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 101   %sat = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %lhs, <32 x i8> %rhs)
 102   %out = bitcast <32 x i8> %sat to <4 x i64>
 103   ret <4 x i64> %out
 104 }
 105 declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
 106
 107 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; signed saturating add on i16 lanes via llvm.sadd.sat
 108 ; CHECK-LABEL: test_mm256_adds_epi16:
 109 ; CHECK:       # %bb.0:
 110 ; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
 111 ; CHECK-NEXT:    ret{{[l|q]}}
 112   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 113   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 114   %sat = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %lhs, <16 x i16> %rhs)
 115   %out = bitcast <16 x i16> %sat to <4 x i64>
 116   ret <4 x i64> %out
 117 }
 118 declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
 119
 120 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; unsigned saturating add on i8 lanes via llvm.uadd.sat
 121 ; CHECK-LABEL: test_mm256_adds_epu8:
 122 ; CHECK:       # %bb.0:
 123 ; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
 124 ; CHECK-NEXT:    ret{{[l|q]}}
 125   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 126   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 127   %sat = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %lhs, <32 x i8> %rhs)
 128   %out = bitcast <32 x i8> %sat to <4 x i64>
 129   ret <4 x i64> %out
 130 }
 131 declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
 132
 133 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; unsigned saturating add on i16 lanes via llvm.uadd.sat
 134 ; CHECK-LABEL: test_mm256_adds_epu16:
 135 ; CHECK:       # %bb.0:
 136 ; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
 137 ; CHECK-NEXT:    ret{{[l|q]}}
 138   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 139   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 140   %sat = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %lhs, <16 x i16> %rhs)
 141   %out = bitcast <16 x i16> %sat to <4 x i64>
 142   ret <4 x i64> %out
 143 }
 144 declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
 145
 146 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; two-source byte shuffle: each 16-byte half takes 14 bytes of %a0 then 2 bytes of %a1
 147 ; CHECK-LABEL: test_mm256_alignr_epi8:
 148 ; CHECK:       # %bb.0:
 149 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
 150 ; CHECK-NEXT:    ret{{[l|q]}}
 151   %lo = bitcast <4 x i64> %a0 to <32 x i8>
 152   %hi = bitcast <4 x i64> %a1 to <32 x i8>
 153   %aligned = shufflevector <32 x i8> %lo, <32 x i8> %hi, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
 154   %out = bitcast <32 x i8> %aligned to <4 x i64>
 155   ret <4 x i64> %out
 156 }
 157
 158 define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; two-source byte shuffle: each 16-byte half takes 15 bytes of %a0 then 1 byte of %a1
 159 ; CHECK-LABEL: test2_mm256_alignr_epi8:
 160 ; CHECK:       # %bb.0:
 161 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
 162 ; CHECK-NEXT:    ret{{[l|q]}}
 163   %lo = bitcast <4 x i64> %a0 to <32 x i8>
 164   %hi = bitcast <4 x i64> %a1 to <32 x i8>
 165   %aligned = shufflevector <32 x i8> %lo, <32 x i8> %hi, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
 166   %out = bitcast <32 x i8> %aligned to <4 x i64>
 167   ret <4 x i64> %out
 168 }
 169
 170 define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; bitwise AND of the two 256-bit arguments
 171 ; CHECK-LABEL: test_mm256_and_si256:
 172 ; CHECK:       # %bb.0:
 173 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
 174 ; CHECK-NEXT:    ret{{[l|q]}}
 175   %masked = and <4 x i64> %a0, %a1
 176   ret <4 x i64> %masked
 177 }
 178
 179 define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; (~%a0) & %a1, with the NOT written as xor against all-ones
 180 ; CHECK-LABEL: test_mm256_andnot_si256:
 181 ; CHECK:       # %bb.0:
 182 ; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 183 ; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 184 ; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
 185 ; CHECK-NEXT:    ret{{[l|q]}}
 186   %inverted = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
 187   %masked = and <4 x i64> %inverted, %a1
 188   ret <4 x i64> %masked
 189 }
 190
 191 define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; calls the llvm.x86.avx2.pavg.b intrinsic on 32 x i8 lanes
 192 ; CHECK-LABEL: test_mm256_avg_epu8:
 193 ; CHECK:       # %bb.0:
 194 ; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
 195 ; CHECK-NEXT:    ret{{[l|q]}}
 196   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 197   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 198   %avg = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %lhs, <32 x i8> %rhs)
 199   %out = bitcast <32 x i8> %avg to <4 x i64>
 200   ret <4 x i64> %out
 201 }
 202 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 203
 204 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; calls the llvm.x86.avx2.pavg.w intrinsic on 16 x i16 lanes
 205 ; CHECK-LABEL: test_mm256_avg_epu16:
 206 ; CHECK:       # %bb.0:
 207 ; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
 208 ; CHECK-NEXT:    ret{{[l|q]}}
 209   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 210   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 211   %avg = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %lhs, <16 x i16> %rhs)
 212   %out = bitcast <16 x i16> %avg to <4 x i64>
 213   ret <4 x i64> %out
 214 }
 215 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 216
 217 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; i16 blend as a shuffle: elements 1 and 9 come from %a1, the rest from %a0
 218 ; CHECK-LABEL: test_mm256_blend_epi16:
 219 ; CHECK:       # %bb.0:
 220 ; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
 221 ; CHECK-NEXT:    ret{{[l|q]}}
 222   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 223   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 224   %blend = shufflevector <16 x i16> %lhs, <16 x i16> %rhs, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 225   %out = bitcast <16 x i16> %blend to <4 x i64>
 226   ret <4 x i64> %out
 227 }
 228
 229 define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; i32 blend as a shuffle: elements 0 and 2 from %a1, 1 and 3 from %a0
 230 ; CHECK-LABEL: test_mm_blend_epi32:
 231 ; CHECK:       # %bb.0:
 232 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 233 ; CHECK-NEXT:    ret{{[l|q]}}
 234   %lhs = bitcast <2 x i64> %a0 to <4 x i32>
 235   %rhs = bitcast <2 x i64> %a1 to <4 x i32>
 236   %blend = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
 237   %out = bitcast <4 x i32> %blend to <2 x i64>
 238   ret <2 x i64> %out
 239 }
 240
 241 define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; i32 blend as a shuffle: elements 0, 2, 4, 5 from %a1, the rest from %a0
 242 ; CHECK-LABEL: test_mm256_blend_epi32:
 243 ; CHECK:       # %bb.0:
 244 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 245 ; CHECK-NEXT:    ret{{[l|q]}}
 246   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 247   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 248   %blend = shufflevector <8 x i32> %lhs, <8 x i32> %rhs, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
 249   %out = bitcast <8 x i32> %blend to <4 x i64>
 250   ret <4 x i64> %out
 251 }
 252
 253 define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) { ; calls the llvm.x86.avx2.pblendvb intrinsic with %a2 as the third operand
 254 ; CHECK-LABEL: test_mm256_blendv_epi8:
 255 ; CHECK:       # %bb.0:
 256 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 257 ; CHECK-NEXT:    ret{{[l|q]}}
 258   %src0 = bitcast <4 x i64> %a0 to <32 x i8>
 259   %src1 = bitcast <4 x i64> %a1 to <32 x i8>
 260   %mask = bitcast <4 x i64> %a2 to <32 x i8>
 261   %blend = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %src0, <32 x i8> %src1, <32 x i8> %mask)
 262   %out = bitcast <32 x i8> %blend to <4 x i64>
 263   ret <4 x i64> %out
 264 }
 265 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
 266
 267 define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) { ; splat of i8 element 0 across 16 lanes (all-zero shuffle mask)
 268 ; CHECK-LABEL: test_mm_broadcastb_epi8:
 269 ; CHECK:       # %bb.0:
 270 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
 271 ; CHECK-NEXT:    ret{{[l|q]}}
 272   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 273   %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <16 x i32> zeroinitializer
 274   %out = bitcast <16 x i8> %splat to <2 x i64>
 275   ret <2 x i64> %out
 276 }
 277
 278 define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) { ; splat of i8 element 0 across 32 lanes (all-zero shuffle mask)
 279 ; CHECK-LABEL: test_mm256_broadcastb_epi8:
 280 ; CHECK:       # %bb.0:
 281 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
 282 ; CHECK-NEXT:    ret{{[l|q]}}
 283   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
 284   %splat = shufflevector <32 x i8> %bytes, <32 x i8> undef, <32 x i32> zeroinitializer
 285   %out = bitcast <32 x i8> %splat to <4 x i64>
 286   ret <4 x i64> %out
 287 }
 288
 289 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { ; splat of i32 element 0 across 4 lanes (all-zero shuffle mask)
 290 ; CHECK-LABEL: test_mm_broadcastd_epi32:
 291 ; CHECK:       # %bb.0:
 292 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 293 ; CHECK-NEXT:    ret{{[l|q]}}
 294   %dwords = bitcast <2 x i64> %a0 to <4 x i32>
 295   %splat = shufflevector <4 x i32> %dwords, <4 x i32> undef, <4 x i32> zeroinitializer
 296   %out = bitcast <4 x i32> %splat to <2 x i64>
 297   ret <2 x i64> %out
 298 }
 299
 300 define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) { ; splat of i32 element 0 across 8 lanes (all-zero shuffle mask)
 301 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
 302 ; CHECK:       # %bb.0:
 303 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 304 ; CHECK-NEXT:    ret{{[l|q]}}
 305   %dwords = bitcast <4 x i64> %a0 to <8 x i32>
 306   %splat = shufflevector <8 x i32> %dwords, <8 x i32> undef, <8 x i32> zeroinitializer
 307   %out = bitcast <8 x i32> %splat to <4 x i64>
 308   ret <4 x i64> %out
 309 }
 310
 311 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { ; splat of i64 element 0 across 2 lanes (all-zero shuffle mask)
 312 ; CHECK-LABEL: test_mm_broadcastq_epi64:
 313 ; CHECK:       # %bb.0:
 314 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 315 ; CHECK-NEXT:    ret{{[l|q]}}
 316   %splat = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
 317   ret <2 x i64> %splat
 318 }
 319
 320 define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) { ; splat of i64 element 0 across 4 lanes (all-zero shuffle mask)
 321 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
 322 ; CHECK:       # %bb.0:
 323 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 324 ; CHECK-NEXT:    ret{{[l|q]}}
 325   %splat = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
 326   ret <4 x i64> %splat
 327 }
 328
 329 define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) { ; splat of double element 0 across 2 lanes (all-zero shuffle mask)
 330 ; CHECK-LABEL: test_mm_broadcastsd_pd:
 331 ; CHECK:       # %bb.0:
 332 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 333 ; CHECK-NEXT:    ret{{[l|q]}}
 334   %splat = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
 335   ret <2 x double> %splat
 336 }
 337
 338 define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) { ; splat of double element 0 across 4 lanes (all-zero shuffle mask)
 339 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
 340 ; CHECK:       # %bb.0:
 341 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 342 ; CHECK-NEXT:    ret{{[l|q]}}
 343   %splat = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
 344   ret <4 x double> %splat
 345 }
 346
 347 define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) { ; repeats the 128-bit register argument in both halves (mask 0,1,0,1)
 348 ; CHECK-LABEL: test_mm256_broadcastsi128_si256:
 349 ; CHECK:       # %bb.0:
 350 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 351 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 352 ; CHECK-NEXT:    ret{{[l|q]}}
 353   %dup = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 354   ret <4 x i64> %dup
 355 }
 356
 357 define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) { ; same 0,1,0,1 duplication, but the 128-bit source is loaded from %p0
 358 ; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
 359 ; X86:       # %bb.0:
 360 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 361 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 362 ; X86-NEXT:    retl
 363 ;
 364 ; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
 365 ; X64:       # %bb.0:
 366 ; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 367 ; X64-NEXT:    retq
 368   %loaded = load <2 x i64>, <2 x i64>* %p0
 369   %dup = shufflevector <2 x i64> %loaded, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 370   ret <4 x i64> %dup
 371 }
 372
 373 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { ; splat of float element 0 across 4 lanes (all-zero shuffle mask)
 374 ; CHECK-LABEL: test_mm_broadcastss_ps:
 375 ; CHECK:       # %bb.0:
 376 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 377 ; CHECK-NEXT:    ret{{[l|q]}}
 378   %splat = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
 379   ret <4 x float> %splat
 380 }
 381
 382 define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) { ; splat of float element 0 across 8 lanes (all-zero shuffle mask)
 383 ; CHECK-LABEL: test_mm256_broadcastss_ps:
 384 ; CHECK:       # %bb.0:
 385 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 386 ; CHECK-NEXT:    ret{{[l|q]}}
 387   %splat = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
 388   ret <8 x float> %splat
 389 }
 390
 391 define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) { ; splat of i16 element 0 across 8 lanes (all-zero shuffle mask)
 392 ; CHECK-LABEL: test_mm_broadcastw_epi16:
 393 ; CHECK:       # %bb.0:
 394 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
 395 ; CHECK-NEXT:    ret{{[l|q]}}
 396   %words = bitcast <2 x i64> %a0 to <8 x i16>
 397   %splat = shufflevector <8 x i16> %words, <8 x i16> undef, <8 x i32> zeroinitializer
 398   %out = bitcast <8 x i16> %splat to <2 x i64>
 399   ret <2 x i64> %out
 400 }
 401
 402 define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) { ; splat of i16 element 0 across 16 lanes (all-zero shuffle mask)
 403 ; CHECK-LABEL: test_mm256_broadcastw_epi16:
 404 ; CHECK:       # %bb.0:
 405 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
 406 ; CHECK-NEXT:    ret{{[l|q]}}
 407   %words = bitcast <4 x i64> %a0 to <16 x i16>
 408   %splat = shufflevector <16 x i16> %words, <16 x i16> undef, <16 x i32> zeroinitializer
 409   %out = bitcast <16 x i16> %splat to <4 x i64>
 410   ret <4 x i64> %out
 411 }
 412
 413 define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) { ; shuffle against a zero vector: each 16-byte half gets 3 zero bytes followed by 13 source bytes
 414 ; CHECK-LABEL: test_mm256_bslli_epi128:
 415 ; CHECK:       # %bb.0:
 416 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
 417 ; CHECK-NEXT:    ret{{[l|q]}}
 418   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
 419   %shifted = shufflevector <32 x i8> zeroinitializer, <32 x i8> %bytes, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
 420   %out = bitcast <32 x i8> %shifted to <4 x i64>
 421   ret <4 x i64> %out
 422 }
 423
 424 define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) { ; shuffle against a zero vector: each 16-byte half keeps source bytes 3..15 followed by 3 zero bytes
 425 ; CHECK-LABEL: test_mm256_bsrli_epi128:
 426 ; CHECK:       # %bb.0:
 427 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
 428 ; CHECK-NEXT:    ret{{[l|q]}}
 429   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
 430   %shifted = shufflevector <32 x i8> %bytes, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
 431   %out = bitcast <32 x i8> %shifted to <4 x i64>
 432   ret <4 x i64> %out
 433 }
 434
 435 define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; i8 equality compare, i1 results sign-extended to full-lane masks
 436 ; CHECK-LABEL: test_mm256_cmpeq_epi8:
 437 ; CHECK:       # %bb.0:
 438 ; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 439 ; CHECK-NEXT:    ret{{[l|q]}}
 440   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 441   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 442   %eq = icmp eq <32 x i8> %lhs, %rhs
 443   %mask = sext <32 x i1> %eq to <32 x i8>
 444   %out = bitcast <32 x i8> %mask to <4 x i64>
 445   ret <4 x i64> %out
 446 }
 447
 448 define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; i16 equality compare, i1 results sign-extended to full-lane masks
 449 ; CHECK-LABEL: test_mm256_cmpeq_epi16:
 450 ; CHECK:       # %bb.0:
 451 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 452 ; CHECK-NEXT:    ret{{[l|q]}}
 453   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 454   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 455   %eq = icmp eq <16 x i16> %lhs, %rhs
 456   %mask = sext <16 x i1> %eq to <16 x i16>
 457   %out = bitcast <16 x i16> %mask to <4 x i64>
 458   ret <4 x i64> %out
 459 }
 460
 461 define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; i32 equality compare, i1 results sign-extended to full-lane masks
 462 ; CHECK-LABEL: test_mm256_cmpeq_epi32:
 463 ; CHECK:       # %bb.0:
 464 ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
 465 ; CHECK-NEXT:    ret{{[l|q]}}
 466   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 467   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 468   %eq = icmp eq <8 x i32> %lhs, %rhs
 469   %mask = sext <8 x i1> %eq to <8 x i32>
 470   %out = bitcast <8 x i32> %mask to <4 x i64>
 471   ret <4 x i64> %out
 472 }
 473
 474 define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; i64 equality compare, i1 results sign-extended to full-lane masks
 475 ; CHECK-LABEL: test_mm256_cmpeq_epi64:
 476 ; CHECK:       # %bb.0:
 477 ; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 478 ; CHECK-NEXT:    ret{{[l|q]}}
 479   %eq = icmp eq <4 x i64> %a0, %a1
 480   %mask = sext <4 x i1> %eq to <4 x i64>
 481   ret <4 x i64> %mask
 482 }
 483
 484 define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; signed i8 greater-than, i1 results sign-extended to full-lane masks
 485 ; CHECK-LABEL: test_mm256_cmpgt_epi8:
 486 ; CHECK:       # %bb.0:
 487 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 488 ; CHECK-NEXT:    ret{{[l|q]}}
 489   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 490   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 491   %gt = icmp sgt <32 x i8> %lhs, %rhs
 492   %mask = sext <32 x i1> %gt to <32 x i8>
 493   %out = bitcast <32 x i8> %mask to <4 x i64>
 494   ret <4 x i64> %out
 495 }
 496
 497 define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; signed i16 greater-than, i1 results sign-extended to full-lane masks
 498 ; CHECK-LABEL: test_mm256_cmpgt_epi16:
 499 ; CHECK:       # %bb.0:
 500 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 501 ; CHECK-NEXT:    ret{{[l|q]}}
 502   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 503   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 504   %gt = icmp sgt <16 x i16> %lhs, %rhs
 505   %mask = sext <16 x i1> %gt to <16 x i16>
 506   %out = bitcast <16 x i16> %mask to <4 x i64>
 507   ret <4 x i64> %out
 508 }
 509
 510 define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; signed i32 greater-than, i1 results sign-extended to full-lane masks
 511 ; CHECK-LABEL: test_mm256_cmpgt_epi32:
 512 ; CHECK:       # %bb.0:
 513 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 514 ; CHECK-NEXT:    ret{{[l|q]}}
 515   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 516   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 517   %gt = icmp sgt <8 x i32> %lhs, %rhs
 518   %mask = sext <8 x i1> %gt to <8 x i32>
 519   %out = bitcast <8 x i32> %mask to <4 x i64>
 520   ret <4 x i64> %out
 521 }
 522
 523 define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; signed i64 greater-than, i1 results sign-extended to full-lane masks
 524 ; CHECK-LABEL: test_mm256_cmpgt_epi64:
 525 ; CHECK:       # %bb.0:
 526 ; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 527 ; CHECK-NEXT:    ret{{[l|q]}}
 528   %gt = icmp sgt <4 x i64> %a0, %a1
 529   %mask = sext <4 x i1> %gt to <4 x i64>
 530   ret <4 x i64> %mask
 531 }
 532
 533 define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) { ; sign-extends all 16 i8 elements to i16
 534 ; CHECK-LABEL: test_mm256_cvtepi8_epi16:
 535 ; CHECK:       # %bb.0:
 536 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
 537 ; CHECK-NEXT:    ret{{[l|q]}}
 538   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 539   %wide = sext <16 x i8> %bytes to <16 x i16>
 540   %out = bitcast <16 x i16> %wide to <4 x i64>
 541   ret <4 x i64> %out
 542 }
 543
 544 define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) { ; takes the low 8 i8 elements and sign-extends them to i32
 545 ; CHECK-LABEL: test_mm256_cvtepi8_epi32:
 546 ; CHECK:       # %bb.0:
 547 ; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
 548 ; CHECK-NEXT:    ret{{[l|q]}}
 549   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 550   %low = shufflevector <16 x i8> %bytes, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 551   %wide = sext <8 x i8> %low to <8 x i32>
 552   %out = bitcast <8 x i32> %wide to <4 x i64>
 553   ret <4 x i64> %out
 554 }
 555
 556 define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) { ; takes the low 4 i8 elements and sign-extends them to i64
 557 ; CHECK-LABEL: test_mm256_cvtepi8_epi64:
 558 ; CHECK:       # %bb.0:
 559 ; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
 560 ; CHECK-NEXT:    ret{{[l|q]}}
 561   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 562   %low = shufflevector <16 x i8> %bytes, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 563   %wide = sext <4 x i8> %low to <4 x i64>
 564   ret <4 x i64> %wide
 565 }
 566
 567 define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) { ; sign-extends all 8 i16 elements to i32
 568 ; CHECK-LABEL: test_mm256_cvtepi16_epi32:
 569 ; CHECK:       # %bb.0:
 570 ; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
 571 ; CHECK-NEXT:    ret{{[l|q]}}
 572   %words = bitcast <2 x i64> %a0 to <8 x i16>
 573   %wide = sext <8 x i16> %words to <8 x i32>
 574   %out = bitcast <8 x i32> %wide to <4 x i64>
 575   ret <4 x i64> %out
 576 }
 577
 578 define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) { ; takes the low 4 i16 elements and sign-extends them to i64
 579 ; CHECK-LABEL: test_mm256_cvtepi16_epi64:
 580 ; CHECK:       # %bb.0:
 581 ; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
 582 ; CHECK-NEXT:    ret{{[l|q]}}
 583   %words = bitcast <2 x i64> %a0 to <8 x i16>
 584   %low = shufflevector <8 x i16> %words, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 585   %wide = sext <4 x i16> %low to <4 x i64>
 586   ret <4 x i64> %wide
 587 }
 588
 589 define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) { ; sign-extends all 4 i32 elements to i64
 590 ; CHECK-LABEL: test_mm256_cvtepi32_epi64:
 591 ; CHECK:       # %bb.0:
 592 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
 593 ; CHECK-NEXT:    ret{{[l|q]}}
 594   %dwords = bitcast <2 x i64> %a0 to <4 x i32>
 595   %wide = sext <4 x i32> %dwords to <4 x i64>
 596   ret <4 x i64> %wide
 597 }
 598
 599 define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) { ; zero-extends all 16 i8 elements to i16
 600 ; CHECK-LABEL: test_mm256_cvtepu8_epi16:
 601 ; CHECK:       # %bb.0:
 602 ; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 603 ; CHECK-NEXT:    ret{{[l|q]}}
 604   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 605   %wide = zext <16 x i8> %bytes to <16 x i16>
 606   %out = bitcast <16 x i16> %wide to <4 x i64>
 607   ret <4 x i64> %out
 608 }
 609
 610 define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) { ; takes the low 8 i8 elements and zero-extends them to i32
 611 ; CHECK-LABEL: test_mm256_cvtepu8_epi32:
 612 ; CHECK:       # %bb.0:
 613 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 614 ; CHECK-NEXT:    ret{{[l|q]}}
 615   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 616   %low = shufflevector <16 x i8> %bytes, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 617   %wide = zext <8 x i8> %low to <8 x i32>
 618   %out = bitcast <8 x i32> %wide to <4 x i64>
 619   ret <4 x i64> %out
 620 }
 621
 622 define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) { ; takes the low 4 i8 elements and zero-extends them to i64
 623 ; CHECK-LABEL: test_mm256_cvtepu8_epi64:
 624 ; CHECK:       # %bb.0:
 625 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 626 ; CHECK-NEXT:    ret{{[l|q]}}
 627   %bytes = bitcast <2 x i64> %a0 to <16 x i8>
 628   %low = shufflevector <16 x i8> %bytes, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 629   %wide = zext <4 x i8> %low to <4 x i64>
 630   ret <4 x i64> %wide
 631 }
 632
 633 define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) { ; zero-extends all 8 i16 elements to i32
 634 ; CHECK-LABEL: test_mm256_cvtepu16_epi32:
 635 ; CHECK:       # %bb.0:
 636 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 637 ; CHECK-NEXT:    ret{{[l|q]}}
 638   %words = bitcast <2 x i64> %a0 to <8 x i16>
 639   %wide = zext <8 x i16> %words to <8 x i32>
 640   %out = bitcast <8 x i32> %wide to <4 x i64>
 641   ret <4 x i64> %out
 642 }
 643
 644 define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) { ; takes the low 4 i16 elements and zero-extends them to i64
 645 ; CHECK-LABEL: test_mm256_cvtepu16_epi64:
 646 ; CHECK:       # %bb.0:
 647 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 648 ; CHECK-NEXT:    ret{{[l|q]}}
 649   %words = bitcast <2 x i64> %a0 to <8 x i16>
 650   %low = shufflevector <8 x i16> %words, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 651   %wide = zext <4 x i16> %low to <4 x i64>
 652   ret <4 x i64> %wide
 653 }
 654
 655 define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) { ; zero-extends all 4 i32 elements to i64
 656 ; CHECK-LABEL: test_mm256_cvtepu32_epi64:
 657 ; CHECK:       # %bb.0:
 658 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 659 ; CHECK-NEXT:    ret{{[l|q]}}
 660   %dwords = bitcast <2 x i64> %a0 to <4 x i32>
 661   %wide = zext <4 x i32> %dwords to <4 x i64>
 662   ret <4 x i64> %wide
 663 }
 664
 665 define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind { ; selects i64 elements 2 and 3, i.e. the upper 128 bits of %a0
 666 ; CHECK-LABEL: test_mm256_extracti128_si256:
 667 ; CHECK:       # %bb.0:
 668 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 669 ; CHECK-NEXT:    vzeroupper
 670 ; CHECK-NEXT:    ret{{[l|q]}}
 671   %upper = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
 672   ret <2 x i64> %upper
 673 }
 674
 675 define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; calls the llvm.x86.avx2.phadd.w intrinsic on 16 x i16 lanes
 676 ; CHECK-LABEL: test_mm256_hadd_epi16:
 677 ; CHECK:       # %bb.0:
 678 ; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 679 ; CHECK-NEXT:    ret{{[l|q]}}
 680   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 681   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 682   %hsum = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %lhs, <16 x i16> %rhs)
 683   %out = bitcast <16 x i16> %hsum to <4 x i64>
 684   ret <4 x i64> %out
 685 }
 686 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
 687
 688 define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; calls the llvm.x86.avx2.phadd.d intrinsic on 8 x i32 lanes
 689 ; CHECK-LABEL: test_mm256_hadd_epi32:
 690 ; CHECK:       # %bb.0:
 691 ; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 692 ; CHECK-NEXT:    ret{{[l|q]}}
 693   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 694   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 695   %hsum = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %lhs, <8 x i32> %rhs)
 696   %out = bitcast <8 x i32> %hsum to <4 x i64>
 697   ret <4 x i64> %out
 698 }
 699 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
 700
 701 define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; calls the llvm.x86.avx2.phadd.sw intrinsic on 16 x i16 lanes
 702 ; CHECK-LABEL: test_mm256_hadds_epi16:
 703 ; CHECK:       # %bb.0:
 704 ; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
 705 ; CHECK-NEXT:    ret{{[l|q]}}
 706   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 707   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 708   %hsum = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %lhs, <16 x i16> %rhs)
 709   %out = bitcast <16 x i16> %hsum to <4 x i64>
 710   ret <4 x i64> %out
 711 }
 712 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
 713
 714 define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Reinterprets both inputs as <16 x i16>, calls llvm.x86.avx2.phsub.w and casts
     ; the result back to <4 x i64>. Both targets are expected to emit one vphsubw.
 715 ; CHECK-LABEL: test_mm256_hsub_epi16:
 716 ; CHECK:       # %bb.0:
 717 ; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
 718 ; CHECK-NEXT:    ret{{[l|q]}}
 719   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 720   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 721   %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
 722   %bc = bitcast <16 x i16> %res to <4 x i64>
 723   ret <4 x i64> %bc
 724 }
 725 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
 726
 727 define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
     ; Reinterprets both inputs as <8 x i32>, calls llvm.x86.avx2.phsub.d and casts
     ; the result back to <4 x i64>. Both targets are expected to emit one vphsubd.
 728 ; CHECK-LABEL: test_mm256_hsub_epi32:
 729 ; CHECK:       # %bb.0:
 730 ; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
 731 ; CHECK-NEXT:    ret{{[l|q]}}
 732   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 733   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 734   %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
 735   %bc = bitcast <8 x i32> %res to <4 x i64>
 736   ret <4 x i64> %bc
 737 }
 738 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
 739
 740 define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Reinterprets both inputs as <16 x i16>, calls llvm.x86.avx2.phsub.sw and casts
     ; the result back to <4 x i64>. Both targets are expected to emit one vphsubsw.
 741 ; CHECK-LABEL: test_mm256_hsubs_epi16:
 742 ; CHECK:       # %bb.0:
 743 ; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
 744 ; CHECK-NEXT:    ret{{[l|q]}}
 745   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 746   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 747   %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
 748   %bc = bitcast <16 x i16> %res to <4 x i64>
 749   ret <4 x i64> %bc
 750 }
 751 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
 752
 753 define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of i32 elements with i32 indices: the call passes an
     ; undef first operand, an all-ones mask constant and scale 2.
     ; Expected code builds the mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherdd and copies the result into xmm0. The 32-bit
     ; target first loads the base pointer from the stack.
 754 ; X86-LABEL: test_mm_i32gather_epi32:
 755 ; X86:       # %bb.0:
 756 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 757 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 758 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 759 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 760 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 761 ; X86-NEXT:    retl
 762 ;
 763 ; X64-LABEL: test_mm_i32gather_epi32:
 764 ; X64:       # %bb.0:
 765 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 766 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 767 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 768 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 769 ; X64-NEXT:    retq
 770   %arg0 = bitcast i32 *%a0 to i8*
 771   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 772   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
 773   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
 774   %bc = bitcast <4 x i32> %call to <2 x i64>
 775   ret <2 x i64> %bc
 776 }
 777 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
 778
 779 define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
     ; Masked 128-bit gather of i32 elements with i32 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vpgatherdd into xmm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
 780 ; X86-LABEL: test_mm_mask_i32gather_epi32:
 781 ; X86:       # %bb.0:
 782 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 783 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
 784 ; X86-NEXT:    retl
 785 ;
 786 ; X64-LABEL: test_mm_mask_i32gather_epi32:
 787 ; X64:       # %bb.0:
 788 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
 789 ; X64-NEXT:    retq
 790   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 791   %arg1 = bitcast i32 *%a1 to i8*
 792   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 793   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
 794   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
 795   %bc = bitcast <4 x i32> %call to <2 x i64>
 796   ret <2 x i64> %bc
 797 }
 798
 799 define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
     ; Unmasked 256-bit gather of i32 elements with i32 indices: the call passes an
     ; undef first operand, an all-ones mask constant and scale 2.
     ; Expected code builds the ymm mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherdd and copies the result into ymm0. The 32-bit
     ; target first loads the base pointer from the stack.
 800 ; X86-LABEL: test_mm256_i32gather_epi32:
 801 ; X86:       # %bb.0:
 802 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 803 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 804 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 805 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
 806 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 807 ; X86-NEXT:    retl
 808 ;
 809 ; X64-LABEL: test_mm256_i32gather_epi32:
 810 ; X64:       # %bb.0:
 811 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 812 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 813 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
 814 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 815 ; X64-NEXT:    retq
 816   %arg0 = bitcast i32 *%a0 to i8*
 817   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 818   %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
 819   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
 820   %bc = bitcast <8 x i32> %call to <4 x i64>
 821   ret <4 x i64> %bc
 822 }
 823 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
 824
 825 define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
     ; Masked 256-bit gather of i32 elements with i32 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vpgatherdd into ymm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
 826 ; X86-LABEL: test_mm256_mask_i32gather_epi32:
 827 ; X86:       # %bb.0:
 828 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 829 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
 830 ; X86-NEXT:    retl
 831 ;
 832 ; X64-LABEL: test_mm256_mask_i32gather_epi32:
 833 ; X64:       # %bb.0:
 834 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
 835 ; X64-NEXT:    retq
 836   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 837   %arg1 = bitcast i32 *%a1 to i8*
 838   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
 839   %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
 840   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
 841   %bc = bitcast <8 x i32> %call to <4 x i64>
 842   ret <4 x i64> %bc
 843 }
 844
 845 define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of i64 elements with i32 indices: the call passes an
     ; undef first operand, an all-ones mask constant and scale 2.
     ; Expected code builds the mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherdq and copies the result into xmm0. The 32-bit
     ; target first loads the base pointer from the stack.
 846 ; X86-LABEL: test_mm_i32gather_epi64:
 847 ; X86:       # %bb.0:
 848 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 849 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 850 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 851 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
 852 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 853 ; X86-NEXT:    retl
 854 ;
 855 ; X64-LABEL: test_mm_i32gather_epi64:
 856 ; X64:       # %bb.0:
 857 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 858 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 859 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
 860 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 861 ; X64-NEXT:    retq
 862   %arg0 = bitcast i64 *%a0 to i8*
 863   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 864   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
 865   ret <2 x i64> %res
 866 }
 867 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
 868
 869 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
     ; Masked 128-bit gather of i64 elements with i32 indices: %a0 is tied to the
     ; destination register, %a2 (cast to <4 x i32>) is the index vector and %a3
     ; the mask; scale is 2. Expected code is a single vpgatherdq into xmm0,
     ; preceded on the 32-bit target by a load of the base pointer from the stack.
 870 ; X86-LABEL: test_mm_mask_i32gather_epi64:
 871 ; X86:       # %bb.0:
 872 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 873 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
 874 ; X86-NEXT:    retl
 875 ;
 876 ; X64-LABEL: test_mm_mask_i32gather_epi64:
 877 ; X64:       # %bb.0:
 878 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
 879 ; X64-NEXT:    retq
 880   %arg1 = bitcast i64 *%a1 to i8*
 881   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 882   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
 883   ret <2 x i64> %res
 884 }
 885
 886 define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
     ; Unmasked 256-bit gather of i64 elements with a 128-bit vector of i32 indices:
     ; the call passes an undef first operand, an all-ones mask and scale 2.
     ; Expected code builds the ymm mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherdq (xmm index, ymm result) and copies the result
     ; into ymm0. The 32-bit target first loads the base pointer from the stack.
 887 ; X86-LABEL: test_mm256_i32gather_epi64:
 888 ; X86:       # %bb.0:
 889 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 890 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 891 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 892 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
 893 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 894 ; X86-NEXT:    retl
 895 ;
 896 ; X64-LABEL: test_mm256_i32gather_epi64:
 897 ; X64:       # %bb.0:
 898 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 899 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 900 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
 901 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 902 ; X64-NEXT:    retq
 903   %arg0 = bitcast i64 *%a0 to i8*
 904   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 905   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
 906   ret <4 x i64> %res
 907 }
 908 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
 909
 910 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
     ; Masked 256-bit gather of i64 elements with a 128-bit vector of i32 indices:
     ; %a0 is tied to the destination register, %a2 (cast to <4 x i32>) is the
     ; index vector and %a3 the mask; scale is 2. Expected code is a single
     ; vpgatherdq into ymm0, preceded on the 32-bit target by a load of the base
     ; pointer from the stack.
 911 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
 912 ; X86:       # %bb.0:
 913 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 914 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
 915 ; X86-NEXT:    retl
 916 ;
 917 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
 918 ; X64:       # %bb.0:
 919 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
 920 ; X64-NEXT:    retq
 921   %arg1 = bitcast i64 *%a1 to i8*
 922   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 923   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
 924   ret <4 x i64> %res
 925 }
 926
 927 define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of doubles with i32 indices. The mask is produced in
     ; IR by sign-extending "fcmp oeq 0.0, 0.0" and bitcasting to <2 x double>; the
     ; first call operand is undef and the scale is 2.
     ; Expected code materializes the mask with vpcmpeqd, zeroes the destination
     ; with vxorpd, gathers with vgatherdpd and copies the result into xmm0. The
     ; 32-bit target first loads the base pointer from the stack.
 928 ; X86-LABEL: test_mm_i32gather_pd:
 929 ; X86:       # %bb.0:
 930 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 931 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 932 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 933 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 934 ; X86-NEXT:    vmovapd %xmm1, %xmm0
 935 ; X86-NEXT:    retl
 936 ;
 937 ; X64-LABEL: test_mm_i32gather_pd:
 938 ; X64:       # %bb.0:
 939 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 940 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 941 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 942 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 943 ; X64-NEXT:    retq
 944   %arg0 = bitcast double *%a0 to i8*
 945   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 946   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
 947   %sext = sext <2 x i1> %cmp to <2 x i64>
 948   %mask = bitcast <2 x i64> %sext to <2 x double>
 949   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
 950   ret <2 x double> %res
 951 }
 952 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
 953
 954 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
     ; Masked 128-bit gather of doubles with i32 indices: %a0 is tied to the
     ; destination register, %a2 (cast to <4 x i32>) is the index vector and %a3
     ; the mask; scale is 2. Expected code is a single vgatherdpd into xmm0,
     ; preceded on the 32-bit target by a load of the base pointer from the stack.
 955 ; X86-LABEL: test_mm_mask_i32gather_pd:
 956 ; X86:       # %bb.0:
 957 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 958 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
 959 ; X86-NEXT:    retl
 960 ;
 961 ; X64-LABEL: test_mm_mask_i32gather_pd:
 962 ; X64:       # %bb.0:
 963 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
 964 ; X64-NEXT:    retq
 965   %arg1 = bitcast double *%a1 to i8*
 966   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 967   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
 968   ret <2 x double> %res
 969 }
 970
 971 define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
     ; Unmasked 256-bit gather of doubles with a 128-bit vector of i32 indices. The
     ; mask comes from llvm.x86.avx.cmp.pd.256 comparing zero with zero using
     ; predicate 0; the first gather operand is undef and the scale is 2.
     ; Expected code zeroes a register with vxorpd, compares it with itself via
     ; vcmpeqpd to form the mask, and reuses the zeroed register as the
     ; vgatherdpd destination before copying it into ymm0. The 32-bit target
     ; first loads the base pointer from the stack.
 972 ; X86-LABEL: test_mm256_i32gather_pd:
 973 ; X86:       # %bb.0:
 974 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 975 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 976 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 977 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
 978 ; X86-NEXT:    vmovapd %ymm1, %ymm0
 979 ; X86-NEXT:    retl
 980 ;
 981 ; X64-LABEL: test_mm256_i32gather_pd:
 982 ; X64:       # %bb.0:
 983 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 984 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 985 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
 986 ; X64-NEXT:    vmovapd %ymm1, %ymm0
 987 ; X64-NEXT:    retq
 988   %arg0 = bitcast double *%a0 to i8*
 989   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 990   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
 991   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
 992   ret <4 x double> %res
 993 }
 994 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
 995
 996 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
     ; Masked 256-bit gather of doubles with a 128-bit vector of i32 indices: %a0
     ; is tied to the destination register, %a2 (cast to <4 x i32>) is the index
     ; vector and %a3 the mask; scale is 2. Expected code is a single vgatherdpd
     ; into ymm0, preceded on the 32-bit target by a load of the base pointer
     ; from the stack.
 997 ; X86-LABEL: test_mm256_mask_i32gather_pd:
 998 ; X86:       # %bb.0:
 999 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1000 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
1001 ; X86-NEXT:    retl
1002 ;
1003 ; X64-LABEL: test_mm256_mask_i32gather_pd:
1004 ; X64:       # %bb.0:
1005 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1006 ; X64-NEXT:    retq
1007   %arg1 = bitcast double *%a1 to i8*
1008   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1009   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1010   ret <4 x double> %res
1011 }
1012
1013 define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of floats with i32 indices. The mask is produced in
     ; IR by sign-extending "fcmp oeq 0.0, 0.0" and bitcasting to <4 x float>; the
     ; first call operand is undef and the scale is 2.
     ; Expected code materializes the mask with vpcmpeqd, zeroes the destination
     ; with vxorps, gathers with vgatherdps and copies the result into xmm0. The
     ; 32-bit target first loads the base pointer from the stack.
1014 ; X86-LABEL: test_mm_i32gather_ps:
1015 ; X86:       # %bb.0:
1016 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1017 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1018 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1019 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1020 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1021 ; X86-NEXT:    retl
1022 ;
1023 ; X64-LABEL: test_mm_i32gather_ps:
1024 ; X64:       # %bb.0:
1025 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1026 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1027 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1028 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1029 ; X64-NEXT:    retq
1030   %arg0 = bitcast float *%a0 to i8*
1031   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1032   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1033   %sext = sext <4 x i1> %cmp to <4 x i32>
1034   %mask = bitcast <4 x i32> %sext to <4 x float>
1035   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1036   ret <4 x float> %call
1037 }
1038 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1039
1040 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
     ; Masked 128-bit gather of floats with i32 indices: %a0 is tied to the
     ; destination register, %a2 (cast to <4 x i32>) is the index vector and %a3
     ; the mask; scale is 2. Expected code is a single vgatherdps into xmm0,
     ; preceded on the 32-bit target by a load of the base pointer from the stack.
1041 ; X86-LABEL: test_mm_mask_i32gather_ps:
1042 ; X86:       # %bb.0:
1043 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1044 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1045 ; X86-NEXT:    retl
1046 ;
1047 ; X64-LABEL: test_mm_mask_i32gather_ps:
1048 ; X64:       # %bb.0:
1049 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1050 ; X64-NEXT:    retq
1051   %arg1 = bitcast float *%a1 to i8*
1052   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1053   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1054   ret <4 x float> %call
1055 }
1056
1057 define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
     ; Unmasked 256-bit gather of floats with i32 indices. The mask comes from
     ; llvm.x86.avx.cmp.ps.256 comparing zero with zero using predicate 0; the
     ; first gather operand is undef and the scale is 2.
     ; Expected code zeroes a register with vxorps, compares it with itself via
     ; vcmpeqps to form the mask, and reuses the zeroed register as the
     ; vgatherdps destination before copying it into ymm0. The 32-bit target
     ; first loads the base pointer from the stack.
1058 ; X86-LABEL: test_mm256_i32gather_ps:
1059 ; X86:       # %bb.0:
1060 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1061 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1062 ; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1063 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1064 ; X86-NEXT:    vmovaps %ymm1, %ymm0
1065 ; X86-NEXT:    retl
1066 ;
1067 ; X64-LABEL: test_mm256_i32gather_ps:
1068 ; X64:       # %bb.0:
1069 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1070 ; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1071 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1072 ; X64-NEXT:    vmovaps %ymm1, %ymm0
1073 ; X64-NEXT:    retq
1074   %arg0 = bitcast float *%a0 to i8*
1075   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1076   %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1077   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1078   ret <8 x float> %call
1079 }
1080 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1081
1082 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
     ; Masked 256-bit gather of floats with i32 indices: %a0 is tied to the
     ; destination register, %a2 (cast to <8 x i32>) is the index vector and %a3
     ; the mask; scale is 2. Expected code is a single vgatherdps into ymm0,
     ; preceded on the 32-bit target by a load of the base pointer from the stack.
1083 ; X86-LABEL: test_mm256_mask_i32gather_ps:
1084 ; X86:       # %bb.0:
1085 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1086 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1087 ; X86-NEXT:    retl
1088 ;
1089 ; X64-LABEL: test_mm256_mask_i32gather_ps:
1090 ; X64:       # %bb.0:
1091 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1092 ; X64-NEXT:    retq
1093   %arg1 = bitcast float *%a1 to i8*
1094   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1095   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1096   ret <8 x float> %call
1097 }
1098
1099 define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of i32 elements with i64 indices: the call passes an
     ; undef first operand, an all-ones mask constant and scale 2.
     ; Expected code builds the mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherqd and copies the result into xmm0. The 32-bit
     ; target first loads the base pointer from the stack.
1100 ; X86-LABEL: test_mm_i64gather_epi32:
1101 ; X86:       # %bb.0:
1102 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1103 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1104 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1105 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1106 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1107 ; X86-NEXT:    retl
1108 ;
1109 ; X64-LABEL: test_mm_i64gather_epi32:
1110 ; X64:       # %bb.0:
1111 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1112 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1113 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1114 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1115 ; X64-NEXT:    retq
1116   %arg0 = bitcast i32 *%a0 to i8*
1117   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1118   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1119   %bc = bitcast <4 x i32> %call to <2 x i64>
1120   ret <2 x i64> %bc
1121 }
1122 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1123
1124 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
     ; Masked 128-bit gather of i32 elements with i64 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vpgatherqd into xmm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
1125 ; X86-LABEL: test_mm_mask_i64gather_epi32:
1126 ; X86:       # %bb.0:
1127 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1128 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1129 ; X86-NEXT:    retl
1130 ;
1131 ; X64-LABEL: test_mm_mask_i64gather_epi32:
1132 ; X64:       # %bb.0:
1133 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1134 ; X64-NEXT:    retq
1135   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1136   %arg1 = bitcast i32 *%a1 to i8*
1137   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1138   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1139   %bc = bitcast <4 x i32> %call to <2 x i64>
1140   ret <2 x i64> %bc
1141 }
1142
1143 define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
     ; Unmasked gather of i32 elements with a 256-bit vector of i64 indices; the
     ; result is only 128 bits wide. The call passes an undef first operand, an
     ; all-ones mask constant and scale 2.
     ; Expected code builds the xmm mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherqd (ymm index, xmm result), copies the result
     ; into xmm0 and emits vzeroupper before returning. The 32-bit target first
     ; loads the base pointer from the stack.
1144 ; X86-LABEL: test_mm256_i64gather_epi32:
1145 ; X86:       # %bb.0:
1146 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1147 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1148 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1149 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1150 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1151 ; X86-NEXT:    vzeroupper
1152 ; X86-NEXT:    retl
1153 ;
1154 ; X64-LABEL: test_mm256_i64gather_epi32:
1155 ; X64:       # %bb.0:
1156 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1157 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1158 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1159 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1160 ; X64-NEXT:    vzeroupper
1161 ; X64-NEXT:    retq
1162   %arg0 = bitcast i32 *%a0 to i8*
1163   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1164   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1165   %bc = bitcast <4 x i32> %call to <2 x i64>
1166   ret <2 x i64> %bc
1167 }
1168 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1169
1170 define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
     ; Masked gather of i32 elements with a 256-bit vector of i64 indices and a
     ; 128-bit result: %a0 is tied to the destination register, %a2 is the index
     ; vector and %a3 the mask; scale is 2. Expected code is a single vpgatherqd
     ; into xmm0 followed by vzeroupper, preceded on the 32-bit target by a load
     ; of the base pointer from the stack.
1171 ; X86-LABEL: test_mm256_mask_i64gather_epi32:
1172 ; X86:       # %bb.0:
1173 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1174 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1175 ; X86-NEXT:    vzeroupper
1176 ; X86-NEXT:    retl
1177 ;
1178 ; X64-LABEL: test_mm256_mask_i64gather_epi32:
1179 ; X64:       # %bb.0:
1180 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1181 ; X64-NEXT:    vzeroupper
1182 ; X64-NEXT:    retq
1183   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1184   %arg1 = bitcast i32 *%a1 to i8*
1185   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1186   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1187   %bc = bitcast <4 x i32> %call to <2 x i64>
1188   ret <2 x i64> %bc
1189 }
1190
1191 define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of i64 elements with i64 indices: the call passes an
     ; undef first operand, an all-ones mask constant and scale 2.
     ; Expected code builds the mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherqq and copies the result into xmm0. The 32-bit
     ; target first loads the base pointer from the stack.
1192 ; X86-LABEL: test_mm_i64gather_epi64:
1193 ; X86:       # %bb.0:
1194 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1195 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1196 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1197 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1198 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1199 ; X86-NEXT:    retl
1200 ;
1201 ; X64-LABEL: test_mm_i64gather_epi64:
1202 ; X64:       # %bb.0:
1203 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1204 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1205 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1206 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1207 ; X64-NEXT:    retq
1208   %arg0 = bitcast i64 *%a0 to i8*
1209   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1210   ret <2 x i64> %call
1211 }
1212 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1213
1214 define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
     ; Masked 128-bit gather of i64 elements with i64 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vpgatherqq into xmm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
1215 ; X86-LABEL: test_mm_mask_i64gather_epi64:
1216 ; X86:       # %bb.0:
1217 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1218 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1219 ; X86-NEXT:    retl
1220 ;
1221 ; X64-LABEL: test_mm_mask_i64gather_epi64:
1222 ; X64:       # %bb.0:
1223 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1224 ; X64-NEXT:    retq
1225   %arg1 = bitcast i64 *%a1 to i8*
1226   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1227   ret <2 x i64> %call
1228 }
1229
1230 define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
     ; Unmasked 256-bit gather of i64 elements with i64 indices: the call passes an
     ; undef first operand, an all-ones mask constant and scale 2.
     ; Expected code builds the ymm mask with vpcmpeqd, zeroes the destination with
     ; vpxor, gathers with vpgatherqq and copies the result into ymm0. The 32-bit
     ; target first loads the base pointer from the stack.
1231 ; X86-LABEL: test_mm256_i64gather_epi64:
1232 ; X86:       # %bb.0:
1233 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1234 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1235 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1236 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1237 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
1238 ; X86-NEXT:    retl
1239 ;
1240 ; X64-LABEL: test_mm256_i64gather_epi64:
1241 ; X64:       # %bb.0:
1242 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1243 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1244 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1245 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
1246 ; X64-NEXT:    retq
1247   %arg0 = bitcast i64 *%a0 to i8*
1248   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1249   ret <4 x i64> %call
1250 }
1251 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1252
1253 define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
     ; Masked 256-bit gather of i64 elements with i64 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vpgatherqq into ymm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
1254 ; X86-LABEL: test_mm256_mask_i64gather_epi64:
1255 ; X86:       # %bb.0:
1256 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1257 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1258 ; X86-NEXT:    retl
1259 ;
1260 ; X64-LABEL: test_mm256_mask_i64gather_epi64:
1261 ; X64:       # %bb.0:
1262 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1263 ; X64-NEXT:    retq
1264   %arg1 = bitcast i64 *%a1 to i8*
1265   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1266   ret <4 x i64> %call
1267 }
1268
1269 define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of doubles with i64 indices. The mask is produced in
     ; IR by sign-extending "fcmp oeq 0.0, 0.0" and bitcasting to <2 x double>; the
     ; first call operand is undef and the scale is 2.
     ; Expected code materializes the mask with vpcmpeqd, zeroes the destination
     ; with vxorpd, gathers with vgatherqpd and copies the result into xmm0. The
     ; 32-bit target first loads the base pointer from the stack.
1270 ; X86-LABEL: test_mm_i64gather_pd:
1271 ; X86:       # %bb.0:
1272 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1273 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1274 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1275 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1276 ; X86-NEXT:    vmovapd %xmm1, %xmm0
1277 ; X86-NEXT:    retl
1278 ;
1279 ; X64-LABEL: test_mm_i64gather_pd:
1280 ; X64:       # %bb.0:
1281 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1282 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1283 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1284 ; X64-NEXT:    vmovapd %xmm1, %xmm0
1285 ; X64-NEXT:    retq
1286   %arg0 = bitcast double *%a0 to i8*
1287   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1288   %sext = sext <2 x i1> %cmp to <2 x i64>
1289   %mask = bitcast <2 x i64> %sext to <2 x double>
1290   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1291   ret <2 x double> %call
1292 }
1293 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1294
1295 define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
     ; Masked 128-bit gather of doubles with i64 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vgatherqpd into xmm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
1296 ; X86-LABEL: test_mm_mask_i64gather_pd:
1297 ; X86:       # %bb.0:
1298 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1299 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1300 ; X86-NEXT:    retl
1301 ;
1302 ; X64-LABEL: test_mm_mask_i64gather_pd:
1303 ; X64:       # %bb.0:
1304 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1305 ; X64-NEXT:    retq
1306   %arg1 = bitcast double *%a1 to i8*
1307   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1308   ret <2 x double> %call
1309 }
1310
1311 define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
     ; Unmasked 256-bit gather of doubles with i64 indices. The mask comes from
     ; llvm.x86.avx.cmp.pd.256 comparing zero with zero using predicate 0; the
     ; first gather operand is undef and the scale is 2.
     ; Expected code zeroes a register with vxorpd, compares it with itself via
     ; vcmpeqpd to form the mask, and reuses the zeroed register as the
     ; vgatherqpd destination before copying it into ymm0. The 32-bit target
     ; first loads the base pointer from the stack.
1312 ; X86-LABEL: test_mm256_i64gather_pd:
1313 ; X86:       # %bb.0:
1314 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1315 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1316 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1317 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1318 ; X86-NEXT:    vmovapd %ymm1, %ymm0
1319 ; X86-NEXT:    retl
1320 ;
1321 ; X64-LABEL: test_mm256_i64gather_pd:
1322 ; X64:       # %bb.0:
1323 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1324 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1325 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1326 ; X64-NEXT:    vmovapd %ymm1, %ymm0
1327 ; X64-NEXT:    retq
1328   %arg0 = bitcast double *%a0 to i8*
1329   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1330   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1331   ret <4 x double> %call
1332 }
1333 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1334
1335 define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
     ; Masked 256-bit gather of doubles with i64 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; The base pointer is typed double* like the other pd gather tests in this
     ; file; it is bitcast to i8* before the call, so the pointee type does not
     ; influence the expected code.
     ; Expected code is a single vgatherqpd into ymm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
1336 ; X86-LABEL: test_mm256_mask_i64gather_pd:
1337 ; X86:       # %bb.0:
1338 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1339 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1340 ; X86-NEXT:    retl
1341 ;
1342 ; X64-LABEL: test_mm256_mask_i64gather_pd:
1343 ; X64:       # %bb.0:
1344 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1345 ; X64-NEXT:    retq
1346   %arg1 = bitcast double *%a1 to i8*
1347   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1348   ret <4 x double> %call
1349 }
1350
1351 define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
     ; Unmasked 128-bit gather of floats with i64 indices. The mask is produced in
     ; IR by sign-extending "fcmp oeq 0.0, 0.0" and bitcasting to <4 x float>; the
     ; first call operand is undef and the scale is 2.
     ; Expected code materializes the mask with vpcmpeqd, zeroes the destination
     ; with vxorps, gathers with vgatherqps and copies the result into xmm0. The
     ; 32-bit target first loads the base pointer from the stack.
1352 ; X86-LABEL: test_mm_i64gather_ps:
1353 ; X86:       # %bb.0:
1354 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1355 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1356 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1357 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1358 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1359 ; X86-NEXT:    retl
1360 ;
1361 ; X64-LABEL: test_mm_i64gather_ps:
1362 ; X64:       # %bb.0:
1363 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1364 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1365 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1366 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1367 ; X64-NEXT:    retq
1368   %arg0 = bitcast float *%a0 to i8*
1369   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1370   %sext = sext <4 x i1> %cmp to <4 x i32>
1371   %mask = bitcast <4 x i32> %sext to <4 x float>
1372   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1373   ret <4 x float> %call
1374 }
1375 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
1376
1377 define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
     ; Masked 128-bit gather of floats with i64 indices: %a0 is tied to the
     ; destination register, %a2 is the index vector and %a3 the mask; scale is 2.
     ; Expected code is a single vgatherqps into xmm0, preceded on the 32-bit
     ; target by a load of the base pointer from the stack.
1378 ; X86-LABEL: test_mm_mask_i64gather_ps:
1379 ; X86:       # %bb.0:
1380 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1381 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1382 ; X86-NEXT:    retl
1383 ;
1384 ; X64-LABEL: test_mm_mask_i64gather_ps:
1385 ; X64:       # %bb.0:
1386 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1387 ; X64-NEXT:    retq
1388   %arg1 = bitcast float *%a1 to i8*
1389   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1390   ret <4 x float> %call
1391 }
1392
1393 define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
     ; Unmasked gather of floats with a 256-bit vector of i64 indices; the result
     ; is only 128 bits wide. The mask is produced in IR by sign-extending
     ; "fcmp oeq 0.0, 0.0"; the first call operand is undef and the scale is 2.
     ; Expected code materializes the xmm mask with vpcmpeqd, zeroes the
     ; destination with vxorps, gathers with vgatherqps (ymm index, xmm result),
     ; copies the result into xmm0 and emits vzeroupper before returning. The
     ; 32-bit target first loads the base pointer from the stack.
1394 ; X86-LABEL: test_mm256_i64gather_ps:
1395 ; X86:       # %bb.0:
1396 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1397 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1398 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1399 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1400 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1401 ; X86-NEXT:    vzeroupper
1402 ; X86-NEXT:    retl
1403 ;
1404 ; X64-LABEL: test_mm256_i64gather_ps:
1405 ; X64:       # %bb.0:
1406 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1407 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1408 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1409 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1410 ; X64-NEXT:    vzeroupper
1411 ; X64-NEXT:    retq
1412   %arg0 = bitcast float *%a0 to i8*
1413   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1414   %sext = sext <4 x i1> %cmp to <4 x i32>
1415   %mask = bitcast <4 x i32> %sext to <4 x float>
1416   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1417   ret <4 x float> %call
1418 }
1419 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
1420
; Masked float gather with four 64-bit indices in a 256-bit vector: all
; arguments are forwarded to the gather.q.ps.256 intrinsic with scale 2.
; Expected code is one vgatherqps with a ymm index register followed by
; vzeroupper; the 32-bit target loads the base pointer from the stack first.
1421 define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
1422 ; X86-LABEL: test_mm256_mask_i64gather_ps:
1423 ; X86:       # %bb.0:
1424 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1425 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1426 ; X86-NEXT:    vzeroupper
1427 ; X86-NEXT:    retl
1428 ;
1429 ; X64-LABEL: test_mm256_mask_i64gather_ps:
1430 ; X64:       # %bb.0:
1431 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1432 ; X64-NEXT:    vzeroupper
1433 ; X64-NEXT:    retq
1434   %arg1 = bitcast float *%a1 to i8*
1435   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1436   ret <4 x float> %call
1437 }
1438
; Inserts the 128-bit vector %a1 into the low half of %a0 using two generic
; shuffles. Expected code is a vblendps that takes elements 0-3 from the
; widened %a1 (ymm1) and elements 4-7 from %a0 (ymm0).
1439 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1440 ; CHECK-LABEL: test0_mm256_inserti128_si256:
1441 ; CHECK:       # %bb.0:
1442 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1443 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1444 ; CHECK-NEXT:    ret{{[l|q]}}
  ; Widen %a1 to four lanes; the upper two lanes are undef.
1445   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; Indices 4,5 select lanes 0,1 of %ext; indices 2,3 keep the high half of %a0.
1446   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1447   ret <4 x i64> %res
1448 }
1449
; Inserts the 128-bit vector %a1 into the high half of %a0 using two generic
; shuffles. Expected code is a single vinsertf128 with immediate 1.
1450 define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1451 ; CHECK-LABEL: test1_mm256_inserti128_si256:
1452 ; CHECK:       # %bb.0:
1453 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1454 ; CHECK-NEXT:    ret{{[l|q]}}
  ; Widen %a1 to four lanes; the upper two lanes are undef.
1455   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; Indices 0,1 keep the low half of %a0; indices 4,5 select lanes 0,1 of %ext.
1456   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1457   ret <4 x i64> %res
1458 }
1459
; Reinterprets both operands as <16 x i16>, calls the pmadd.wd intrinsic and
; casts the <8 x i32> result back to <4 x i64>. Expected code: one vpmaddwd.
1460 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1461 ; CHECK-LABEL: test_mm256_madd_epi16:
1462 ; CHECK:       # %bb.0:
1463 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1464 ; CHECK-NEXT:    ret{{[l|q]}}
1465   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1466   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1467   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1468   %bc = bitcast <8 x i32> %res to <4 x i64>
1469   ret <4 x i64> %bc
1470 }
1471 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1472
; Reinterprets both operands as <32 x i8>, calls the pmadd.ub.sw intrinsic and
; casts the <16 x i16> result back to <4 x i64>. Expected code: one vpmaddubsw.
1473 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1474 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1475 ; CHECK:       # %bb.0:
1476 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
1477 ; CHECK-NEXT:    ret{{[l|q]}}
1478   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1479   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1480   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1481   %bc = bitcast <16 x i16> %res to <4 x i64>
1482   ret <4 x i64> %bc
1483 }
1484 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1485
; 128-bit masked load of 32-bit elements: casts the pointer to i8* and the
; mask %a1 to <4 x i32>, then calls the maskload.d intrinsic. Expected code is
; one vpmaskmovd with a memory source; the 32-bit target loads the pointer
; argument from the stack first.
1486 define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
1487 ; X86-LABEL: test_mm_maskload_epi32:
1488 ; X86:       # %bb.0:
1489 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1490 ; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
1491 ; X86-NEXT:    retl
1492 ;
1493 ; X64-LABEL: test_mm_maskload_epi32:
1494 ; X64:       # %bb.0:
1495 ; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
1496 ; X64-NEXT:    retq
1497   %arg0 = bitcast i32* %a0 to i8*
1498   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1499   %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
1500   %bc = bitcast <4 x i32> %call to <2 x i64>
1501   ret <2 x i64> %bc
1502 }
1503 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
1504
; 256-bit masked load of 32-bit elements via the maskload.d.256 intrinsic.
; Expected code is one vpmaskmovd from memory into ymm0; the 32-bit target
; loads the pointer argument from the stack first.
1505 define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
1506 ; X86-LABEL: test_mm256_maskload_epi32:
1507 ; X86:       # %bb.0:
1508 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1509 ; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
1510 ; X86-NEXT:    retl
1511 ;
1512 ; X64-LABEL: test_mm256_maskload_epi32:
1513 ; X64:       # %bb.0:
1514 ; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
1515 ; X64-NEXT:    retq
1516   %arg0 = bitcast i32* %a0 to i8*
1517   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1518   %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
1519   %bc = bitcast <8 x i32> %call to <4 x i64>
1520   ret <4 x i64> %bc
1521 }
1522 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
1523
; 128-bit masked load of 64-bit elements via the maskload.q intrinsic; the
; mask is passed through without a cast because it is already <2 x i64>.
; Expected code is one vpmaskmovq with a memory source.
1524 define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
1525 ; X86-LABEL: test_mm_maskload_epi64:
1526 ; X86:       # %bb.0:
1527 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1528 ; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
1529 ; X86-NEXT:    retl
1530 ;
1531 ; X64-LABEL: test_mm_maskload_epi64:
1532 ; X64:       # %bb.0:
1533 ; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
1534 ; X64-NEXT:    retq
1535   %arg0 = bitcast i64* %a0 to i8*
1536   %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
1537   ret <2 x i64> %res
1538 }
1539 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
1540
; 256-bit masked load of 64-bit elements via the maskload.q.256 intrinsic.
; Expected code is one vpmaskmovq from memory into ymm0.
1541 define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
1542 ; X86-LABEL: test_mm256_maskload_epi64:
1543 ; X86:       # %bb.0:
1544 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1545 ; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
1546 ; X86-NEXT:    retl
1547 ;
1548 ; X64-LABEL: test_mm256_maskload_epi64:
1549 ; X64:       # %bb.0:
1550 ; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
1551 ; X64-NEXT:    retq
1552   %arg0 = bitcast i64* %a0 to i8*
1553   %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
1554   ret <4 x i64> %res
1555 }
1556 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
1557
; 128-bit masked store of 32-bit elements: casts the pointer to i8* and both
; vector operands to <4 x i32>, then calls the maskstore.d intrinsic.
; Expected code is one vpmaskmovd with a memory destination.
; NOTE(review): the pointer parameter is float* although the data is integer;
; presumably this mirrors the front-end output - confirm.
1558 define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1559 ; X86-LABEL: test_mm_maskstore_epi32:
1560 ; X86:       # %bb.0:
1561 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1562 ; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
1563 ; X86-NEXT:    retl
1564 ;
1565 ; X64-LABEL: test_mm_maskstore_epi32:
1566 ; X64:       # %bb.0:
1567 ; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
1568 ; X64-NEXT:    retq
1569   %arg0 = bitcast float* %a0 to i8*
1570   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1571   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1572   call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
1573   ret void
1574 }
; NOTE(review): declared readnone even though the expected code above writes
; to memory; the attribute looks inaccurate for a store - confirm whether it
; is ignored for intrinsic declarations.
1575 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
1576
; 256-bit masked store of 32-bit elements via the maskstore.d.256 intrinsic.
; Expected code is one vpmaskmovd with a memory destination followed by
; vzeroupper before the return.
; NOTE(review): the pointer parameter is float* although the data is integer;
; presumably this mirrors the front-end output - confirm.
1577 define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1578 ; X86-LABEL: test_mm256_maskstore_epi32:
1579 ; X86:       # %bb.0:
1580 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1581 ; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
1582 ; X86-NEXT:    vzeroupper
1583 ; X86-NEXT:    retl
1584 ;
1585 ; X64-LABEL: test_mm256_maskstore_epi32:
1586 ; X64:       # %bb.0:
1587 ; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
1588 ; X64-NEXT:    vzeroupper
1589 ; X64-NEXT:    retq
1590   %arg0 = bitcast float* %a0 to i8*
1591   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1592   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1593   call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
1594   ret void
1595 }
; NOTE(review): declared readnone even though the expected code above writes
; to memory; the attribute looks inaccurate for a store - confirm whether it
; is ignored for intrinsic declarations.
1596 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
1597
; 128-bit masked store of 64-bit elements via the maskstore.q intrinsic; the
; vector operands are already <2 x i64> and need no cast. Expected code is
; one vpmaskmovq with a memory destination.
1598 define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1599 ; X86-LABEL: test_mm_maskstore_epi64:
1600 ; X86:       # %bb.0:
1601 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1602 ; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
1603 ; X86-NEXT:    retl
1604 ;
1605 ; X64-LABEL: test_mm_maskstore_epi64:
1606 ; X64:       # %bb.0:
1607 ; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
1608 ; X64-NEXT:    retq
1609   %arg0 = bitcast i64* %a0 to i8*
1610   call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
1611   ret void
1612 }
; NOTE(review): declared readnone even though the expected code above writes
; to memory; the attribute looks inaccurate for a store - confirm whether it
; is ignored for intrinsic declarations.
1613 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
1614
; 256-bit masked store of 64-bit elements via the maskstore.q.256 intrinsic.
; Expected code is one vpmaskmovq with a memory destination followed by
; vzeroupper before the return.
1615 define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1616 ; X86-LABEL: test_mm256_maskstore_epi64:
1617 ; X86:       # %bb.0:
1618 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1619 ; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
1620 ; X86-NEXT:    vzeroupper
1621 ; X86-NEXT:    retl
1622 ;
1623 ; X64-LABEL: test_mm256_maskstore_epi64:
1624 ; X64:       # %bb.0:
1625 ; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
1626 ; X64-NEXT:    vzeroupper
1627 ; X64-NEXT:    retq
1628   %arg0 = bitcast i64* %a0 to i8*
1629   call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
1630   ret void
1631 }
; NOTE(review): declared readnone even though the expected code above writes
; to memory; the attribute looks inaccurate for a store - confirm whether it
; is ignored for intrinsic declarations.
1632 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
1633
; Signed per-byte maximum written as generic IR (icmp sgt + select) on
; <32 x i8>. Expected to be selected as a single vpmaxsb.
1634 define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1635 ; CHECK-LABEL: test_mm256_max_epi8:
1636 ; CHECK:       # %bb.0:
1637 ; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
1638 ; CHECK-NEXT:    ret{{[l|q]}}
1639   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1640   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1641   %cmp = icmp sgt <32 x i8> %arg0, %arg1
1642   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1643   %bc = bitcast <32 x i8> %sel to <4 x i64>
1644   ret <4 x i64> %bc
1645 }
1646
; Signed per-element maximum written as generic IR (icmp sgt + select) on
; <16 x i16>. Expected to be selected as a single vpmaxsw.
1647 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1648 ; CHECK-LABEL: test_mm256_max_epi16:
1649 ; CHECK:       # %bb.0:
1650 ; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
1651 ; CHECK-NEXT:    ret{{[l|q]}}
1652   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1653   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1654   %cmp = icmp sgt <16 x i16> %arg0, %arg1
1655   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1656   %bc = bitcast <16 x i16> %sel to <4 x i64>
1657   ret <4 x i64> %bc
1658 }
1659
; Signed per-element maximum written as generic IR (icmp sgt + select) on
; <8 x i32>. Expected to be selected as a single vpmaxsd.
1660 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1661 ; CHECK-LABEL: test_mm256_max_epi32:
1662 ; CHECK:       # %bb.0:
1663 ; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
1664 ; CHECK-NEXT:    ret{{[l|q]}}
1665   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1666   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1667   %cmp = icmp sgt <8 x i32> %arg0, %arg1
1668   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1669   %bc = bitcast <8 x i32> %sel to <4 x i64>
1670   ret <4 x i64> %bc
1671 }
1672
; Unsigned per-byte maximum written as generic IR (icmp ugt + select) on
; <32 x i8>. Expected to be selected as a single vpmaxub.
1673 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1674 ; CHECK-LABEL: test_mm256_max_epu8:
1675 ; CHECK:       # %bb.0:
1676 ; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
1677 ; CHECK-NEXT:    ret{{[l|q]}}
1678   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1679   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1680   %cmp = icmp ugt <32 x i8> %arg0, %arg1
1681   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1682   %bc = bitcast <32 x i8> %sel to <4 x i64>
1683   ret <4 x i64> %bc
1684 }
1685
; Unsigned per-element maximum written as generic IR (icmp ugt + select) on
; <16 x i16>. Expected to be selected as a single vpmaxuw.
1686 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1687 ; CHECK-LABEL: test_mm256_max_epu16:
1688 ; CHECK:       # %bb.0:
1689 ; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
1690 ; CHECK-NEXT:    ret{{[l|q]}}
1691   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1692   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1693   %cmp = icmp ugt <16 x i16> %arg0, %arg1
1694   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1695   %bc = bitcast <16 x i16> %sel to <4 x i64>
1696   ret <4 x i64> %bc
1697 }
1698
; Unsigned per-element maximum written as generic IR (icmp ugt + select) on
; <8 x i32>. Expected to be selected as a single vpmaxud.
1699 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1700 ; CHECK-LABEL: test_mm256_max_epu32:
1701 ; CHECK:       # %bb.0:
1702 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
1703 ; CHECK-NEXT:    ret{{[l|q]}}
1704   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1705   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1706   %cmp = icmp ugt <8 x i32> %arg0, %arg1
1707   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1708   %bc = bitcast <8 x i32> %sel to <4 x i64>
1709   ret <4 x i64> %bc
1710 }
1711
; Signed per-byte minimum written as generic IR (icmp slt + select) on
; <32 x i8>. Expected to be selected as a single vpminsb.
1712 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1713 ; CHECK-LABEL: test_mm256_min_epi8:
1714 ; CHECK:       # %bb.0:
1715 ; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
1716 ; CHECK-NEXT:    ret{{[l|q]}}
1717   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1718   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1719   %cmp = icmp slt <32 x i8> %arg0, %arg1
1720   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1721   %bc = bitcast <32 x i8> %sel to <4 x i64>
1722   ret <4 x i64> %bc
1723 }
1724
; Signed per-element minimum written as generic IR (icmp slt + select) on
; <16 x i16>. Expected to be selected as a single vpminsw.
1725 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1726 ; CHECK-LABEL: test_mm256_min_epi16:
1727 ; CHECK:       # %bb.0:
1728 ; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
1729 ; CHECK-NEXT:    ret{{[l|q]}}
1730   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1731   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1732   %cmp = icmp slt <16 x i16> %arg0, %arg1
1733   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1734   %bc = bitcast <16 x i16> %sel to <4 x i64>
1735   ret <4 x i64> %bc
1736 }
1737
; Signed per-element minimum written as generic IR (icmp slt + select) on
; <8 x i32>. Expected to be selected as a single vpminsd.
1738 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1739 ; CHECK-LABEL: test_mm256_min_epi32:
1740 ; CHECK:       # %bb.0:
1741 ; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
1742 ; CHECK-NEXT:    ret{{[l|q]}}
1743   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1744   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1745   %cmp = icmp slt <8 x i32> %arg0, %arg1
1746   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1747   %bc = bitcast <8 x i32> %sel to <4 x i64>
1748   ret <4 x i64> %bc
1749 }
1750
; Unsigned per-byte minimum written as generic IR (icmp ult + select) on
; <32 x i8>. Expected to be selected as a single vpminub.
1751 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1752 ; CHECK-LABEL: test_mm256_min_epu8:
1753 ; CHECK:       # %bb.0:
1754 ; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
1755 ; CHECK-NEXT:    ret{{[l|q]}}
1756   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1757   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1758   %cmp = icmp ult <32 x i8> %arg0, %arg1
1759   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1760   %bc = bitcast <32 x i8> %sel to <4 x i64>
1761   ret <4 x i64> %bc
1762 }
1763
; Unsigned per-element minimum written as generic IR (icmp ult + select) on
; <16 x i16>. Expected to be selected as a single vpminuw.
1764 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1765 ; CHECK-LABEL: test_mm256_min_epu16:
1766 ; CHECK:       # %bb.0:
1767 ; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
1768 ; CHECK-NEXT:    ret{{[l|q]}}
1769   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1770   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1771   %cmp = icmp ult <16 x i16> %arg0, %arg1
1772   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1773   %bc = bitcast <16 x i16> %sel to <4 x i64>
1774   ret <4 x i64> %bc
1775 }
1776
; Unsigned per-element minimum written as generic IR (icmp ult + select) on
; <8 x i32>. Expected to be selected as a single vpminud.
1777 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1778 ; CHECK-LABEL: test_mm256_min_epu32:
1779 ; CHECK:       # %bb.0:
1780 ; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
1781 ; CHECK-NEXT:    ret{{[l|q]}}
1782   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1783   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1784   %cmp = icmp ult <8 x i32> %arg0, %arg1
1785   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1786   %bc = bitcast <8 x i32> %sel to <4 x i64>
1787   ret <4 x i64> %bc
1788 }
1789
; Calls the pmovmskb intrinsic on the input viewed as <32 x i8> and returns
; its i32 result. Expected code is vpmovmskb into eax followed by vzeroupper
; before the return.
1790 define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1791 ; CHECK-LABEL: test_mm256_movemask_epi8:
1792 ; CHECK:       # %bb.0:
1793 ; CHECK-NEXT:    vpmovmskb %ymm0, %eax
1794 ; CHECK-NEXT:    vzeroupper
1795 ; CHECK-NEXT:    ret{{[l|q]}}
1796   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1797   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1798   ret i32 %res
1799 }
1800 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
1801
; Calls the mpsadbw intrinsic on both operands viewed as <32 x i8> with the
; constant control byte 3. Expected code is one vmpsadbw carrying the same
; immediate.
1802 define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1803 ; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1804 ; CHECK:       # %bb.0:
1805 ; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
1806 ; CHECK-NEXT:    ret{{[l|q]}}
1807   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1808   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1809   %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1810   %bc = bitcast <16 x i16>  %call to <4 x i64>
1811   ret <4 x i64> %bc
1812 }
1813 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
1814
; Signed 32x32->64 multiply written as generic IR: each 64-bit lane of both
; operands is sign-extended from its low 32 bits (shl 32 then ashr 32) and
; the results are multiplied. Expected to be selected as a single vpmuldq.
1815 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1816 ; CHECK-LABEL: test_mm256_mul_epi32:
1817 ; CHECK:       # %bb.0:
1818 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
1819 ; CHECK-NEXT:    ret{{[l|q]}}
1820   %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1821   %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1822   %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1823   %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1824   %res = mul nsw <4 x i64> %A1, %B1
1825   ret <4 x i64> %res
1826 }
; NOTE(review): the function above does not call this intrinsic; the
; declaration may be unused - confirm before removing.
1827 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
1828
; Unsigned 32x32->64 multiply written as generic IR: each 64-bit lane of both
; operands is masked to its low 32 bits (and with 4294967295) and the results
; are multiplied. Expected to be selected as a single vpmuludq.
1829 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1830 ; CHECK-LABEL: test_mm256_mul_epu32:
1831 ; CHECK:       # %bb.0:
1832 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1833 ; CHECK-NEXT:    ret{{[l|q]}}
1834   %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1835   %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1836   %res = mul nuw <4 x i64> %A, %B
1837   ret <4 x i64> %res
1838 }
; NOTE(review): the function above does not call this intrinsic; the
; declaration may be unused - confirm before removing.
1839 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
1840
; Calls the pmulh.w intrinsic on both operands viewed as <16 x i16>.
; Expected code: one vpmulhw.
1841 define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1842 ; CHECK-LABEL: test_mm256_mulhi_epi16:
1843 ; CHECK:       # %bb.0:
1844 ; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1845 ; CHECK-NEXT:    ret{{[l|q]}}
1846   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1847   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1848   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1849   %bc = bitcast <16 x i16> %res to <4 x i64>
1850   ret <4 x i64> %bc
1851 }
1852 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1853
; Calls the pmulhu.w intrinsic on both operands viewed as <16 x i16>.
; Expected code: one vpmulhuw.
1854 define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1855 ; CHECK-LABEL: test_mm256_mulhi_epu16:
1856 ; CHECK:       # %bb.0:
1857 ; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1858 ; CHECK-NEXT:    ret{{[l|q]}}
1859   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1860   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1861   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1862   %bc = bitcast <16 x i16> %res to <4 x i64>
1863   ret <4 x i64> %bc
1864 }
1865 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1866
; Calls the pmul.hr.sw intrinsic on both operands viewed as <16 x i16>.
; Expected code: one vpmulhrsw.
1867 define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1868 ; CHECK-LABEL: test_mm256_mulhrs_epi16:
1869 ; CHECK:       # %bb.0:
1870 ; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
1871 ; CHECK-NEXT:    ret{{[l|q]}}
1872   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1873   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1874   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1875   %bc = bitcast <16 x i16> %res to <4 x i64>
1876   ret <4 x i64> %bc
1877 }
1878 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1879
; Element-wise multiply written as a generic IR mul on <16 x i16> (no
; intrinsic call). Expected to be selected as a single vpmullw.
1880 define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1881 ; CHECK-LABEL: test_mm256_mullo_epi16:
1882 ; CHECK:       # %bb.0:
1883 ; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1884 ; CHECK-NEXT:    ret{{[l|q]}}
1885   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1886   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1887   %res = mul <16 x i16> %arg0, %arg1
1888   %bc = bitcast <16 x i16> %res to <4 x i64>
1889   ret <4 x i64> %bc
1890 }
1891
; Element-wise multiply written as a generic IR mul on <8 x i32> (no
; intrinsic call). Expected to be selected as a single vpmulld.
1892 define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1893 ; CHECK-LABEL: test_mm256_mullo_epi32:
1894 ; CHECK:       # %bb.0:
1895 ; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1896 ; CHECK-NEXT:    ret{{[l|q]}}
1897   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1898   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1899   %res = mul <8 x i32> %arg0, %arg1
1900   %bc = bitcast <8 x i32> %res to <4 x i64>
1901   ret <4 x i64> %bc
1902 }
1903
; Bitwise OR of two <4 x i64> values using the generic IR or instruction.
; The expected code uses vorps rather than an integer-named OR instruction.
1904 define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1905 ; CHECK-LABEL: test_mm256_or_si256:
1906 ; CHECK:       # %bb.0:
1907 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1908 ; CHECK-NEXT:    ret{{[l|q]}}
1909   %res = or <4 x i64> %a0, %a1
1910   ret <4 x i64> %res
1911 }
1912
; Calls the packsswb intrinsic on both operands viewed as <16 x i16>, which
; yields <32 x i8>. Expected code: one vpacksswb.
1913 define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1914 ; CHECK-LABEL: test_mm256_packs_epi16:
1915 ; CHECK:       # %bb.0:
1916 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
1917 ; CHECK-NEXT:    ret{{[l|q]}}
1918   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1919   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1920   %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1921   %res = bitcast <32 x i8> %call to <4 x i64>
1922   ret <4 x i64> %res
1923 }
1924 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
1925
; Calls the packssdw intrinsic on both operands viewed as <8 x i32>, which
; yields <16 x i16>. Expected code: one vpackssdw.
1926 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1927 ; CHECK-LABEL: test_mm256_packs_epi32:
1928 ; CHECK:       # %bb.0:
1929 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
1930 ; CHECK-NEXT:    ret{{[l|q]}}
1931   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1932   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1933   %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1934   %res = bitcast <16 x i16> %call to <4 x i64>
1935   ret <4 x i64> %res
1936 }
1937 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
1938
; Calls the packuswb intrinsic on both operands viewed as <16 x i16>, which
; yields <32 x i8>. Expected code: one vpackuswb.
1939 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1940 ; CHECK-LABEL: test_mm256_packus_epi16:
1941 ; CHECK:       # %bb.0:
1942 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1943 ; CHECK-NEXT:    ret{{[l|q]}}
1944   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1945   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1946   %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1947   %res = bitcast <32 x i8> %call to <4 x i64>
1948   ret <4 x i64> %res
1949 }
1950 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
1951
; Calls the packusdw intrinsic on both operands viewed as <8 x i32>, which
; yields <16 x i16>. Expected code: one vpackusdw.
1952 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1953 ; CHECK-LABEL: test_mm256_packus_epi32:
1954 ; CHECK:       # %bb.0:
1955 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1956 ; CHECK-NEXT:    ret{{[l|q]}}
1957   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1958   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1959   %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1960   %res = bitcast <16 x i16> %call to <4 x i64>
1961   ret <4 x i64> %res
1962 }
1963 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
1964
; 128-bit lane permute written as a generic shuffle: indices 2,3 take the
; high half of %a0 and indices 6,7 take the high half of %a1. Expected code
; is a vperm2f128 selecting ymm0[2,3] and ymm1[2,3].
1965 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1966 ; CHECK-LABEL: test_mm256_permute2x128_si256:
1967 ; CHECK:       # %bb.0:
1968 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1969 ; CHECK-NEXT:    ret{{[l|q]}}
1970   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1971   ret <4 x i64> %res
1972 }
; NOTE(review): the function above does not call this intrinsic; the
; declaration may be unused - confirm before removing.
1973 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
1974
; Single-source permute of four 64-bit integer lanes with the constant order
; 3,0,2,0, written as a generic shuffle. The expected code uses vpermpd with
; that same element order.
1975 define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1976 ; CHECK-LABEL: test_mm256_permute4x64_epi64:
1977 ; CHECK:       # %bb.0:
1978 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1979 ; CHECK-NEXT:    ret{{[l|q]}}
1980   %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1981   ret <4 x i64> %res
1982 }
1983
; Single-source permute of four doubles with the constant order 1,2,1,0,
; written as a generic shuffle. Expected code: vpermpd with that order.
1984 define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1985 ; CHECK-LABEL: test_mm256_permute4x64_pd:
1986 ; CHECK:       # %bb.0:
1987 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1988 ; CHECK-NEXT:    ret{{[l|q]}}
1989   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1990   ret <4 x double> %res
1991 }
1992
; Calls the permd intrinsic on both operands viewed as <8 x i32>. The
; expected code uses vpermps, with ymm0 (first argument) as the first listed
; operand and ymm1 (second argument) as the second.
1993 define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1994 ; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
1995 ; CHECK:       # %bb.0:
1996 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1997 ; CHECK-NEXT:    ret{{[l|q]}}
1998   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1999   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2000   %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
2001   %res = bitcast <8 x i32> %call to <4 x i64>
2002   ret <4 x i64> %res
2003 }
2004 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
2005
; Calls the permps intrinsic with the float vector %a0 and %a1 viewed as
; <8 x i32>. Expected code: one vpermps.
2006 define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
2007 ; CHECK-LABEL: test_mm256_permutevar8x32_ps:
2008 ; CHECK:       # %bb.0:
2009 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2010 ; CHECK-NEXT:    ret{{[l|q]}}
2011   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2012   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
2013   ret <8 x float> %res
2014 }
2015 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2016
; Calls the psad.bw intrinsic on both operands viewed as <32 x i8>; the
; intrinsic already returns <4 x i64>, so no result cast is needed.
; Expected code: one vpsadbw.
2017 define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2018 ; CHECK-LABEL: test_mm256_sad_epu8:
2019 ; CHECK:       # %bb.0:
2020 ; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2021 ; CHECK-NEXT:    ret{{[l|q]}}
2022   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2023   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2024   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2025   ret <4 x i64> %res
2026 }
2027 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2028
; Shuffle of 32-bit elements that applies the order 3,3,0,0 within each
; 128-bit half (indices 4-7 repeat the pattern offset by four), written as a
; generic shuffle. The expected code uses vpermilps with the same indices.
2029 define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2030 ; CHECK-LABEL: test_mm256_shuffle_epi32:
2031 ; CHECK:       # %bb.0:
2032 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2033 ; CHECK-NEXT:    ret{{[l|q]}}
2034   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2035   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2036   %res = bitcast <8 x i32> %shuf to <4 x i64>
2037   ret <4 x i64> %res
2038 }
2039
; Calls the pshuf.b intrinsic on both operands viewed as <32 x i8>.
; Expected code: one vpshufb.
2040 define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2041 ; CHECK-LABEL: test_mm256_shuffle_epi8:
2042 ; CHECK:       # %bb.0:
2043 ; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2044 ; CHECK-NEXT:    ret{{[l|q]}}
2045   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2046   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2047   %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2048   %res = bitcast <32 x i8> %shuf to <4 x i64>
2049   ret <4 x i64> %res
2050 }
2051 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2052
; Generic shuffle of <16 x i16> that keeps elements 0-3 and 8-11 in place and
; reorders the upper four words of each 128-bit half as 7,6,6,5 (15,14,14,13
; in the second half). Expected code: one vpshufhw with the same indices.
2053 define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2054 ; CHECK-LABEL: test_mm256_shufflehi_epi16:
2055 ; CHECK:       # %bb.0:
2056 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2057 ; CHECK-NEXT:    ret{{[l|q]}}
2058   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2059   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2060   %res = bitcast <16 x i16> %shuf to <4 x i64>
2061   ret <4 x i64> %res
2062 }
2063
; Generic shuffle of <16 x i16> that reorders the lower four words of each
; 128-bit half as 3,0,1,1 (11,8,9,9 in the second half) and keeps elements
; 4-7 and 12-15 in place. Expected code: one vpshuflw with the same indices.
2064 define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2065 ; CHECK-LABEL: test_mm256_shufflelo_epi16:
2066 ; CHECK:       # %bb.0:
2067 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2068 ; CHECK-NEXT:    ret{{[l|q]}}
2069   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2070   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2071   %res = bitcast <16 x i16> %shuf to <4 x i64>
2072   ret <4 x i64> %res
2073 }
2074
; Calls the psign.b intrinsic on both operands viewed as <32 x i8>.
; Expected code: one vpsignb.
2075 define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2076 ; CHECK-LABEL: test_mm256_sign_epi8:
2077 ; CHECK:       # %bb.0:
2078 ; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2079 ; CHECK-NEXT:    ret{{[l|q]}}
2080   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2081   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2082   %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2083   %res = bitcast <32 x i8> %call to <4 x i64>
2084   ret <4 x i64> %res
2085 }
2086 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2087
; Calls the psign.w intrinsic on both operands viewed as <16 x i16>.
; Expected code: one vpsignw.
2088 define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2089 ; CHECK-LABEL: test_mm256_sign_epi16:
2090 ; CHECK:       # %bb.0:
2091 ; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2092 ; CHECK-NEXT:    ret{{[l|q]}}
2093   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2094   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2095   %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2096   %res = bitcast <16 x i16> %call to <4 x i64>
2097   ret <4 x i64> %res
2098 }
2099 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2100
; Calls the psign.d intrinsic on both operands viewed as <8 x i32>.
; Expected code: one vpsignd.
2101 define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2102 ; CHECK-LABEL: test_mm256_sign_epi32:
2103 ; CHECK:       # %bb.0:
2104 ; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2105 ; CHECK-NEXT:    ret{{[l|q]}}
2106   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2107   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2108   %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2109   %res = bitcast <8 x i32> %call to <4 x i64>
2110   ret <4 x i64> %res
2111 }
2112 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2113
; Calls the psll.w intrinsic with %a0 viewed as <16 x i16> and the 128-bit
; count operand %a1 viewed as <8 x i16>. Expected code: one vpsllw taking the
; count from xmm1.
2114 define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2115 ; CHECK-LABEL: test_mm256_sll_epi16:
2116 ; CHECK:       # %bb.0:
2117 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2118 ; CHECK-NEXT:    ret{{[l|q]}}
2119   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2120   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2121   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2122   %bc = bitcast <16 x i16> %res to <4 x i64>
2123   ret <4 x i64> %bc
2124 }
2125 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2126
; Calls the psll.d intrinsic with %a0 viewed as <8 x i32> and the 128-bit
; count operand %a1 viewed as <4 x i32>. Expected code: one vpslld taking the
; count from xmm1.
2127 define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2128 ; CHECK-LABEL: test_mm256_sll_epi32:
2129 ; CHECK:       # %bb.0:
2130 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2131 ; CHECK-NEXT:    ret{{[l|q]}}
2132   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2133   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2134   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2135   %bc = bitcast <8 x i32> %res to <4 x i64>
2136   ret <4 x i64> %bc
2137 }
2138 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2139
2140 define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) { ; psll.q intrinsic; the assertions expect a single vpsllq
2141 ; CHECK-LABEL: test_mm256_sll_epi64:
2142 ; CHECK:       # %bb.0:
2143 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2144 ; CHECK-NEXT:    ret{{[l|q]}}
2145   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; arguments already have the intrinsic's types, so no bitcasts
2146   ret <4 x i64> %res
2147 }
2148 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2149
2150 define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) { ; pslli.w intrinsic with constant count 3; the assertions expect vpsllw with immediate $3
2151 ; CHECK-LABEL: test_mm256_slli_epi16:
2152 ; CHECK:       # %bb.0:
2153 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
2154 ; CHECK-NEXT:    ret{{[l|q]}}
2155   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; value to shift, viewed as 16 x i16
2156   %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3) ; count passed as an i32 constant
2157   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2158   ret <4 x i64> %bc
2159 }
2160 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2161
2162 define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) { ; pslli.d intrinsic with constant count 3; the assertions expect vpslld with immediate $3
2163 ; CHECK-LABEL: test_mm256_slli_epi32:
2164 ; CHECK:       # %bb.0:
2165 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
2166 ; CHECK-NEXT:    ret{{[l|q]}}
2167   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; value to shift, viewed as 8 x i32
2168   %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3) ; count passed as an i32 constant
2169   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2170   ret <4 x i64> %bc
2171 }
2172 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2173
2174 define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) { ; pslli.q intrinsic with constant count 3; the assertions expect vpsllq with immediate $3
2175 ; CHECK-LABEL: test_mm256_slli_epi64:
2176 ; CHECK:       # %bb.0:
2177 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
2178 ; CHECK-NEXT:    ret{{[l|q]}}
2179   %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3) ; argument already has the intrinsic's type, so no bitcasts
2180   ret <4 x i64> %res
2181 }
2182 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2183
2184 define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) { ; byte shift written as a generic shufflevector (no intrinsic); the assertions expect it to be matched as vpslldq
2185 ; CHECK-LABEL: test_mm256_slli_si256:
2186 ; CHECK:       # %bb.0:
2187 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2188 ; CHECK-NEXT:    ret{{[l|q]}}
2189   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; operate on individual bytes
2190   %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60> ; mask indices 0-31 pick zero bytes, 32-63 pick bytes of %arg0: each 16-byte half gets 3 zeros followed by its first 13 input bytes
2191   %res = bitcast <32 x i8> %shuf to <4 x i64> ; back to the 4 x i64 return type
2192   ret <4 x i64> %res
2193 }
2194
2195 define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; 128-bit psllv.d intrinsic (vector of counts); the assertions expect a single vpsllvd on xmm registers
2196 ; CHECK-LABEL: test_mm_sllv_epi32:
2197 ; CHECK:       # %bb.0:
2198 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2199 ; CHECK-NEXT:    ret{{[l|q]}}
2200   %arg0 = bitcast <2 x i64> %a0 to <4 x i32> ; values to shift, viewed as 4 x i32
2201   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; counts, same element type and width as the values
2202   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2203   %bc = bitcast <4 x i32> %res to <2 x i64> ; back to the 2 x i64 return type
2204   ret <2 x i64> %bc
2205 }
2206 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2207
2208 define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; 256-bit psllv.d.256 intrinsic (vector of counts); the assertions expect a single vpsllvd on ymm registers
2209 ; CHECK-LABEL: test_mm256_sllv_epi32:
2210 ; CHECK:       # %bb.0:
2211 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2212 ; CHECK-NEXT:    ret{{[l|q]}}
2213   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; values to shift, viewed as 8 x i32
2214   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; counts, same element type and width as the values
2215   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2216   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2217   ret <4 x i64> %bc
2218 }
2219 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2220
2221 define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) { ; 128-bit psllv.q intrinsic; the assertions expect a single vpsllvq on xmm registers
2222 ; CHECK-LABEL: test_mm_sllv_epi64:
2223 ; CHECK:       # %bb.0:
2224 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2225 ; CHECK-NEXT:    ret{{[l|q]}}
2226   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; arguments already have the intrinsic's types, so no bitcasts
2227   ret <2 x i64> %res
2228 }
2229 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2230
2231 define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) { ; 256-bit psllv.q.256 intrinsic; the assertions expect a single vpsllvq on ymm registers
2232 ; CHECK-LABEL: test_mm256_sllv_epi64:
2233 ; CHECK:       # %bb.0:
2234 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2235 ; CHECK-NEXT:    ret{{[l|q]}}
2236   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; arguments already have the intrinsic's types, so no bitcasts
2237   ret <4 x i64> %res
2238 }
2239 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2240
2241 define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) { ; psra.w intrinsic with a 128-bit count operand; the assertions expect a single vpsraw
2242 ; CHECK-LABEL: test_mm256_sra_epi16:
2243 ; CHECK:       # %bb.0:
2244 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2245 ; CHECK-NEXT:    ret{{[l|q]}}
2246   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; value to shift, viewed as 16 x i16
2247   %arg1 = bitcast <2 x i64> %a1 to <8 x i16> ; count operand, typed as the intrinsic requires
2248   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2249   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2250   ret <4 x i64> %bc
2251 }
2252 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2253
2254 define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) { ; psra.d intrinsic with a 128-bit count operand; the assertions expect a single vpsrad
2255 ; CHECK-LABEL: test_mm256_sra_epi32:
2256 ; CHECK:       # %bb.0:
2257 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2258 ; CHECK-NEXT:    ret{{[l|q]}}
2259   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; value to shift, viewed as 8 x i32
2260   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; count operand, typed as the intrinsic requires
2261   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2262   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2263   ret <4 x i64> %bc
2264 }
2265 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2266
2267 define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) { ; psrai.w intrinsic with constant count 3; the assertions expect vpsraw with immediate $3
2268 ; CHECK-LABEL: test_mm256_srai_epi16:
2269 ; CHECK:       # %bb.0:
2270 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
2271 ; CHECK-NEXT:    ret{{[l|q]}}
2272   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; value to shift, viewed as 16 x i16
2273   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3) ; count passed as an i32 constant
2274   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2275   ret <4 x i64> %bc
2276 }
2277 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2278
2279 define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) { ; psrai.d intrinsic with constant count 3; the assertions expect vpsrad with immediate $3
2280 ; CHECK-LABEL: test_mm256_srai_epi32:
2281 ; CHECK:       # %bb.0:
2282 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
2283 ; CHECK-NEXT:    ret{{[l|q]}}
2284   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; value to shift, viewed as 8 x i32
2285   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3) ; count passed as an i32 constant
2286   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2287   ret <4 x i64> %bc
2288 }
2289 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2290
2291 define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; 128-bit psrav.d intrinsic (vector of counts); the assertions expect a single vpsravd on xmm registers
2292 ; CHECK-LABEL: test_mm_srav_epi32:
2293 ; CHECK:       # %bb.0:
2294 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2295 ; CHECK-NEXT:    ret{{[l|q]}}
2296   %arg0 = bitcast <2 x i64> %a0 to <4 x i32> ; values to shift, viewed as 4 x i32
2297   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; counts, same element type and width as the values
2298   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2299   %bc = bitcast <4 x i32> %res to <2 x i64> ; back to the 2 x i64 return type
2300   ret <2 x i64> %bc
2301 }
2302 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2303
2304 define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; 256-bit psrav.d.256 intrinsic (vector of counts); the assertions expect a single vpsravd on ymm registers
2305 ; CHECK-LABEL: test_mm256_srav_epi32:
2306 ; CHECK:       # %bb.0:
2307 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2308 ; CHECK-NEXT:    ret{{[l|q]}}
2309   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; values to shift, viewed as 8 x i32
2310   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; counts, same element type and width as the values
2311   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2312   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2313   ret <4 x i64> %bc
2314 }
2315 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2316
2317 define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) { ; psrl.w intrinsic with a 128-bit count operand; the assertions expect a single vpsrlw
2318 ; CHECK-LABEL: test_mm256_srl_epi16:
2319 ; CHECK:       # %bb.0:
2320 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2321 ; CHECK-NEXT:    ret{{[l|q]}}
2322   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; value to shift, viewed as 16 x i16
2323   %arg1 = bitcast <2 x i64> %a1 to <8 x i16> ; count operand, typed as the intrinsic requires
2324   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2325   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2326   ret <4 x i64> %bc
2327 }
2328 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2329
2330 define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) { ; psrl.d intrinsic with a 128-bit count operand; the assertions expect a single vpsrld
2331 ; CHECK-LABEL: test_mm256_srl_epi32:
2332 ; CHECK:       # %bb.0:
2333 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2334 ; CHECK-NEXT:    ret{{[l|q]}}
2335   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; value to shift, viewed as 8 x i32
2336   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; count operand, typed as the intrinsic requires
2337   %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2338   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2339   ret <4 x i64> %bc
2340 }
2341 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2342
2343 define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) { ; psrl.q intrinsic; the assertions expect a single vpsrlq
2344 ; CHECK-LABEL: test_mm256_srl_epi64:
2345 ; CHECK:       # %bb.0:
2346 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2347 ; CHECK-NEXT:    ret{{[l|q]}}
2348   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; arguments already have the intrinsic's types, so no bitcasts
2349   ret <4 x i64> %res
2350 }
2351 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2352
2353 define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) { ; psrli.w intrinsic with constant count 3; the assertions expect vpsrlw with immediate $3
2354 ; CHECK-LABEL: test_mm256_srli_epi16:
2355 ; CHECK:       # %bb.0:
2356 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
2357 ; CHECK-NEXT:    ret{{[l|q]}}
2358   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; value to shift, viewed as 16 x i16
2359   %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3) ; count passed as an i32 constant
2360   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2361   ret <4 x i64> %bc
2362 }
2363 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2364
2365 define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) { ; psrli.d intrinsic with constant count 3; the assertions expect vpsrld with immediate $3
2366 ; CHECK-LABEL: test_mm256_srli_epi32:
2367 ; CHECK:       # %bb.0:
2368 ; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
2369 ; CHECK-NEXT:    ret{{[l|q]}}
2370   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; value to shift, viewed as 8 x i32
2371   %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3) ; count passed as an i32 constant
2372   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2373   ret <4 x i64> %bc
2374 }
2375 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2376
2377 define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) { ; psrli.q intrinsic with constant count 3; the assertions expect vpsrlq with immediate $3
2378 ; CHECK-LABEL: test_mm256_srli_epi64:
2379 ; CHECK:       # %bb.0:
2380 ; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
2381 ; CHECK-NEXT:    ret{{[l|q]}}
2382   %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3) ; argument already has the intrinsic's type, so no bitcasts
2383   ret <4 x i64> %res
2384 }
2385 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2386
2387 define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) { ; byte shift written as a generic shufflevector (no intrinsic); the assertions expect it to be matched as vpsrldq
2388 ; CHECK-LABEL: test_mm256_srli_si256:
2389 ; CHECK:       # %bb.0:
2390 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2391 ; CHECK-NEXT:    ret{{[l|q]}}
2392   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; operate on individual bytes
2393   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50> ; mask indices 0-31 pick bytes of %arg0, 32-63 pick zero bytes: each 16-byte half keeps its last 13 input bytes followed by 3 zeros
2394   %res = bitcast <32 x i8> %shuf to <4 x i64> ; back to the 4 x i64 return type
2395   ret <4 x i64> %res
2396 }
2397
2398 define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; 128-bit psrlv.d intrinsic (vector of counts); the assertions expect a single vpsrlvd on xmm registers
2399 ; CHECK-LABEL: test_mm_srlv_epi32:
2400 ; CHECK:       # %bb.0:
2401 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
2402 ; CHECK-NEXT:    ret{{[l|q]}}
2403   %arg0 = bitcast <2 x i64> %a0 to <4 x i32> ; values to shift, viewed as 4 x i32
2404   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; counts, same element type and width as the values
2405   %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2406   %bc = bitcast <4 x i32> %res to <2 x i64> ; back to the 2 x i64 return type
2407   ret <2 x i64> %bc
2408 }
2409 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
2410
2411 define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; 256-bit psrlv.d.256 intrinsic (vector of counts); the assertions expect a single vpsrlvd on ymm registers
2412 ; CHECK-LABEL: test_mm256_srlv_epi32:
2413 ; CHECK:       # %bb.0:
2414 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
2415 ; CHECK-NEXT:    ret{{[l|q]}}
2416   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; values to shift, viewed as 8 x i32
2417   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; counts, same element type and width as the values
2418   %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2419   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2420   ret <4 x i64> %bc
2421 }
2422 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2423
2424 define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) { ; 128-bit psrlv.q intrinsic; the assertions expect a single vpsrlvq on xmm registers
2425 ; CHECK-LABEL: test_mm_srlv_epi64:
2426 ; CHECK:       # %bb.0:
2427 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
2428 ; CHECK-NEXT:    ret{{[l|q]}}
2429   %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; arguments already have the intrinsic's types, so no bitcasts
2430   ret <2 x i64> %res
2431 }
2432 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
2433
2434 define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) { ; 256-bit psrlv.q.256 intrinsic; the assertions expect a single vpsrlvq on ymm registers
2435 ; CHECK-LABEL: test_mm256_srlv_epi64:
2436 ; CHECK:       # %bb.0:
2437 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
2438 ; CHECK-NEXT:    ret{{[l|q]}}
2439   %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; arguments already have the intrinsic's types, so no bitcasts
2440   ret <4 x i64> %res
2441 }
2442 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2443
2444 define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) { ; movntdqa intrinsic load through a pointer argument; the two targets are asserted separately because the pointer arrives differently (stack slot on i386, %rdi on x86_64)
2445 ; X86-LABEL: test_mm256_stream_load_si256:
2446 ; X86:       # %bb.0:
2447 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2448 ; X86-NEXT:    vmovntdqa (%eax), %ymm0
2449 ; X86-NEXT:    retl
2450 ;
2451 ; X64-LABEL: test_mm256_stream_load_si256:
2452 ; X64:       # %bb.0:
2453 ; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2454 ; X64-NEXT:    retq
2455   %arg0 = bitcast <4 x i64> *%a0 to i8* ; the intrinsic is declared to take an i8* pointer
2456   %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0) ; result is already 4 x i64, so no cast on the way out
2457   ret <4 x i64> %res
2458 }
2459 declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
2460
2461 define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; plain IR sub on 32 x i8 (no intrinsic); the assertions expect a single vpsubb
2462 ; CHECK-LABEL: test_mm256_sub_epi8:
2463 ; CHECK:       # %bb.0:
2464 ; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2465 ; CHECK-NEXT:    ret{{[l|q]}}
2466   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; minuend viewed as 32 x i8
2467   %arg1 = bitcast <4 x i64> %a1 to <32 x i8> ; subtrahend viewed as 32 x i8
2468   %res = sub <32 x i8> %arg0, %arg1
2469   %bc = bitcast <32 x i8> %res to <4 x i64> ; back to the 4 x i64 return type
2470   ret <4 x i64> %bc
2471 }
2472
2473 define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; plain IR sub on 16 x i16 (no intrinsic); the assertions expect a single vpsubw
2474 ; CHECK-LABEL: test_mm256_sub_epi16:
2475 ; CHECK:       # %bb.0:
2476 ; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2477 ; CHECK-NEXT:    ret{{[l|q]}}
2478   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; minuend viewed as 16 x i16
2479   %arg1 = bitcast <4 x i64> %a1 to <16 x i16> ; subtrahend viewed as 16 x i16
2480   %res = sub <16 x i16> %arg0, %arg1
2481   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2482   ret <4 x i64> %bc
2483 }
2484
2485 define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; plain IR sub on 8 x i32 (no intrinsic); the assertions expect a single vpsubd
2486 ; CHECK-LABEL: test_mm256_sub_epi32:
2487 ; CHECK:       # %bb.0:
2488 ; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2489 ; CHECK-NEXT:    ret{{[l|q]}}
2490   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; minuend viewed as 8 x i32
2491   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; subtrahend viewed as 8 x i32
2492   %res = sub <8 x i32> %arg0, %arg1
2493   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2494   ret <4 x i64> %bc
2495 }
2496
2497 define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; plain IR sub directly on the 4 x i64 arguments; the assertions expect a single vpsubq
2498 ; CHECK-LABEL: test_mm256_sub_epi64:
2499 ; CHECK:       # %bb.0:
2500 ; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2501 ; CHECK-NEXT:    ret{{[l|q]}}
2502   %res = sub <4 x i64> %a0, %a1 ; element type already matches, so no bitcasts
2503   ret <4 x i64> %res
2504 }
2505
2506 define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; generic llvm.ssub.sat on 32 x i8 (not an x86-specific intrinsic); the assertions expect a single vpsubsb
2507 ; CHECK-LABEL: test_mm256_subs_epi8:
2508 ; CHECK:       # %bb.0:
2509 ; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2510 ; CHECK-NEXT:    ret{{[l|q]}}
2511   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; first operand viewed as 32 x i8
2512   %arg1 = bitcast <4 x i64> %a1 to <32 x i8> ; second operand viewed as 32 x i8
2513   %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2514   %bc = bitcast <32 x i8> %res to <4 x i64> ; back to the 4 x i64 return type
2515   ret <4 x i64> %bc
2516 }
2517 declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2518
2519 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; generic llvm.ssub.sat on 16 x i16 (not an x86-specific intrinsic); the assertions expect a single vpsubsw
2520 ; CHECK-LABEL: test_mm256_subs_epi16:
2521 ; CHECK:       # %bb.0:
2522 ; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2523 ; CHECK-NEXT:    ret{{[l|q]}}
2524   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; first operand viewed as 16 x i16
2525   %arg1 = bitcast <4 x i64> %a1 to <16 x i16> ; second operand viewed as 16 x i16
2526   %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2527   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2528   ret <4 x i64> %bc
2529 }
2530 declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2531
2532 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; generic llvm.usub.sat on 32 x i8 (not an x86-specific intrinsic); the assertions expect a single vpsubusb
2533 ; CHECK-LABEL: test_mm256_subs_epu8:
2534 ; CHECK:       # %bb.0:
2535 ; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2536 ; CHECK-NEXT:    ret{{[l|q]}}
2537   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; first operand viewed as 32 x i8
2538   %arg1 = bitcast <4 x i64> %a1 to <32 x i8> ; second operand viewed as 32 x i8
2539   %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2540   %bc = bitcast <32 x i8> %res to <4 x i64> ; back to the 4 x i64 return type
2541   ret <4 x i64> %bc
2542 }
2543 declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
2544
2545 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; generic llvm.usub.sat on 16 x i16 (not an x86-specific intrinsic); the assertions expect a single vpsubusw
2546 ; CHECK-LABEL: test_mm256_subs_epu16:
2547 ; CHECK:       # %bb.0:
2548 ; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2549 ; CHECK-NEXT:    ret{{[l|q]}}
2550   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; first operand viewed as 16 x i16
2551   %arg1 = bitcast <4 x i64> %a1 to <16 x i16> ; second operand viewed as 16 x i16
2552   %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2553   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2554   ret <4 x i64> %bc
2555 }
2556 declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
2557
2558 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; byte interleave written as a generic shufflevector; the assertions expect it to be matched as vpunpckhbw
2559 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
2560 ; CHECK:       # %bb.0:
2561 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2562 ; CHECK-NEXT:    ret{{[l|q]}}
2563   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; first source viewed as 32 x i8
2564   %arg1 = bitcast <4 x i64> %a1 to <32 x i8> ; second source viewed as 32 x i8
2565   %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> ; index i picks %arg0[i], index 32+i picks %arg1[i]: alternates bytes 8-15 and then 24-31 of the two sources
2566   %bc = bitcast <32 x i8> %res to <4 x i64> ; back to the 4 x i64 return type
2567   ret <4 x i64> %bc
2568 }
2569
2570 define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; 16-bit interleave written as a generic shufflevector; the assertions expect it to be matched as vpunpckhwd
2571 ; CHECK-LABEL: test_mm256_unpackhi_epi16:
2572 ; CHECK:       # %bb.0:
2573 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2574 ; CHECK-NEXT:    ret{{[l|q]}}
2575   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; first source viewed as 16 x i16
2576   %arg1 = bitcast <4 x i64> %a1 to <16 x i16> ; second source viewed as 16 x i16
2577   %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> ; index i picks %arg0[i], index 16+i picks %arg1[i]: alternates elements 4-7 and then 12-15 of the two sources
2578   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2579   ret <4 x i64> %bc
2580 }
2581
2582 define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; 32-bit interleave written as a generic shufflevector; note the asserted instruction is vunpckhps
2583 ; CHECK-LABEL: test_mm256_unpackhi_epi32:
2584 ; CHECK:       # %bb.0:
2585 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2586 ; CHECK-NEXT:    ret{{[l|q]}}
2587   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; first source viewed as 8 x i32
2588   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; second source viewed as 8 x i32
2589   %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ; index i picks %arg0[i], index 8+i picks %arg1[i]: alternates elements 2-3 and then 6-7 of the two sources
2590   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2591   ret <4 x i64> %bc
2592 }
2593
2594 define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; 64-bit interleave written as a generic shufflevector; note the asserted instruction is vunpckhpd
2595 ; CHECK-LABEL: test_mm256_unpackhi_epi64:
2596 ; CHECK:       # %bb.0:
2597 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2598 ; CHECK-NEXT:    ret{{[l|q]}}
2599   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; index i picks %a0[i], index 4+i picks %a1[i]: elements 1 and 3 of each source, alternated
2600   ret <4 x i64> %res
2601 }
2602
2603 define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; byte interleave written as a generic shufflevector; the assertions expect it to be matched as vpunpcklbw
2604 ; CHECK-LABEL: test_mm256_unpacklo_epi8:
2605 ; CHECK:       # %bb.0:
2606 ; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2607 ; CHECK-NEXT:    ret{{[l|q]}}
2608   %arg0 = bitcast <4 x i64> %a0 to <32 x i8> ; first source viewed as 32 x i8
2609   %arg1 = bitcast <4 x i64> %a1 to <32 x i8> ; second source viewed as 32 x i8
2610   %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> ; index i picks %arg0[i], index 32+i picks %arg1[i]: alternates bytes 0-7 and then 16-23 of the two sources
2611   %bc = bitcast <32 x i8> %res to <4 x i64> ; back to the 4 x i64 return type
2612   ret <4 x i64> %bc
2613 }
2614
2615 define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; 16-bit interleave written as a generic shufflevector; the assertions expect it to be matched as vpunpcklwd
2616 ; CHECK-LABEL: test_mm256_unpacklo_epi16:
2617 ; CHECK:       # %bb.0:
2618 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2619 ; CHECK-NEXT:    ret{{[l|q]}}
2620   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; first source viewed as 16 x i16
2621   %arg1 = bitcast <4 x i64> %a1 to <16 x i16> ; second source viewed as 16 x i16
2622   %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> ; index i picks %arg0[i], index 16+i picks %arg1[i]: alternates elements 0-3 and then 8-11 of the two sources
2623   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the 4 x i64 return type
2624   ret <4 x i64> %bc
2625 }
2626
2627 define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; 32-bit interleave written as a generic shufflevector; note the asserted instruction is vunpcklps
2628 ; CHECK-LABEL: test_mm256_unpacklo_epi32:
2629 ; CHECK:       # %bb.0:
2630 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2631 ; CHECK-NEXT:    ret{{[l|q]}}
2632   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; first source viewed as 8 x i32
2633   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; second source viewed as 8 x i32
2634   %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ; index i picks %arg0[i], index 8+i picks %arg1[i]: alternates elements 0-1 and then 4-5 of the two sources
2635   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the 4 x i64 return type
2636   ret <4 x i64> %bc
2637 }
2638
2639 define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; 64-bit interleave written as a generic shufflevector; note the asserted instruction is vunpcklpd
2640 ; CHECK-LABEL: test_mm256_unpacklo_epi64:
2641 ; CHECK:       # %bb.0:
2642 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2643 ; CHECK-NEXT:    ret{{[l|q]}}
2644   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ; index i picks %a0[i], index 4+i picks %a1[i]: elements 0 and 2 of each source, alternated
2645   ret <4 x i64> %res
2646 }
2647
2648 define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; plain IR xor on 4 x i64 (no intrinsic); note the asserted instruction is vxorps
2649 ; CHECK-LABEL: test_mm256_xor_si256:
2650 ; CHECK:       # %bb.0:
2651 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
2652 ; CHECK-NEXT:    ret{{[l|q]}}
2653   %res = xor <4 x i64> %a0, %a1 ; element type already matches, so no bitcasts
2654   ret <4 x i64> %res
2655 }
2656
2657 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
2658
2659 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone